posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static void match_ctx_free_subtops (re_match_context_t *mctx)
  26      internal_function;
  27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  28                                           int str_idx, int from, int to)
  29      internal_function;
  30 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  31      internal_function;
  32 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  33                                            int str_idx) internal_function;
  34 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  35                                                    int node, int str_idx)
  36      internal_function;
  37 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  38                            re_dfastate_t **limited_sts, int last_node,
  39                            int last_str_idx)
  40      internal_function;
  41 static reg_errcode_t re_search_internal (const regex_t *preg,
  42                                          const char *string, int length,
  43                                          int start, int range, int stop,
  44                                          size_t nmatch, regmatch_t pmatch[],
  45                                          int eflags) internal_function;
  46 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  47                              const char *string1, int length1,
  48                              const char *string2, int length2,
  49                              int start, int range, struct re_registers *regs,
  50                              int stop, int ret_len) internal_function;
  51 static int re_search_stub (struct re_pattern_buffer *bufp,
  52                            const char *string, int length, int start,
  53                            int range, int stop, struct re_registers *regs,
  54                            int ret_len) internal_function;
  55 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  56                               int nregs, int regs_allocated) internal_function;
  57 static inline re_dfastate_t *acquire_init_state_context
  58      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  59      __attribute ((always_inline)) internal_function;
  60 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  61      internal_function;
  62 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  63                            int *p_match_first)
  64      internal_function;
  65 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  66                                     unsigned int context) internal_function;
  67 static int check_halt_state_context (const re_match_context_t *mctx,
  68                                      const re_dfastate_t *state, int idx)
  69      internal_function;
  70 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  71                          regmatch_t *prev_idx_match, int cur_node,
  72                          int cur_idx, int nmatch) internal_function;
  73 static int proceed_next_node (const re_match_context_t *mctx,
  74                               int nregs, regmatch_t *regs,
  75                               int *pidx, int node, re_node_set *eps_via_nodes,
  76                               struct re_fail_stack_t *fs) internal_function;
  77 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  78                                       int str_idx, int *dests, int nregs,
  79                                       regmatch_t *regs,
  80                                       re_node_set *eps_via_nodes) internal_function;
  81 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  82                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  83 static reg_errcode_t set_regs (const regex_t *preg,
  84                                const re_match_context_t *mctx,
  85                                size_t nmatch, regmatch_t *pmatch,
  86                                int fl_backtrack) internal_function;
  87 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  88
  89 #ifdef RE_ENABLE_I18N
  90 static int sift_states_iter_mb (const re_match_context_t *mctx,
  91                                 re_sift_context_t *sctx,
  92                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  93 #endif /* RE_ENABLE_I18N */
  94 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  95                                            re_sift_context_t *sctx) internal_function;
  96 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  97                                           re_sift_context_t *sctx, int str_idx,
  98                                           re_node_set *cur_dest) internal_function;
  99 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
 100                                               re_sift_context_t *sctx,
 101                                               int str_idx,
 102                                               re_node_set *dest_nodes) internal_function;
 103 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 104                                             re_node_set *dest_nodes,
 105                                             const re_node_set *candidates) internal_function;
 106 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 107                                             re_node_set *dest_nodes,
 108                                             const re_node_set *and_nodes) internal_function;
 109 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 110                              int dst_node, int dst_idx, int src_node,
 111                              int src_idx) internal_function;
 112 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 113                                         int boundaries, int subexp_idx,
 114                                         int from_node, int bkref_idx) internal_function;
 115 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 116                                       int limit, int subexp_idx,
 117                                       int node, int str_idx,
 118                                       int bkref_idx) internal_function;
 119 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 120                                           re_node_set *dest_nodes,
 121                                           const re_node_set *candidates,
 122                                           re_node_set *limits,
 123                                           struct re_backref_cache_entry *bkref_ents,
 124                                           int str_idx) internal_function;
 125 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 126                                         re_sift_context_t *sctx,
 127                                         int str_idx, const re_node_set *candidates) internal_function;
 128 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 129                                                 int next_state_log_idx) internal_function;
 130 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 131                                         re_dfastate_t **src, int num) internal_function;
 132 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 133                                          re_match_context_t *mctx) internal_function;
 134 static re_dfastate_t *transit_state (reg_errcode_t *err,
 135                                      re_match_context_t *mctx,
 136                                      re_dfastate_t *state) internal_function;
 137 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 138                                             re_match_context_t *mctx,
 139                                             re_dfastate_t *next_state) internal_function;
 140 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 141                                                 re_node_set *cur_nodes,
 142                                                 int str_idx) internal_function;
 143 #if 0
 144 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 145                                         re_match_context_t *mctx,
 146                                         re_dfastate_t *pstate) internal_function;
 147 #endif
 148 #ifdef RE_ENABLE_I18N
 149 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 150                                        re_dfastate_t *pstate) internal_function;
 151 #endif /* RE_ENABLE_I18N */
 152 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 153                                           const re_node_set *nodes) internal_function;
 154 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 155                                  int bkref_node, int bkref_str_idx) internal_function;
 156 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 157                                      const re_sub_match_top_t *sub_top,
 158                                      re_sub_match_last_t *sub_last,
 159                                      int bkref_node, int bkref_str) internal_function;
 160 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 161                              int subexp_idx, int type) internal_function;
 162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 163                                     state_array_t *path, int top_node,
 164                                     int top_str, int last_node, int last_str,
 165                                     int type) internal_function;
 166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 167                                                    int str_idx,
 168                                                    re_node_set *cur_nodes,
 169                                                    re_node_set *next_nodes) internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type) internal_function;
 173 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 174                                                    re_node_set *dst_nodes,
 175                                                    int target, int ex_subexp,
 176                                                    int type) internal_function;
 177 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 178                                          re_node_set *cur_nodes, int cur_str,
 179                                          int subexp_num, int type) internal_function;
 180 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
 181                                       re_dfastate_t *state) internal_function;
 182 #ifdef RE_ENABLE_I18N
 183 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 184                                     const re_string_t *input, int idx) internal_function;
 185 # ifdef _LIBC
 186 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 187                                                    size_t name_len) internal_function;
 188 # endif /* _LIBC */
 189 #endif /* RE_ENABLE_I18N */
 190 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 191                                        const re_dfastate_t *state,
 192                                        re_node_set *states_node,
 193                                        bitset *states_ch) internal_function;
 194 static int check_node_accept (const re_match_context_t *mctx,
 195                               const re_token_t *node, int idx) internal_function;
 196 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 197 \f
 198 /* Entry point for POSIX code.  */
 199
 200 /* regexec searches for a given pattern, specified by PREG, in the
 201    string STRING.
 202
 203    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 204    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 205    least NMATCH elements, and we set them to the offsets of the
 206    corresponding matched substrings.
 207
 208    EFLAGS specifies `execution flags' which affect matching: if
 209    REG_NOTBOL is set, then ^ does not match at the beginning of the
 210    string; if REG_NOTEOL is set, then $ does not match at the end.
 211
 212    We return 0 if we find a match and REG_NOMATCH if not.  */
 213
 214 int
 215 regexec (preg, string, nmatch, pmatch, eflags)
 216     const regex_t *__restrict preg;
 217     const char *__restrict string;
 218     size_t nmatch;
 219     regmatch_t pmatch[];
 220     int eflags;
 221 {
 222   reg_errcode_t err;
 223   int start, length;
 224
 225   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 226     return REG_BADPAT;
 227
 228   if (eflags & REG_STARTEND)
 229     {
 230       start = pmatch[0].rm_so;
 231       length = pmatch[0].rm_eo;
 232     }
 233   else
 234     {
 235       start = 0;
 236       length = strlen (string);
 237     }
 238   if (preg->no_sub)
 239     err = re_search_internal (preg, string, length, start, length - start,
 240                               length, 0, NULL, eflags);
 241   else
 242     err = re_search_internal (preg, string, length, start, length - start,
 243                               length, nmatch, pmatch, eflags);
 244   return err != REG_NOERROR;
 245 }
 246
 247 #ifdef _LIBC
 248 # include <shlib-compat.h>
 249 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 250
 251 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 252 __typeof__ (__regexec) __compat_regexec;
 253
 254 int
 255 attribute_compat_text_section
 256 __compat_regexec (const regex_t *__restrict preg,
 257                   const char *__restrict string, size_t nmatch,
 258                   regmatch_t pmatch[], int eflags)
 259 {
 260   return regexec (preg, string, nmatch, pmatch,
 261                   eflags & (REG_NOTBOL | REG_NOTEOL));
 262 }
 263 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 264 # endif
 265 #endif
 266
 267 /* Entry points for GNU code.  */
 268
 269 /* re_match, re_search, re_match_2, re_search_2
 270
 271    The former two functions operate on STRING with length LENGTH,
  272    while the latter two operate on the concatenation of STRING1 and STRING2
 273    with lengths LENGTH1 and LENGTH2, respectively.
 274
 275    re_match() matches the compiled pattern in BUFP against the string,
 276    starting at index START.
 277
 278    re_search() first tries matching at index START, then it tries to match
 279    starting from index START + 1, and so on.  The last start position tried
 280    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 281    way as re_match().)
 282
 283    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 284    the first STOP characters of the concatenation of the strings should be
 285    concerned.
 286
 287    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
  288    and all groups are stored in REGS.  (For the "_2" variants, the offsets are
 289    computed relative to the concatenation, not relative to the individual
 290    strings.)
 291
 292    On success, re_match* functions return the length of the match, re_search*
 293    return the position of the start of the match.  Return value -1 means no
 294    match was found and -2 indicates an internal error.  */
 295
 296 int
 297 re_match (bufp, string, length, start, regs)
 298     struct re_pattern_buffer *bufp;
 299     const char *string;
 300     int length, start;
 301     struct re_registers *regs;
 302 {
 303   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 304 }
 305 #ifdef _LIBC
 306 weak_alias (__re_match, re_match)
 307 #endif
 308
 309 int
 310 re_search (bufp, string, length, start, range, regs)
 311     struct re_pattern_buffer *bufp;
 312     const char *string;
 313     int length, start, range;
 314     struct re_registers *regs;
 315 {
 316   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 317 }
 318 #ifdef _LIBC
 319 weak_alias (__re_search, re_search)
 320 #endif
 321
 322 int
 323 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 324     struct re_pattern_buffer *bufp;
 325     const char *string1, *string2;
 326     int length1, length2, start, stop;
 327     struct re_registers *regs;
 328 {
 329   return re_search_2_stub (bufp, string1, length1, string2, length2,
 330                            start, 0, regs, stop, 1);
 331 }
 332 #ifdef _LIBC
 333 weak_alias (__re_match_2, re_match_2)
 334 #endif
 335
 336 int
 337 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 338     struct re_pattern_buffer *bufp;
 339     const char *string1, *string2;
 340     int length1, length2, start, range, stop;
 341     struct re_registers *regs;
 342 {
 343   return re_search_2_stub (bufp, string1, length1, string2, length2,
 344                            start, range, regs, stop, 0);
 345 }
 346 #ifdef _LIBC
 347 weak_alias (__re_search_2, re_search_2)
 348 #endif
 349
 350 static int
 351 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 352                   stop, ret_len)
 353     struct re_pattern_buffer *bufp;
 354     const char *string1, *string2;
 355     int length1, length2, start, range, stop, ret_len;
 356     struct re_registers *regs;
 357 {
 358   const char *str;
 359   int rval;
 360   int len = length1 + length2;
 361   int free_str = 0;
 362
 363   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 364     return -2;
 365
 366   /* Concatenate the strings.  */
 367   if (length2 > 0)
 368     if (length1 > 0)
 369       {
 370         char *s = re_malloc (char, len);
 371
 372         if (BE (s == NULL, 0))
 373           return -2;
 374         memcpy (s, string1, length1);
 375         memcpy (s + length1, string2, length2);
 376         str = s;
 377         free_str = 1;
 378       }
 379     else
 380       str = string2;
 381   else
 382     str = string1;
 383
 384   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 385                          ret_len);
 386   if (free_str)
 387     re_free ((char *) str);
 388   return rval;
 389 }
 390
 391 /* The parameters have the same meaning as those of re_search.
 392    Additional parameters:
 393    If RET_LEN is nonzero the length of the match is returned (re_match style);
 394    otherwise the position of the match is returned.  */
 395
 396 static int
 397 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 398     struct re_pattern_buffer *bufp;
 399     const char *string;
 400     int length, start, range, stop, ret_len;
 401     struct re_registers *regs;
 402 {
 403   reg_errcode_t result;
 404   regmatch_t *pmatch;
 405   int nregs, rval;
 406   int eflags = 0;
 407
 408   /* Check for out-of-range.  */
 409   if (BE (start < 0 || start > length, 0))
 410     return -1;
 411   if (BE (start + range > length, 0))
 412     range = length - start;
 413   else if (BE (start + range < 0, 0))
 414     range = -start;
 415
 416   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 417   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 418
 419   /* Compile fastmap if we haven't yet.  */
 420   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 421     re_compile_fastmap (bufp);
 422
 423   if (BE (bufp->no_sub, 0))
 424     regs = NULL;
 425
 426   /* We need at least 1 register.  */
 427   if (regs == NULL)
 428     nregs = 1;
 429   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 430                regs->num_regs < bufp->re_nsub + 1, 0))
 431     {
 432       nregs = regs->num_regs;
 433       if (BE (nregs < 1, 0))
 434         {
 435           /* Nothing can be copied to regs.  */
 436           regs = NULL;
 437           nregs = 1;
 438         }
 439     }
 440   else
 441     nregs = bufp->re_nsub + 1;
 442   pmatch = re_malloc (regmatch_t, nregs);
 443   if (BE (pmatch == NULL, 0))
 444     return -2;
 445
 446   result = re_search_internal (bufp, string, length, start, range, stop,
 447                                nregs, pmatch, eflags);
 448
 449   rval = 0;
 450
 451   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 452   if (result != REG_NOERROR)
 453     rval = -1;
 454   else if (regs != NULL)
 455     {
 456       /* If caller wants register contents data back, copy them.  */
 457       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 458                                            bufp->regs_allocated);
 459       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 460         rval = -2;
 461     }
 462
 463   if (BE (rval == 0, 1))
 464     {
 465       if (ret_len)
 466         {
 467           assert (pmatch[0].rm_so == start);
 468           rval = pmatch[0].rm_eo - start;
 469         }
 470       else
 471         rval = pmatch[0].rm_so;
 472     }
 473   re_free (pmatch);
 474   return rval;
 475 }
 476
 477 static unsigned
 478 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 479     struct re_registers *regs;
 480     regmatch_t *pmatch;
 481     int nregs, regs_allocated;
 482 {
 483   int rval = REGS_REALLOCATE;
 484   int i;
 485   int need_regs = nregs + 1;
 486   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 487      uses.  */
 488
 489   /* Have the register data arrays been allocated?  */
 490   if (regs_allocated == REGS_UNALLOCATED)
 491     { /* No.  So allocate them with malloc.  */
 492       regs->start = re_malloc (regoff_t, need_regs);
 493       regs->end = re_malloc (regoff_t, need_regs);
 494       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 495         return REGS_UNALLOCATED;
 496       regs->num_regs = need_regs;
 497     }
 498   else if (regs_allocated == REGS_REALLOCATE)
 499     { /* Yes.  If we need more elements than were already
 500          allocated, reallocate them.  If we need fewer, just
 501          leave it alone.  */
 502       if (BE (need_regs > regs->num_regs, 0))
 503         {
 504           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 505           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 506           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 507             return REGS_UNALLOCATED;
 508           regs->start = new_start;
 509           regs->end = new_end;
 510           regs->num_regs = need_regs;
 511         }
 512     }
 513   else
 514     {
 515       assert (regs_allocated == REGS_FIXED);
 516       /* This function may not be called with REGS_FIXED and nregs too big.  */
 517       assert (regs->num_regs >= nregs);
 518       rval = REGS_FIXED;
 519     }
 520
 521   /* Copy the regs.  */
 522   for (i = 0; i < nregs; ++i)
 523     {
 524       regs->start[i] = pmatch[i].rm_so;
 525       regs->end[i] = pmatch[i].rm_eo;
 526     }
 527   for ( ; i < regs->num_regs; ++i)
 528     regs->start[i] = regs->end[i] = -1;
 529
 530   return rval;
 531 }
 532
 533 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 534    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 535    this memory for recording register information.  STARTS and ENDS
 536    must be allocated using the malloc library routine, and must each
 537    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 538
 539    If NUM_REGS == 0, then subsequent matches should allocate their own
 540    register data.
 541
 542    Unless this function is called, the first search or match using
 543    PATTERN_BUFFER will allocate its own register data, without
 544    freeing the old data.  */
 545
 546 void
 547 re_set_registers (bufp, regs, num_regs, starts, ends)
 548     struct re_pattern_buffer *bufp;
 549     struct re_registers *regs;
 550     unsigned num_regs;
 551     regoff_t *starts, *ends;
 552 {
 553   if (num_regs)
 554     {
 555       bufp->regs_allocated = REGS_REALLOCATE;
 556       regs->num_regs = num_regs;
 557       regs->start = starts;
 558       regs->end = ends;
 559     }
 560   else
 561     {
 562       bufp->regs_allocated = REGS_UNALLOCATED;
 563       regs->num_regs = 0;
 564       regs->start = regs->end = (regoff_t *) 0;
 565     }
 566 }
 567 #ifdef _LIBC
 568 weak_alias (__re_set_registers, re_set_registers)
 569 #endif
 570 \f
 571 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 572    them unless specifically requested.  */
 573
 574 #if defined _REGEX_RE_COMP || defined _LIBC
 575 int
 576 # ifdef _LIBC
 577 weak_function
 578 # endif
 579 re_exec (s)
 580      const char *s;
 581 {
 582   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 583 }
 584 #endif /* _REGEX_RE_COMP */
 585 \f
 586 /* Internal entry point.  */
 587
 588 /* Searches for a compiled pattern PREG in the string STRING, whose
 589    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
  590    meanings as in regexec.  START and RANGE have the same meanings
  591    as in re_search.
 592    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 593    otherwise return the error code.
 594    Note: We assume front end functions already check ranges.
 595    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 596
 597 static reg_errcode_t
 598 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 599                     eflags)
 600     const regex_t *preg;
 601     const char *string;
 602     int length, start, range, stop, eflags;
 603     size_t nmatch;
 604     regmatch_t pmatch[];
 605 {
 606   reg_errcode_t err;
 607   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 608   int left_lim, right_lim, incr;
 609   int fl_longest_match, match_first, match_kind, match_last = -1;
 610   int sb, ch;
 611 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 612   re_match_context_t mctx = { .dfa = dfa };
 613 #else
 614   re_match_context_t mctx;
 615 #endif
 616   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 617                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 618   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 619
 620 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 621   memset (&mctx, '\0', sizeof (re_match_context_t));
 622   mctx.dfa = dfa;
 623 #endif
 624
  625   /* Check if the DFA hasn't been compiled.  */
 626   if (BE (preg->used == 0 || dfa->init_state == NULL
 627           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 628           || dfa->init_state_begbuf == NULL, 0))
 629     return REG_NOMATCH;
 630
 631 #ifdef DEBUG
 632   /* We assume front-end functions already check them.  */
 633   assert (start + range >= 0 && start + range <= length);
 634 #endif
 635
 636   /* If initial states with non-begbuf contexts have no elements,
 637      the regex must be anchored.  If preg->newline_anchor is set,
 638      we'll never use init_state_nl, so do not check it.  */
 639   if (dfa->init_state->nodes.nelem == 0
 640       && dfa->init_state_word->nodes.nelem == 0
 641       && (dfa->init_state_nl->nodes.nelem == 0
 642           || !preg->newline_anchor))
 643     {
 644       if (start != 0 && start + range != 0)
 645         return REG_NOMATCH;
 646       start = range = 0;
 647     }
 648
 649   /* We must check the longest matching, if nmatch > 0.  */
 650   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 651
 652   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 653                             preg->translate, preg->syntax & RE_ICASE, dfa);
 654   if (BE (err != REG_NOERROR, 0))
 655     goto free_return;
 656   mctx.input.stop = stop;
 657   mctx.input.raw_stop = stop;
 658   mctx.input.newline_anchor = preg->newline_anchor;
 659
 660   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 661   if (BE (err != REG_NOERROR, 0))
 662     goto free_return;
 663
 664   /* We will log all the DFA states through which the dfa pass,
 665      if nmatch > 1, or this dfa has "multibyte node", which is a
 666      back-reference or a node which can accept multibyte character or
 667      multi character collating element.  */
 668   if (nmatch > 1 || dfa->has_mb_node)
 669     {
 670       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 671       if (BE (mctx.state_log == NULL, 0))
 672         {
 673           err = REG_ESPACE;
 674           goto free_return;
 675         }
 676     }
 677   else
 678     mctx.state_log = NULL;
 679
 680   match_first = start;
 681   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 682                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 683
  684   /* Check incrementally whether or not the input string matches.  */
 685   incr = (range < 0) ? -1 : 1;
 686   left_lim = (range < 0) ? start + range : start;
 687   right_lim = (range < 0) ? start : start + range;
 688   sb = dfa->mb_cur_max == 1;
 689   match_kind =
 690     (fastmap
 691      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 692         | (range >= 0 ? 2 : 0)
 693         | (t != NULL ? 1 : 0))
 694      : 8);
 695
 696   for (;; match_first += incr)
 697     {
 698       err = REG_NOMATCH;
 699       if (match_first < left_lim || right_lim < match_first)
 700         goto free_return;
 701
 702       /* Advance as rapidly as possible through the string, until we
 703          find a plausible place to start matching.  This may be done
 704          with varying efficiency, so there are various possibilities:
 705          only the most common of them are specialized, in order to
 706          save on code size.  We use a switch statement for speed.  */
 707       switch (match_kind)
 708         {
 709         case 8:
 710           /* No fastmap.  */
 711           break;
 712
 713         case 7:
 714           /* Fastmap with single-byte translation, match forward.  */
 715           while (BE (match_first < right_lim, 1)
 716                  && !fastmap[t[(unsigned char) string[match_first]]])
 717             ++match_first;
 718           goto forward_match_found_start_or_reached_end;
 719
 720         case 6:
 721           /* Fastmap without translation, match forward.  */
 722           while (BE (match_first < right_lim, 1)
 723                  && !fastmap[(unsigned char) string[match_first]])
 724             ++match_first;
 725
 726         forward_match_found_start_or_reached_end:
 727           if (BE (match_first == right_lim, 0))
 728             {
 729               ch = match_first >= length
 730                        ? 0 : (unsigned char) string[match_first];
 731               if (!fastmap[t ? t[ch] : ch])
 732                 goto free_return;
 733             }
 734           break;
 735
 736         case 4:
 737         case 5:
 738           /* Fastmap without multi-byte translation, match backwards.  */
 739           while (match_first >= left_lim)
 740             {
 741               ch = match_first >= length
 742                        ? 0 : (unsigned char) string[match_first];
 743               if (fastmap[t ? t[ch] : ch])
 744                 break;
 745               --match_first;
 746             }
 747           if (match_first < left_lim)
 748             goto free_return;
 749           break;
 750
 751         default:
 752           /* In this case, we can't determine easily the current byte,
 753              since it might be a component byte of a multibyte
 754              character.  Then we use the constructed buffer instead.  */
 755           for (;;)
 756             {
 757               /* If MATCH_FIRST is out of the valid range, reconstruct the
 758                  buffers.  */
 759               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 760               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 761                 {
 762                   err = re_string_reconstruct (&mctx.input, match_first,
 763                                                eflags);
 764                   if (BE (err != REG_NOERROR, 0))
 765                     goto free_return;
 766
 767                   offset = match_first - mctx.input.raw_mbs_idx;
 768                 }
 769               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 770                  Note that MATCH_FIRST must not be smaller than 0.  */
 771               ch = (match_first >= length
 772                     ? 0 : re_string_byte_at (&mctx.input, offset));
 773               if (fastmap[ch])
 774                 break;
 775               match_first += incr;
 776               if (match_first < left_lim || match_first > right_lim)
 777                 {
 778                   err = REG_NOMATCH;
 779                   goto free_return;
 780                 }
 781             }
 782           break;
 783         }
 784
 785       /* Reconstruct the buffers so that the matcher can assume that
 786          the matching starts from the beginning of the buffer.  */
 787       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 788       if (BE (err != REG_NOERROR, 0))
 789         goto free_return;
 790
 791 #ifdef RE_ENABLE_I18N
 792      /* Don't consider this char as a possible match start if it part,
 793         yet isn't the head, of a multibyte character.  */
 794       if (!sb && !re_string_first_byte (&mctx.input, 0))
 795         continue;
 796 #endif
 797
 798       /* It seems to be appropriate one, then use the matcher.  */
 799       /* We assume that the matching starts from 0.  */
 800       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 801       match_last = check_matching (&mctx, fl_longest_match,
 802                                    range >= 0 ? &match_first : NULL);
 803       if (match_last != -1)
 804         {
 805           if (BE (match_last == -2, 0))
 806             {
 807               err = REG_ESPACE;
 808               goto free_return;
 809             }
 810           else
 811             {
 812               mctx.match_last = match_last;
 813               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 814                 {
 815                   re_dfastate_t *pstate = mctx.state_log[match_last];
 816                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 817                                                              match_last);
 818                 }
 819               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 820                   || dfa->nbackref)
 821                 {
 822                   err = prune_impossible_nodes (&mctx);
 823                   if (err == REG_NOERROR)
 824                     break;
 825                   if (BE (err != REG_NOMATCH, 0))
 826                     goto free_return;
 827                   match_last = -1;
 828                 }
 829               else
 830                 break; /* We found a match.  */
 831             }
 832         }
 833
 834       match_ctx_clean (&mctx);
 835     }
 836
 837 #ifdef DEBUG
 838   assert (match_last != -1);
 839   assert (err == REG_NOERROR);
 840 #endif
 841
 842   /* Set pmatch[] if we need.  */
 843   if (nmatch > 0)
 844     {
 845       int reg_idx;
 846
 847       /* Initialize registers.  */
 848       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 849         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 850
 851       /* Set the points where matching start/end.  */
 852       pmatch[0].rm_so = 0;
 853       pmatch[0].rm_eo = mctx.match_last;
 854
 855       if (!preg->no_sub && nmatch > 1)
 856         {
 857           err = set_regs (preg, &mctx, nmatch, pmatch,
 858                           dfa->has_plural_match && dfa->nbackref > 0);
 859           if (BE (err != REG_NOERROR, 0))
 860             goto free_return;
 861         }
 862
 863       /* At last, add the offset to the each registers, since we slided
 864          the buffers so that we could assume that the matching starts
 865          from 0.  */
 866       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 867         if (pmatch[reg_idx].rm_so != -1)
 868           {
 869 #ifdef RE_ENABLE_I18N
 870             if (BE (mctx.input.offsets_needed != 0, 0))
 871               {
 872                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 873                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 874                 else
 875                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 876                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 877                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 878                 else
 879                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 880               }
 881 #else
 882             assert (mctx.input.offsets_needed == 0);
 883 #endif
 884             pmatch[reg_idx].rm_so += match_first;
 885             pmatch[reg_idx].rm_eo += match_first;
 886           }
 887     }
 888
 889  free_return:
 890   re_free (mctx.state_log);
 891   if (dfa->nbackref)
 892     match_ctx_free (&mctx);
 893   re_string_destruct (&mctx.input);
 894   return err;
 895 }
 896
 897 static reg_errcode_t
 898 prune_impossible_nodes (mctx)
 899      re_match_context_t *mctx;
 900 {
 901   re_dfa_t *const dfa = mctx->dfa;
 902   int halt_node, match_last;
 903   reg_errcode_t ret;
 904   re_dfastate_t **sifted_states;
 905   re_dfastate_t **lim_states = NULL;
 906   re_sift_context_t sctx;
 907 #ifdef DEBUG
 908   assert (mctx->state_log != NULL);
 909 #endif
 910   match_last = mctx->match_last;
 911   halt_node = mctx->last_node;
 912   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 913   if (BE (sifted_states == NULL, 0))
 914     {
 915       ret = REG_ESPACE;
 916       goto free_return;
 917     }
 918   if (dfa->nbackref)
 919     {
 920       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 921       if (BE (lim_states == NULL, 0))
 922         {
 923           ret = REG_ESPACE;
 924           goto free_return;
 925         }
 926       while (1)
 927         {
 928           memset (lim_states, '\0',
 929                   sizeof (re_dfastate_t *) * (match_last + 1));
 930           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 931                          match_last);
 932           ret = sift_states_backward (mctx, &sctx);
 933           re_node_set_free (&sctx.limits);
 934           if (BE (ret != REG_NOERROR, 0))
 935               goto free_return;
 936           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 937             break;
 938           do
 939             {
 940               --match_last;
 941               if (match_last < 0)
 942                 {
 943                   ret = REG_NOMATCH;
 944                   goto free_return;
 945                 }
 946             } while (mctx->state_log[match_last] == NULL
 947                      || !mctx->state_log[match_last]->halt);
 948           halt_node = check_halt_state_context (mctx,
 949                                                 mctx->state_log[match_last],
 950                                                 match_last);
 951         }
 952       ret = merge_state_array (dfa, sifted_states, lim_states,
 953                                match_last + 1);
 954       re_free (lim_states);
 955       lim_states = NULL;
 956       if (BE (ret != REG_NOERROR, 0))
 957         goto free_return;
 958     }
 959   else
 960     {
 961       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 962       ret = sift_states_backward (mctx, &sctx);
 963       re_node_set_free (&sctx.limits);
 964       if (BE (ret != REG_NOERROR, 0))
 965         goto free_return;
 966     }
 967   re_free (mctx->state_log);
 968   mctx->state_log = sifted_states;
 969   sifted_states = NULL;
 970   mctx->last_node = halt_node;
 971   mctx->match_last = match_last;
 972   ret = REG_NOERROR;
 973  free_return:
 974   re_free (sifted_states);
 975   re_free (lim_states);
 976   return ret;
 977 }
 978
 979 /* Acquire an initial state and return it.
 980    We must select appropriate initial state depending on the context,
 981    since initial states may have constraints like "\<", "^", etc..  */
 982
 983 static inline re_dfastate_t *
 984 acquire_init_state_context (err, mctx, idx)
 985      reg_errcode_t *err;
 986      const re_match_context_t *mctx;
 987      int idx;
 988 {
 989   re_dfa_t *const dfa = mctx->dfa;
 990   if (dfa->init_state->has_constraint)
 991     {
 992       unsigned int context;
 993       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 994       if (IS_WORD_CONTEXT (context))
 995         return dfa->init_state_word;
 996       else if (IS_ORDINARY_CONTEXT (context))
 997         return dfa->init_state;
 998       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
 999         return dfa->init_state_begbuf;
1000       else if (IS_NEWLINE_CONTEXT (context))
1001         return dfa->init_state_nl;
1002       else if (IS_BEGBUF_CONTEXT (context))
1003         {
1004           /* It is relatively rare case, then calculate on demand.  */
1005           return re_acquire_state_context (err, dfa,
1006                                            dfa->init_state->entrance_nodes,
1007                                            context);
1008         }
1009       else
1010         /* Must not happen?  */
1011         return dfa->init_state;
1012     }
1013   else
1014     return dfa->init_state;
1015 }
1016
1017 /* Check whether the regular expression match input string INPUT or not,
1018    and return the index where the matching end, return -1 if not match,
1019    or return -2 in case of an error.
1020    FL_LONGEST_MATCH means we want the POSIX longest matching.
1021    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1022    next place where we may want to try matching.
1023    Note that the matcher assume that the maching starts from the current
1024    index of the buffer.  */
1025
1026 static int
1027 check_matching (mctx, fl_longest_match, p_match_first)
1028     re_match_context_t *mctx;
1029     int fl_longest_match;
1030     int *p_match_first;
1031 {
1032   re_dfa_t *const dfa = mctx->dfa;
1033   reg_errcode_t err;
1034   int match = 0;
1035   int match_last = -1;
1036   int cur_str_idx = re_string_cur_idx (&mctx->input);
1037   re_dfastate_t *cur_state;
1038   int at_init_state = p_match_first != NULL;
1039   int next_start_idx = cur_str_idx;
1040
1041   err = REG_NOERROR;
1042   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1043   /* An initial state must not be NULL (invalid).  */
1044   if (BE (cur_state == NULL, 0))
1045     {
1046       assert (err == REG_ESPACE);
1047       return -2;
1048     }
1049
1050   if (mctx->state_log != NULL)
1051     {
1052       mctx->state_log[cur_str_idx] = cur_state;
1053
1054       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1055          later.  E.g. Processing back references.  */
1056       if (BE (dfa->nbackref, 0))
1057         {
1058           at_init_state = 0;
1059           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1060           if (BE (err != REG_NOERROR, 0))
1061             return err;
1062
1063           if (cur_state->has_backref)
1064             {
1065               err = transit_state_bkref (mctx, &cur_state->nodes);
1066               if (BE (err != REG_NOERROR, 0))
1067                 return err;
1068             }
1069         }
1070     }
1071
1072   /* If the RE accepts NULL string.  */
1073   if (BE (cur_state->halt, 0))
1074     {
1075       if (!cur_state->has_constraint
1076           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1077         {
1078           if (!fl_longest_match)
1079             return cur_str_idx;
1080           else
1081             {
1082               match_last = cur_str_idx;
1083               match = 1;
1084             }
1085         }
1086     }
1087
1088   while (!re_string_eoi (&mctx->input))
1089     {
1090       re_dfastate_t *old_state = cur_state;
1091       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1092
1093       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1094           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1095               && mctx->input.valid_len < mctx->input.len))
1096         {
1097           err = extend_buffers (mctx);
1098           if (BE (err != REG_NOERROR, 0))
1099             {
1100               assert (err == REG_ESPACE);
1101               return -2;
1102             }
1103         }
1104
1105       cur_state = transit_state (&err, mctx, cur_state);
1106       if (mctx->state_log != NULL)
1107         cur_state = merge_state_with_log (&err, mctx, cur_state);
1108
1109       if (cur_state == NULL)
1110         {
1111           /* Reached the invalid state or an error.  Try to recover a valid
1112              state using the state log, if available and if we have not
1113              already found a valid (even if not the longest) match.  */
1114           if (BE (err != REG_NOERROR, 0))
1115             return -2;
1116
1117           if (mctx->state_log == NULL
1118               || (match && !fl_longest_match)
1119               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1120             break;
1121         }
1122
1123       if (BE (at_init_state, 0))
1124         {
1125           if (old_state == cur_state)
1126             next_start_idx = next_char_idx;
1127           else
1128             at_init_state = 0;
1129         }
1130
1131       if (cur_state->halt)
1132         {
1133           /* Reached a halt state.
1134              Check the halt state can satisfy the current context.  */
1135           if (!cur_state->has_constraint
1136               || check_halt_state_context (mctx, cur_state,
1137                                            re_string_cur_idx (&mctx->input)))
1138             {
1139               /* We found an appropriate halt state.  */
1140               match_last = re_string_cur_idx (&mctx->input);
1141               match = 1;
1142
1143               /* We found a match, do not modify match_first below.  */
1144               p_match_first = NULL;
1145               if (!fl_longest_match)
1146                 break;
1147             }
1148         }
1149     }
1150
1151   if (p_match_first)
1152     *p_match_first += next_start_idx;
1153
1154   return match_last;
1155 }
1156
1157 /* Check NODE match the current context.  */
1158
1159 static int check_halt_node_context (dfa, node, context)
1160     const re_dfa_t *dfa;
1161     int node;
1162     unsigned int context;
1163 {
1164   re_token_type_t type = dfa->nodes[node].type;
1165   unsigned int constraint = dfa->nodes[node].constraint;
1166   if (type != END_OF_RE)
1167     return 0;
1168   if (!constraint)
1169     return 1;
1170   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1171     return 0;
1172   return 1;
1173 }
1174
1175 /* Check the halt state STATE match the current context.
1176    Return 0 if not match, if the node, STATE has, is a halt node and
1177    match the context, return the node.  */
1178
1179 static int
1180 check_halt_state_context (mctx, state, idx)
1181     const re_match_context_t *mctx;
1182     const re_dfastate_t *state;
1183     int idx;
1184 {
1185   int i;
1186   unsigned int context;
1187 #ifdef DEBUG
1188   assert (state->halt);
1189 #endif
1190   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1191   for (i = 0; i < state->nodes.nelem; ++i)
1192     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1193       return state->nodes.elems[i];
1194   return 0;
1195 }
1196
1197 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1198    corresponding to the DFA).
1199    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1200    of errors.  */
1201
1202 static int
1203 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1204     const re_match_context_t *mctx;
1205     regmatch_t *regs;
1206     int nregs, *pidx, node;
1207     re_node_set *eps_via_nodes;
1208     struct re_fail_stack_t *fs;
1209 {
1210   re_dfa_t *const dfa = mctx->dfa;
1211   int i, err, dest_node;
1212   dest_node = -1;
1213   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1214     {
1215       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1216       int ndest, dest_nodes[2];
1217       err = re_node_set_insert (eps_via_nodes, node);
1218       if (BE (err < 0, 0))
1219         return -2;
1220       /* Pick up valid destinations.  */
1221       for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1222         {
1223           int candidate = dfa->edests[node].elems[i];
1224           if (!re_node_set_contains (cur_nodes, candidate))
1225             continue;
1226           dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1227           dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1228           ++ndest;
1229         }
1230       if (ndest <= 1)
1231         return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1232       /* In order to avoid infinite loop like "(a*)*".  */
1233       if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1234         return dest_nodes[1];
1235       if (fs != NULL
1236           && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1237                               eps_via_nodes))
1238         return -2;
1239       return dest_nodes[0];
1240     }
1241   else
1242     {
1243       int naccepted = 0;
1244       re_token_type_t type = dfa->nodes[node].type;
1245
1246 #ifdef RE_ENABLE_I18N
1247       if (ACCEPT_MB_NODE (type))
1248         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1249       else
1250 #endif /* RE_ENABLE_I18N */
1251       if (type == OP_BACK_REF)
1252         {
1253           int subexp_idx = dfa->nodes[node].opr.idx;
1254           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1255           if (fs != NULL)
1256             {
1257               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1258                 return -1;
1259               else if (naccepted)
1260                 {
1261                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1262                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1263                               naccepted) != 0)
1264                     return -1;
1265                 }
1266             }
1267
1268           if (naccepted == 0)
1269             {
1270               err = re_node_set_insert (eps_via_nodes, node);
1271               if (BE (err < 0, 0))
1272                 return -2;
1273               dest_node = dfa->edests[node].elems[0];
1274               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1275                                         dest_node))
1276                 return dest_node;
1277             }
1278         }
1279
1280       if (naccepted != 0
1281           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1282         {
1283           dest_node = dfa->nexts[node];
1284           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1285           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1286                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1287                                                dest_node)))
1288             return -1;
1289           re_node_set_empty (eps_via_nodes);
1290           return dest_node;
1291         }
1292     }
1293   return -1;
1294 }
1295
1296 static reg_errcode_t
1297 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1298      struct re_fail_stack_t *fs;
1299      int str_idx, *dests, nregs;
1300      regmatch_t *regs;
1301      re_node_set *eps_via_nodes;
1302 {
1303   reg_errcode_t err;
1304   int num = fs->num++;
1305   if (fs->num == fs->alloc)
1306     {
1307       struct re_fail_stack_ent_t *new_array;
1308       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1309                                        * fs->alloc * 2));
1310       if (new_array == NULL)
1311         return REG_ESPACE;
1312       fs->alloc *= 2;
1313       fs->stack = new_array;
1314     }
1315   fs->stack[num].idx = str_idx;
1316   fs->stack[num].node = dests[1];
1317   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1318   if (fs->stack[num].regs == NULL)
1319     return REG_ESPACE;
1320   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1321   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1322   return err;
1323 }
1324
1325 static int
1326 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1327      struct re_fail_stack_t *fs;
1328      int *pidx, nregs;
1329      regmatch_t *regs;
1330      re_node_set *eps_via_nodes;
1331 {
1332   int num = --fs->num;
1333   assert (num >= 0);
1334   *pidx = fs->stack[num].idx;
1335   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1336   re_node_set_free (eps_via_nodes);
1337   re_free (fs->stack[num].regs);
1338   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1339   return fs->stack[num].node;
1340 }
1341
1342 /* Set the positions where the subexpressions are starts/ends to registers
1343    PMATCH.
1344    Note: We assume that pmatch[0] is already set, and
1345    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1346
1347 static reg_errcode_t
1348 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1349      const regex_t *preg;
1350      const re_match_context_t *mctx;
1351      size_t nmatch;
1352      regmatch_t *pmatch;
1353      int fl_backtrack;
1354 {
1355   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1356   int idx, cur_node, real_nmatch;
1357   re_node_set eps_via_nodes;
1358   struct re_fail_stack_t *fs;
1359   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1360   regmatch_t *prev_idx_match;
1361
1362 #ifdef DEBUG
1363   assert (nmatch > 1);
1364   assert (mctx->state_log != NULL);
1365 #endif
1366   if (fl_backtrack)
1367     {
1368       fs = &fs_body;
1369       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1370       if (fs->stack == NULL)
1371         return REG_ESPACE;
1372     }
1373   else
1374     fs = NULL;
1375
1376   cur_node = dfa->init_node;
1377   real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1378   re_node_set_init_empty (&eps_via_nodes);
1379
1380   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1381   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1382
1383   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1384     {
1385       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1386
1387       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1388         {
1389           int reg_idx;
1390           if (fs)
1391             {
1392               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1393                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1394                   break;
1395               if (reg_idx == nmatch)
1396                 {
1397                   re_node_set_free (&eps_via_nodes);
1398                   return free_fail_stack_return (fs);
1399                 }
1400               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1401                                          &eps_via_nodes);
1402             }
1403           else
1404             {
1405               re_node_set_free (&eps_via_nodes);
1406               return REG_NOERROR;
1407             }
1408         }
1409
1410       /* Proceed to next node.  */
1411       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1412                                     &eps_via_nodes, fs);
1413
1414       if (BE (cur_node < 0, 0))
1415         {
1416           if (BE (cur_node == -2, 0))
1417             {
1418               re_node_set_free (&eps_via_nodes);
1419               free_fail_stack_return (fs);
1420               return REG_ESPACE;
1421             }
1422           if (fs)
1423             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1424                                        &eps_via_nodes);
1425           else
1426             {
1427               re_node_set_free (&eps_via_nodes);
1428               return REG_NOMATCH;
1429             }
1430         }
1431     }
1432   re_node_set_free (&eps_via_nodes);
1433   return free_fail_stack_return (fs);
1434 }
1435
1436 static reg_errcode_t
1437 free_fail_stack_return (fs)
1438      struct re_fail_stack_t *fs;
1439 {
1440   if (fs)
1441     {
1442       int fs_idx;
1443       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1444         {
1445           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1446           re_free (fs->stack[fs_idx].regs);
1447         }
1448       re_free (fs->stack);
1449     }
1450   return REG_NOERROR;
1451 }
1452
1453 static void
1454 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1455      re_dfa_t *dfa;
1456      regmatch_t *pmatch, *prev_idx_match;
1457      int cur_node, cur_idx, nmatch;
1458 {
1459   int type = dfa->nodes[cur_node].type;
1460   if (type == OP_OPEN_SUBEXP)
1461     {
1462       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1463
1464       /* We are at the first node of this sub expression.  */
1465       if (reg_num < nmatch)
1466         {
1467           pmatch[reg_num].rm_so = cur_idx;
1468           pmatch[reg_num].rm_eo = -1;
1469         }
1470     }
1471   else if (type == OP_CLOSE_SUBEXP)
1472     {
1473       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1474       if (reg_num < nmatch)
1475         {
1476           /* We are at the last node of this sub expression.  */
1477           if (pmatch[reg_num].rm_so < cur_idx)
1478             {
1479               pmatch[reg_num].rm_eo = cur_idx;
1480               /* This is a non-empty match or we are not inside an optional
1481                  subexpression.  Accept this right away.  */
1482               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1483             }
1484           else
1485             {
1486               if (dfa->nodes[cur_node].opt_subexp
1487                   && prev_idx_match[reg_num].rm_so != -1)
1488                 /* We transited through an empty match for an optional
1489                    subexpression, like (a?)*, and this is not the subexp's
1490                    first match.  Copy back the old content of the registers
1491                    so that matches of an inner subexpression are undone as
1492                    well, like in ((a?))*.  */
1493                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1494               else
1495                 /* We completed a subexpression, but it may be part of
1496                    an optional one, so do not update PREV_IDX_MATCH.  */
1497                 pmatch[reg_num].rm_eo = cur_idx;
1498             }
1499         }
1500     }
1501 }
1502
1503 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1504    and sift the nodes in each states according to the following rules.
1505    Updated state_log will be written to STATE_LOG.
1506
1507    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1508      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1509         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1510         the LAST_NODE, we throw away the node `a'.
1511      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1512         string `s' and transit to `b':
1513         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1514            away the node `a'.
1515         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1516             thrown away, we throw away the node `a'.
1517      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1518         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1519            node `a'.
1520         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1521             we throw away the node `a'.  */
1522
/* Nonzero iff ST is a non-NULL state pointer whose node set contains ND.
   ST is evaluated twice.  */
#define STATE_NODE_CONTAINS(st,nd) \
  (NULL != (st) && re_node_set_contains (&(st)->nodes, (nd)))
1525
1526 static reg_errcode_t
1527 sift_states_backward (mctx, sctx)
1528      re_match_context_t *mctx;
1529      re_sift_context_t *sctx;
1530 {
1531   reg_errcode_t err;
1532   int null_cnt = 0;
1533   int str_idx = sctx->last_str_idx;
1534   re_node_set cur_dest;
1535
1536 #ifdef DEBUG
1537   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1538 #endif
1539
1540   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1541      transit to the last_node and the last_node itself.  */
1542   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1543   if (BE (err != REG_NOERROR, 0))
1544     return err;
1545   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1546   if (BE (err != REG_NOERROR, 0))
1547     goto free_return;
1548
1549   /* Then check each states in the state_log.  */
1550   while (str_idx > 0)
1551     {
1552       /* Update counters.  */
1553       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1554       if (null_cnt > mctx->max_mb_elem_len)
1555         {
1556           memset (sctx->sifted_states, '\0',
1557                   sizeof (re_dfastate_t *) * str_idx);
1558           re_node_set_free (&cur_dest);
1559           return REG_NOERROR;
1560         }
1561       re_node_set_empty (&cur_dest);
1562       --str_idx;
1563
1564       if (mctx->state_log[str_idx])
1565         {
1566           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1567           if (BE (err != REG_NOERROR, 0))
1568             goto free_return;
1569         }
1570
1571       /* Add all the nodes which satisfy the following conditions:
1572          - It can epsilon transit to a node in CUR_DEST.
1573          - It is in CUR_SRC.
1574          And update state_log.  */
1575       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1576       if (BE (err != REG_NOERROR, 0))
1577         goto free_return;
1578     }
1579   err = REG_NOERROR;
1580  free_return:
1581   re_node_set_free (&cur_dest);
1582   return err;
1583 }
1584
1585 static reg_errcode_t
1586 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1587      re_match_context_t *mctx;
1588      re_sift_context_t *sctx;
1589      int str_idx;
1590      re_node_set *cur_dest;
1591 {
1592   re_dfa_t *const dfa = mctx->dfa;
1593   re_node_set *cur_src = &mctx->state_log[str_idx]->nodes;
1594   int i;
1595
1596   /* Then build the next sifted state.
1597      We build the next sifted state on `cur_dest', and update
1598      `sifted_states[str_idx]' with `cur_dest'.
1599      Note:
1600      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1601      `cur_src' points the node_set of the old `state_log[str_idx]'.  */
1602   for (i = 0; i < cur_src->nelem; i++)
1603     {
1604       int prev_node = cur_src->elems[i];
1605       int naccepted = 0;
1606       re_token_type_t type = dfa->nodes[prev_node].type;
1607       int ret;
1608
1609       if (IS_EPSILON_NODE (type))
1610         continue;
1611 #ifdef RE_ENABLE_I18N
1612       /* If the node may accept `multi byte'.  */
1613       if (ACCEPT_MB_NODE (type))
1614         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1615                                          str_idx, sctx->last_str_idx);
1616 #endif /* RE_ENABLE_I18N */
1617
1618       /* We don't check backreferences here.
1619          See update_cur_sifted_state().  */
1620       if (!naccepted
1621           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1622           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1623                                   dfa->nexts[prev_node]))
1624         naccepted = 1;
1625
1626       if (naccepted == 0)
1627         continue;
1628
1629       if (sctx->limits.nelem)
1630         {
1631           int to_idx = str_idx + naccepted;
1632           if (check_dst_limits (mctx, &sctx->limits,
1633                                 dfa->nexts[prev_node], to_idx,
1634                                 prev_node, str_idx))
1635             continue;
1636         }
1637       ret = re_node_set_insert (cur_dest, prev_node);
1638       if (BE (ret == -1, 0))
1639         return REG_ESPACE;
1640     }
1641
1642   return REG_NOERROR;
1643 }
1644
1645 /* Helper functions.  */
1646
1647 static reg_errcode_t
1648 clean_state_log_if_needed (mctx, next_state_log_idx)
1649     re_match_context_t *mctx;
1650     int next_state_log_idx;
1651 {
1652   int top = mctx->state_log_top;
1653
1654   if (next_state_log_idx >= mctx->input.bufs_len
1655       || (next_state_log_idx >= mctx->input.valid_len
1656           && mctx->input.valid_len < mctx->input.len))
1657     {
1658       reg_errcode_t err;
1659       err = extend_buffers (mctx);
1660       if (BE (err != REG_NOERROR, 0))
1661         return err;
1662     }
1663
1664   if (top < next_state_log_idx)
1665     {
1666       memset (mctx->state_log + top + 1, '\0',
1667               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1668       mctx->state_log_top = next_state_log_idx;
1669     }
1670   return REG_NOERROR;
1671 }
1672
1673 static reg_errcode_t
1674 merge_state_array (dfa, dst, src, num)
1675      re_dfa_t *dfa;
1676      re_dfastate_t **dst;
1677      re_dfastate_t **src;
1678      int num;
1679 {
1680   int st_idx;
1681   reg_errcode_t err;
1682   for (st_idx = 0; st_idx < num; ++st_idx)
1683     {
1684       if (dst[st_idx] == NULL)
1685         dst[st_idx] = src[st_idx];
1686       else if (src[st_idx] != NULL)
1687         {
1688           re_node_set merged_set;
1689           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1690                                         &src[st_idx]->nodes);
1691           if (BE (err != REG_NOERROR, 0))
1692             return err;
1693           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1694           re_node_set_free (&merged_set);
1695           if (BE (err != REG_NOERROR, 0))
1696             return err;
1697         }
1698     }
1699   return REG_NOERROR;
1700 }
1701
1702 static reg_errcode_t
1703 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1704      re_match_context_t *mctx;
1705      re_sift_context_t *sctx;
1706      int str_idx;
1707      re_node_set *dest_nodes;
1708 {
1709   re_dfa_t *const dfa = mctx->dfa;
1710   reg_errcode_t err;
1711   const re_node_set *candidates;
1712   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1713                 : &mctx->state_log[str_idx]->nodes);
1714
1715   if (dest_nodes->nelem == 0)
1716     sctx->sifted_states[str_idx] = NULL;
1717   else
1718     {
1719       if (candidates)
1720         {
1721           /* At first, add the nodes which can epsilon transit to a node in
1722              DEST_NODE.  */
1723           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1724           if (BE (err != REG_NOERROR, 0))
1725             return err;
1726
1727           /* Then, check the limitations in the current sift_context.  */
1728           if (sctx->limits.nelem)
1729             {
1730               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1731                                          mctx->bkref_ents, str_idx);
1732               if (BE (err != REG_NOERROR, 0))
1733                 return err;
1734             }
1735         }
1736
1737       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1738       if (BE (err != REG_NOERROR, 0))
1739         return err;
1740     }
1741
1742   if (candidates && mctx->state_log[str_idx]->has_backref)
1743     {
1744       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1745       if (BE (err != REG_NOERROR, 0))
1746         return err;
1747     }
1748   return REG_NOERROR;
1749 }
1750
1751 static reg_errcode_t
1752 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1753      re_dfa_t *dfa;
1754      re_node_set *dest_nodes;
1755      const re_node_set *candidates;
1756 {
1757   reg_errcode_t err;
1758   int src_idx;
1759   re_node_set src_copy;
1760
1761   err = re_node_set_init_copy (&src_copy, dest_nodes);
1762   if (BE (err != REG_NOERROR, 0))
1763     return err;
1764   for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1765     {
1766       err = re_node_set_add_intersect (dest_nodes, candidates,
1767                                        dfa->inveclosures
1768                                        + src_copy.elems[src_idx]);
1769       if (BE (err != REG_NOERROR, 0))
1770         {
1771           re_node_set_free (&src_copy);
1772           return err;
1773         }
1774     }
1775   re_node_set_free (&src_copy);
1776   return REG_NOERROR;
1777 }
1778
1779 static reg_errcode_t
1780 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1781      re_dfa_t *dfa;
1782      int node;
1783      re_node_set *dest_nodes;
1784      const re_node_set *candidates;
1785 {
1786     int ecl_idx;
1787     reg_errcode_t err;
1788     re_node_set *inv_eclosure = dfa->inveclosures + node;
1789     re_node_set except_nodes;
1790     re_node_set_init_empty (&except_nodes);
1791     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1792       {
1793         int cur_node = inv_eclosure->elems[ecl_idx];
1794         if (cur_node == node)
1795           continue;
1796         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1797           {
1798             int edst1 = dfa->edests[cur_node].elems[0];
1799             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1800                          ? dfa->edests[cur_node].elems[1] : -1);
1801             if ((!re_node_set_contains (inv_eclosure, edst1)
1802                  && re_node_set_contains (dest_nodes, edst1))
1803                 || (edst2 > 0
1804                     && !re_node_set_contains (inv_eclosure, edst2)
1805                     && re_node_set_contains (dest_nodes, edst2)))
1806               {
1807                 err = re_node_set_add_intersect (&except_nodes, candidates,
1808                                                  dfa->inveclosures + cur_node);
1809                 if (BE (err != REG_NOERROR, 0))
1810                   {
1811                     re_node_set_free (&except_nodes);
1812                     return err;
1813                   }
1814               }
1815           }
1816       }
1817     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1818       {
1819         int cur_node = inv_eclosure->elems[ecl_idx];
1820         if (!re_node_set_contains (&except_nodes, cur_node))
1821           {
1822             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1823             re_node_set_remove_at (dest_nodes, idx);
1824           }
1825       }
1826     re_node_set_free (&except_nodes);
1827     return REG_NOERROR;
1828 }
1829
1830 static int
1831 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1832      re_match_context_t *mctx;
1833      re_node_set *limits;
1834      int dst_node, dst_idx, src_node, src_idx;
1835 {
1836   re_dfa_t *const dfa = mctx->dfa;
1837   int lim_idx, src_pos, dst_pos;
1838
1839   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1840   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1841   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1842     {
1843       int subexp_idx;
1844       struct re_backref_cache_entry *ent;
1845       ent = mctx->bkref_ents + limits->elems[lim_idx];
1846       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1847
1848       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1849                                            subexp_idx, dst_node, dst_idx,
1850                                            dst_bkref_idx);
1851       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1852                                            subexp_idx, src_node, src_idx,
1853                                            src_bkref_idx);
1854
1855       /* In case of:
1856          <src> <dst> ( <subexp> )
1857          ( <subexp> ) <src> <dst>
1858          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1859       if (src_pos == dst_pos)
1860         continue; /* This is unrelated limitation.  */
1861       else
1862         return 1;
1863     }
1864   return 0;
1865 }
1866
1867 static int
1868 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1869      re_match_context_t *mctx;
1870      int boundaries, subexp_idx, from_node, bkref_idx;
1871 {
1872   re_dfa_t *const dfa = mctx->dfa;
1873   re_node_set *eclosures = dfa->eclosures + from_node;
1874   int node_idx;
1875
1876   /* Else, we are on the boundary: examine the nodes on the epsilon
1877      closure.  */
1878   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1879     {
1880       int node = eclosures->elems[node_idx];
1881       switch (dfa->nodes[node].type)
1882         {
1883         case OP_BACK_REF:
1884           {
1885             struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1886             do
1887               {
1888                 int dst, cpos;
1889
1890                 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1891                   continue;
1892
1893                 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1894                    OP_CLOSE_SUBEXP cases below.  But, if the
1895                    destination node is the same node as the source
1896                    node, don't recurse because it would cause an
1897                    infinite loop: a regex that exhibits this behavior
1898                    is ()\1*\1*  */
1899                 dst = dfa->edests[node].elems[0];
1900                 if (dst == from_node)
1901                   {
1902                     if (boundaries & 1)
1903                       return -1;
1904                     else /* if (boundaries & 2) */
1905                       return 0;
1906                   }
1907
1908                 cpos = check_dst_limits_calc_pos_1 (mctx, boundaries,
1909                                                     subexp_idx, dst, bkref_idx);
1910
1911                 if (cpos == -1 && (boundaries & 1))
1912                   return -1;
1913
1914                 if (cpos == 0 /* && (boundaries & 2) */)
1915                   return 0;
1916               }
1917             while (ent++->more);
1918             break;
1919           }
1920
1921         case OP_OPEN_SUBEXP:
1922           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1923             return -1;
1924           break;
1925
1926         case OP_CLOSE_SUBEXP:
1927           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1928             return 0;
1929           break;
1930
1931         default:
1932             break;
1933         }
1934     }
1935
1936   return (boundaries & 2) ? 1 : 0;
1937 }
1938
1939 static int
1940 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1941      re_match_context_t *mctx;
1942      int limit, subexp_idx, from_node, str_idx, bkref_idx;
1943 {
1944   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1945   int boundaries;
1946
1947   /* If we are outside the range of the subexpression, return -1 or 1.  */
1948   if (str_idx < lim->subexp_from)
1949     return -1;
1950
1951   if (lim->subexp_to < str_idx)
1952     return 1;
1953
1954   /* If we are within the subexpression, return 0.  */
1955   boundaries = (str_idx == lim->subexp_from);
1956   boundaries |= (str_idx == lim->subexp_to) << 1;
1957   if (boundaries == 0)
1958     return 0;
1959
1960   /* Else, examine epsilon closure.  */
1961   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1962                                       from_node, bkref_idx);
1963 }
1964
1965 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1966    which are against limitations from DEST_NODES. */
1967
1968 static reg_errcode_t
1969 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1970      re_dfa_t *dfa;
1971      re_node_set *dest_nodes;
1972      const re_node_set *candidates;
1973      re_node_set *limits;
1974      struct re_backref_cache_entry *bkref_ents;
1975      int str_idx;
1976 {
1977   reg_errcode_t err;
1978   int node_idx, lim_idx;
1979
1980   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1981     {
1982       int subexp_idx;
1983       struct re_backref_cache_entry *ent;
1984       ent = bkref_ents + limits->elems[lim_idx];
1985
1986       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1987         continue; /* This is unrelated limitation.  */
1988
1989       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1990       if (ent->subexp_to == str_idx)
1991         {
1992           int ops_node = -1;
1993           int cls_node = -1;
1994           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1995             {
1996               int node = dest_nodes->elems[node_idx];
1997               re_token_type_t type = dfa->nodes[node].type;
1998               if (type == OP_OPEN_SUBEXP
1999                   && subexp_idx == dfa->nodes[node].opr.idx)
2000                 ops_node = node;
2001               else if (type == OP_CLOSE_SUBEXP
2002                        && subexp_idx == dfa->nodes[node].opr.idx)
2003                 cls_node = node;
2004             }
2005
2006           /* Check the limitation of the open subexpression.  */
2007           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2008           if (ops_node >= 0)
2009             {
2010               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2011                                            candidates);
2012               if (BE (err != REG_NOERROR, 0))
2013                 return err;
2014             }
2015
2016           /* Check the limitation of the close subexpression.  */
2017           if (cls_node >= 0)
2018             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2019               {
2020                 int node = dest_nodes->elems[node_idx];
2021                 if (!re_node_set_contains (dfa->inveclosures + node,
2022                                            cls_node)
2023                     && !re_node_set_contains (dfa->eclosures + node,
2024                                               cls_node))
2025                   {
2026                     /* It is against this limitation.
2027                        Remove it form the current sifted state.  */
2028                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2029                                                  candidates);
2030                     if (BE (err != REG_NOERROR, 0))
2031                       return err;
2032                     --node_idx;
2033                   }
2034               }
2035         }
2036       else /* (ent->subexp_to != str_idx)  */
2037         {
2038           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2039             {
2040               int node = dest_nodes->elems[node_idx];
2041               re_token_type_t type = dfa->nodes[node].type;
2042               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2043                 {
2044                   if (subexp_idx != dfa->nodes[node].opr.idx)
2045                     continue;
2046                   if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
2047                       || (type == OP_OPEN_SUBEXP))
2048                     {
2049                       /* It is against this limitation.
2050                          Remove it form the current sifted state.  */
2051                       err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2052                                                    candidates);
2053                       if (BE (err != REG_NOERROR, 0))
2054                         return err;
2055                     }
2056                 }
2057             }
2058         }
2059     }
2060   return REG_NOERROR;
2061 }
2062
2063 static reg_errcode_t
2064 sift_states_bkref (mctx, sctx, str_idx, candidates)
2065      re_match_context_t *mctx;
2066      re_sift_context_t *sctx;
2067      int str_idx;
2068      const re_node_set *candidates;
2069 {
2070   re_dfa_t *const dfa = mctx->dfa;
2071   reg_errcode_t err;
2072   int node_idx, node;
2073   re_sift_context_t local_sctx;
2074   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2075
2076   if (first_idx == -1)
2077     return REG_NOERROR;
2078
2079   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2080
2081   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2082     {
2083       int enabled_idx;
2084       re_token_type_t type;
2085       struct re_backref_cache_entry *entry;
2086       node = candidates->elems[node_idx];
2087       type = dfa->nodes[node].type;
2088       /* Avoid infinite loop for the REs like "()\1+".  */
2089       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2090         continue;
2091       if (type != OP_BACK_REF)
2092         continue;
2093
2094       entry = mctx->bkref_ents + first_idx;
2095       enabled_idx = first_idx;
2096       do
2097         {
2098           int subexp_len, to_idx, dst_node;
2099           re_dfastate_t *cur_state;
2100
2101           if (entry->node != node)
2102             continue;
2103           subexp_len = entry->subexp_to - entry->subexp_from;
2104           to_idx = str_idx + subexp_len;
2105           dst_node = (subexp_len ? dfa->nexts[node]
2106                       : dfa->edests[node].elems[0]);
2107
2108           if (to_idx > sctx->last_str_idx
2109               || sctx->sifted_states[to_idx] == NULL
2110               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2111               || check_dst_limits (mctx, &sctx->limits, node,
2112                                    str_idx, dst_node, to_idx))
2113             continue;
2114
2115           if (local_sctx.sifted_states == NULL)
2116             {
2117               local_sctx = *sctx;
2118               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2119               if (BE (err != REG_NOERROR, 0))
2120                 goto free_return;
2121             }
2122           local_sctx.last_node = node;
2123           local_sctx.last_str_idx = str_idx;
2124           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2125           if (BE (err < 0, 0))
2126             {
2127               err = REG_ESPACE;
2128               goto free_return;
2129             }
2130           cur_state = local_sctx.sifted_states[str_idx];
2131           err = sift_states_backward (mctx, &local_sctx);
2132           if (BE (err != REG_NOERROR, 0))
2133             goto free_return;
2134           if (sctx->limited_states != NULL)
2135             {
2136               err = merge_state_array (dfa, sctx->limited_states,
2137                                        local_sctx.sifted_states,
2138                                        str_idx + 1);
2139               if (BE (err != REG_NOERROR, 0))
2140                 goto free_return;
2141             }
2142           local_sctx.sifted_states[str_idx] = cur_state;
2143           re_node_set_remove (&local_sctx.limits, enabled_idx);
2144
2145           /* mctx->bkref_ents may have changed, reload the pointer.  */
2146           entry = mctx->bkref_ents + enabled_idx;
2147         }
2148       while (enabled_idx++, entry++->more);
2149     }
2150   err = REG_NOERROR;
2151  free_return:
2152   if (local_sctx.sifted_states != NULL)
2153     {
2154       re_node_set_free (&local_sctx.limits);
2155     }
2156
2157   return err;
2158 }
2159
2160
#ifdef RE_ENABLE_I18N
/* Return the value of check_node_accept_bytes for node NODE_IDX at
   string index STR_IDX.  That value is replaced by 0 when it is positive,
   the transition ends no later than MAX_STR_IDX, and the sifted state at
   its end does not contain the node's successor, i.e. the destination
   was already thrown away.  */
static int
sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
    const re_match_context_t *mctx;
    re_sift_context_t *sctx;
    int node_idx, str_idx, max_str_idx;
{
  re_dfa_t *const dfa = mctx->dfa;
  const int nbytes
    = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);

  /* Nothing accepted, or the end lies beyond MAX_STR_IDX: hand the
     value back unchanged.  */
  if (nbytes <= 0 || str_idx + nbytes > max_str_idx)
    return nbytes;

  /* The destination survived sifting, so the node really accepts
     NBYTES bytes of input.  */
  if (STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + nbytes],
                           dfa->nexts[node_idx]))
    return nbytes;

  return 0;
}
#endif /* RE_ENABLE_I18N */
2184
2185 \f
2186 /* Functions for state transition.  */
2187
2188 /* Return the next state to which the current state STATE will transit by
2189    accepting the current input byte, and update STATE_LOG if necessary.
2190    If STATE can accept a multibyte char/collating element/back reference
2191    update the destination of STATE_LOG.  */
2192
2193 static re_dfastate_t *
2194 transit_state (err, mctx, state)
2195      reg_errcode_t *err;
2196      re_match_context_t *mctx;
2197      re_dfastate_t *state;
2198 {
2199   re_dfa_t *const dfa = mctx->dfa;
2200   re_dfastate_t **trtable;
2201   unsigned char ch;
2202
2203 #ifdef RE_ENABLE_I18N
2204   /* If the current state can accept multibyte.  */
2205   if (BE (state->accept_mb, 0))
2206     {
2207       *err = transit_state_mb (mctx, state);
2208       if (BE (*err != REG_NOERROR, 0))
2209         return NULL;
2210     }
2211 #endif /* RE_ENABLE_I18N */
2212
2213   /* Then decide the next state with the single byte.  */
2214   if (1)
2215     {
2216       /* Use transition table  */
2217       ch = re_string_fetch_byte (&mctx->input);
2218       trtable = state->trtable;
2219       if (trtable == NULL)
2220         {
2221           trtable = build_trtable (dfa, state);
2222           if (trtable == NULL)
2223             {
2224               *err = REG_ESPACE;
2225               return NULL;
2226             }
2227         }
2228       if (BE (state->word_trtable, 0))
2229         {
2230           unsigned int context;
2231           context
2232             = re_string_context_at (&mctx->input,
2233                                     re_string_cur_idx (&mctx->input) - 1,
2234                                     mctx->eflags);
2235           if (IS_WORD_CONTEXT (context))
2236             return trtable[ch + SBC_MAX];
2237           else
2238             return trtable[ch];
2239         }
2240       else
2241         return trtable[ch];
2242     }
2243 #if 0
2244   else
2245     /* don't use transition table  */
2246     return transit_state_sb (err, mctx, state);
2247 #endif
2248 }
2249
2250 /* Update the state_log if we need */
2251 re_dfastate_t *
2252 merge_state_with_log (err, mctx, next_state)
2253      reg_errcode_t *err;
2254      re_match_context_t *mctx;
2255      re_dfastate_t *next_state;
2256 {
2257   re_dfa_t *const dfa = mctx->dfa;
2258   int cur_idx = re_string_cur_idx (&mctx->input);
2259
2260   if (cur_idx > mctx->state_log_top)
2261     {
2262       mctx->state_log[cur_idx] = next_state;
2263       mctx->state_log_top = cur_idx;
2264     }
2265   else if (mctx->state_log[cur_idx] == 0)
2266     {
2267       mctx->state_log[cur_idx] = next_state;
2268     }
2269   else
2270     {
2271       re_dfastate_t *pstate;
2272       unsigned int context;
2273       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2274       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2275          the destination of a multibyte char/collating element/
2276          back reference.  Then the next state is the union set of
2277          these destinations and the results of the transition table.  */
2278       pstate = mctx->state_log[cur_idx];
2279       log_nodes = pstate->entrance_nodes;
2280       if (next_state != NULL)
2281         {
2282           table_nodes = next_state->entrance_nodes;
2283           *err = re_node_set_init_union (&next_nodes, table_nodes,
2284                                              log_nodes);
2285           if (BE (*err != REG_NOERROR, 0))
2286             return NULL;
2287         }
2288       else
2289         next_nodes = *log_nodes;
2290       /* Note: We already add the nodes of the initial state,
2291          then we don't need to add them here.  */
2292
2293       context = re_string_context_at (&mctx->input,
2294                                       re_string_cur_idx (&mctx->input) - 1,
2295                                       mctx->eflags);
2296       next_state = mctx->state_log[cur_idx]
2297         = re_acquire_state_context (err, dfa, &next_nodes, context);
2298       /* We don't need to check errors here, since the return value of
2299          this function is next_state and ERR is already set.  */
2300
2301       if (table_nodes != NULL)
2302         re_node_set_free (&next_nodes);
2303     }
2304
2305   if (BE (dfa->nbackref, 0) && next_state != NULL)
2306     {
2307       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2308          later.  We must check them here, since the back references in the
2309          next state might use them.  */
2310       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2311                                         cur_idx);
2312       if (BE (*err != REG_NOERROR, 0))
2313         return NULL;
2314
2315       /* If the next state has back references.  */
2316       if (next_state->has_backref)
2317         {
2318           *err = transit_state_bkref (mctx, &next_state->nodes);
2319           if (BE (*err != REG_NOERROR, 0))
2320             return NULL;
2321           next_state = mctx->state_log[cur_idx];
2322         }
2323     }
2324
2325   return next_state;
2326 }
2327
2328 /* Skip bytes in the input that correspond to part of a
2329    multi-byte match, then look in the log for a state
2330    from which to restart matching.  */
2331 re_dfastate_t *
2332 find_recover_state (err, mctx)
2333      reg_errcode_t *err;
2334      re_match_context_t *mctx;
2335 {
2336   re_dfastate_t *cur_state = NULL;
2337   do
2338     {
2339       int max = mctx->state_log_top;
2340       int cur_str_idx = re_string_cur_idx (&mctx->input);
2341
2342       do
2343         {
2344           if (++cur_str_idx > max)
2345             return NULL;
2346           re_string_skip_bytes (&mctx->input, 1);
2347         }
2348       while (mctx->state_log[cur_str_idx] == NULL);
2349
2350       cur_state = merge_state_with_log (err, mctx, NULL);
2351     }
2352   while (err == REG_NOERROR && cur_state == NULL);
2353   return cur_state;
2354 }
2355
2356 /* Helper functions for transit_state.  */
2357
2358 /* From the node set CUR_NODES, pick up the nodes whose types are
2359    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2360    expression. And register them to use them later for evaluating the
2361    correspoding back references.  */
2362
2363 static reg_errcode_t
2364 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2365      re_match_context_t *mctx;
2366      re_node_set *cur_nodes;
2367      int str_idx;
2368 {
2369   re_dfa_t *const dfa = mctx->dfa;
2370   int node_idx;
2371   reg_errcode_t err;
2372
2373   /* TODO: This isn't efficient.
2374            Because there might be more than one nodes whose types are
2375            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2376            nodes.
2377            E.g. RE: (a){2}  */
2378   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2379     {
2380       int node = cur_nodes->elems[node_idx];
2381       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2382           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2383           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2384         {
2385           err = match_ctx_add_subtop (mctx, node, str_idx);
2386           if (BE (err != REG_NOERROR, 0))
2387             return err;
2388         }
2389     }
2390   return REG_NOERROR;
2391 }
2392
#if 0
/* Return the next state to which the current state STATE will transit by
   accepting the current input byte, computed from the node sets rather
   than from a transition table.  On failure NULL is returned with *ERR
   set.  This function is compiled out.  */

static re_dfastate_t *
transit_state_sb (err, mctx, state)
     reg_errcode_t *err;
     re_match_context_t *mctx;
     re_dfastate_t *state;
{
  re_dfa_t *const dfa = mctx->dfa;
  const int str_idx = re_string_cur_idx (&mctx->input);
  re_node_set reached;
  re_dfastate_t *result;
  unsigned int context;
  int i;

  *err = re_node_set_alloc (&reached, state->nodes.nelem + 1);
  if (BE (*err != REG_NOERROR, 0))
    return NULL;

  /* Collect the epsilon closure of the successor of every node which
     accepts the current input.  */
  for (i = 0; i < state->nodes.nelem; ++i)
    {
      const int node = state->nodes.elems[i];

      if (!check_node_accept (mctx, dfa->nodes + node, str_idx))
        continue;

      *err = re_node_set_merge (&reached,
                                dfa->eclosures + dfa->nexts[node]);
      if (BE (*err != REG_NOERROR, 0))
        {
          re_node_set_free (&reached);
          return NULL;
        }
    }

  context = re_string_context_at (&mctx->input, str_idx, mctx->eflags);
  /* We don't need to check errors here, since the return value of
     this function is the acquired state and ERR is already set.  */
  result = re_acquire_state_context (err, dfa, &reached, context);

  re_node_set_free (&reached);
  re_string_skip_bytes (&mctx->input, 1);
  return result;
}
#endif
2436
#ifdef RE_ENABLE_I18N
/* Handle the nodes of PSTATE which accept more than zero bytes as a
   multi-byte element at the current position of MCTX->input.  For each
   such node, the epsilon closure of its successor is merged into the
   state log entry located just past the accepted bytes.
   Return REG_NOERROR on success, otherwise the error code reported by the
   helper which failed.  */

static reg_errcode_t
transit_state_mb (mctx, pstate)
    re_match_context_t *mctx;
    re_dfastate_t *pstate;
{
  re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  int i;

  for (i = 0; i < pstate->nodes.nelem; ++i)
    {
      int node = pstate->nodes.elems[i];
      re_node_set merged, *closure;
      re_dfastate_t *logged;
      unsigned int ctx;
      int nbytes = 0, target;

      /* Skip the node if its constraint is not satisfied here.  */
      if (dfa->nodes[node].constraint)
        {
          ctx = re_string_context_at (&mctx->input,
                                      re_string_cur_idx (&mctx->input),
                                      mctx->eflags);
          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[node].constraint, ctx))
            continue;
        }

      /* Number of bytes the node consumes; zero means "no match".  */
      if (ACCEPT_MB_NODE (dfa->nodes[node].type))
        nbytes = check_node_accept_bytes (dfa, node, &mctx->input,
                                          re_string_cur_idx (&mctx->input));
      if (nbytes == 0)
        continue;

      /* The node accepts `nbytes' bytes; remember the longest element.  */
      target = re_string_cur_idx (&mctx->input) + nbytes;
      if (mctx->max_mb_elem_len < nbytes)
        mctx->max_mb_elem_len = nbytes;

      err = clean_state_log_if_needed (mctx, target);
      if (BE (err != REG_NOERROR, 0))
        return err;
#ifdef DEBUG
      assert (dfa->nexts[node] != -1);
#endif
      closure = dfa->eclosures + dfa->nexts[node];

      logged = mctx->state_log[target];
      if (logged != NULL)
        {
          err = re_node_set_init_union (&merged, logged->entrance_nodes,
                                        closure);
          if (BE (err != REG_NOERROR, 0))
            return err;
        }
      else
        /* Shallow copy of the closure: it is not ours, so it is not
           freed below.  */
        merged = *closure;

      ctx = re_string_context_at (&mctx->input, target - 1, mctx->eflags);
      mctx->state_log[target]
        = re_acquire_state_context (&err, dfa, &merged, ctx);
      if (logged != NULL)
        re_node_set_free (&merged);
      if (BE (mctx->state_log[target] == NULL && err != REG_NOERROR, 0))
        return err;
    }
  return REG_NOERROR;
}
#endif /* RE_ENABLE_I18N */
2507
2508 static reg_errcode_t
2509 transit_state_bkref (mctx, nodes)
2510     re_match_context_t *mctx;
2511     const re_node_set *nodes;
2512 {
2513   re_dfa_t *const dfa = mctx->dfa;
2514   reg_errcode_t err;
2515   int i;
2516   int cur_str_idx = re_string_cur_idx (&mctx->input);
2517
2518   for (i = 0; i < nodes->nelem; ++i)
2519     {
2520       int dest_str_idx, prev_nelem, bkc_idx;
2521       int node_idx = nodes->elems[i];
2522       unsigned int context;
2523       const re_token_t *node = dfa->nodes + node_idx;
2524       re_node_set *new_dest_nodes;
2525
2526       /* Check whether `node' is a backreference or not.  */
2527       if (node->type != OP_BACK_REF)
2528         continue;
2529
2530       if (node->constraint)
2531         {
2532           context = re_string_context_at (&mctx->input, cur_str_idx,
2533                                           mctx->eflags);
2534           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2535             continue;
2536         }
2537
2538       /* `node' is a backreference.
2539          Check the substring which the substring matched.  */
2540       bkc_idx = mctx->nbkref_ents;
2541       err = get_subexp (mctx, node_idx, cur_str_idx);
2542       if (BE (err != REG_NOERROR, 0))
2543         goto free_return;
2544
2545       /* And add the epsilon closures (which is `new_dest_nodes') of
2546          the backreference to appropriate state_log.  */
2547 #ifdef DEBUG
2548       assert (dfa->nexts[node_idx] != -1);
2549 #endif
2550       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2551         {
2552           int subexp_len;
2553           re_dfastate_t *dest_state;
2554           struct re_backref_cache_entry *bkref_ent;
2555           bkref_ent = mctx->bkref_ents + bkc_idx;
2556           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2557             continue;
2558           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2559           new_dest_nodes = (subexp_len == 0
2560                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2561                             : dfa->eclosures + dfa->nexts[node_idx]);
2562           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2563                           - bkref_ent->subexp_from);
2564           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2565                                           mctx->eflags);
2566           dest_state = mctx->state_log[dest_str_idx];
2567           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2568                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2569           /* Add `new_dest_node' to state_log.  */
2570           if (dest_state == NULL)
2571             {
2572               mctx->state_log[dest_str_idx]
2573                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2574                                             context);
2575               if (BE (mctx->state_log[dest_str_idx] == NULL
2576                       && err != REG_NOERROR, 0))
2577                 goto free_return;
2578             }
2579           else
2580             {
2581               re_node_set dest_nodes;
2582               err = re_node_set_init_union (&dest_nodes,
2583                                             dest_state->entrance_nodes,
2584                                             new_dest_nodes);
2585               if (BE (err != REG_NOERROR, 0))
2586                 {
2587                   re_node_set_free (&dest_nodes);
2588                   goto free_return;
2589                 }
2590               mctx->state_log[dest_str_idx]
2591                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2592               re_node_set_free (&dest_nodes);
2593               if (BE (mctx->state_log[dest_str_idx] == NULL
2594                       && err != REG_NOERROR, 0))
2595                 goto free_return;
2596             }
2597           /* We need to check recursively if the backreference can epsilon
2598              transit.  */
2599           if (subexp_len == 0
2600               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2601             {
2602               err = check_subexp_matching_top (mctx, new_dest_nodes,
2603                                                cur_str_idx);
2604               if (BE (err != REG_NOERROR, 0))
2605                 goto free_return;
2606               err = transit_state_bkref (mctx, new_dest_nodes);
2607               if (BE (err != REG_NOERROR, 0))
2608                 goto free_return;
2609             }
2610         }
2611     }
2612   err = REG_NOERROR;
2613  free_return:
2614   return err;
2615 }
2616
2617 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2618    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2619    Note that we might collect inappropriate candidates here.
2620    However, the cost of checking them strictly here is too high, then we
2621    delay these checking for prune_impossible_nodes().  */
2622
2623 static reg_errcode_t
2624 get_subexp (mctx, bkref_node, bkref_str_idx)
2625      re_match_context_t *mctx;
2626      int bkref_node, bkref_str_idx;
2627 {
2628   re_dfa_t *const dfa = mctx->dfa;
2629   int subexp_num, sub_top_idx;
2630   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2631   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2632   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2633   if (cache_idx != -1)
2634     {
2635       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2636       do
2637         if (entry->node == bkref_node)
2638           return REG_NOERROR; /* We already checked it.  */
2639       while (entry++->more);
2640     }
2641
2642   subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2643
2644   /* For each sub expression  */
2645   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2646     {
2647       reg_errcode_t err;
2648       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2649       re_sub_match_last_t *sub_last;
2650       int sub_last_idx, sl_str, bkref_str_off;
2651
2652       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2653         continue; /* It isn't related.  */
2654
2655       sl_str = sub_top->str_idx;
2656       bkref_str_off = bkref_str_idx;
2657       /* At first, check the last node of sub expressions we already
2658          evaluated.  */
2659       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2660         {
2661           int sl_str_diff;
2662           sub_last = sub_top->lasts[sub_last_idx];
2663           sl_str_diff = sub_last->str_idx - sl_str;
2664           /* The matched string by the sub expression match with the substring
2665              at the back reference?  */
2666           if (sl_str_diff > 0)
2667             {
2668               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2669                 {
2670                   /* Not enough chars for a successful match.  */
2671                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2672                     break;
2673
2674                   err = clean_state_log_if_needed (mctx,
2675                                                    bkref_str_off
2676                                                    + sl_str_diff);
2677                   if (BE (err != REG_NOERROR, 0))
2678                     return err;
2679                   buf = (const char *) re_string_get_buffer (&mctx->input);
2680                 }
2681               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2682                 break; /* We don't need to search this sub expression any more.  */
2683             }
2684           bkref_str_off += sl_str_diff;
2685           sl_str += sl_str_diff;
2686           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2687                                 bkref_str_idx);
2688
2689           /* Reload buf, since the preceding call might have reallocated
2690              the buffer.  */
2691           buf = (const char *) re_string_get_buffer (&mctx->input);
2692
2693           if (err == REG_NOMATCH)
2694             continue;
2695           if (BE (err != REG_NOERROR, 0))
2696             return err;
2697         }
2698
2699       if (sub_last_idx < sub_top->nlasts)
2700         continue;
2701       if (sub_last_idx > 0)
2702         ++sl_str;
2703       /* Then, search for the other last nodes of the sub expression.  */
2704       for (; sl_str <= bkref_str_idx; ++sl_str)
2705         {
2706           int cls_node, sl_str_off;
2707           const re_node_set *nodes;
2708           sl_str_off = sl_str - sub_top->str_idx;
2709           /* The matched string by the sub expression match with the substring
2710              at the back reference?  */
2711           if (sl_str_off > 0)
2712             {
2713               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2714                 {
2715                   /* If we are at the end of the input, we cannot match.  */
2716                   if (bkref_str_off >= mctx->input.len)
2717                     break;
2718
2719                   err = extend_buffers (mctx);
2720                   if (BE (err != REG_NOERROR, 0))
2721                     return err;
2722
2723                   buf = (const char *) re_string_get_buffer (&mctx->input);
2724                 }
2725               if (buf [bkref_str_off++] != buf[sl_str - 1])
2726                 break; /* We don't need to search this sub expression
2727                           any more.  */
2728             }
2729           if (mctx->state_log[sl_str] == NULL)
2730             continue;
2731           /* Does this state have a ')' of the sub expression?  */
2732           nodes = &mctx->state_log[sl_str]->nodes;
2733           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2734           if (cls_node == -1)
2735             continue; /* No.  */
2736           if (sub_top->path == NULL)
2737             {
2738               sub_top->path = calloc (sizeof (state_array_t),
2739                                       sl_str - sub_top->str_idx + 1);
2740               if (sub_top->path == NULL)
2741                 return REG_ESPACE;
2742             }
2743           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2744              in the current context?  */
2745           err = check_arrival (mctx, sub_top->path, sub_top->node,
2746                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2747           if (err == REG_NOMATCH)
2748               continue;
2749           if (BE (err != REG_NOERROR, 0))
2750               return err;
2751           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2752           if (BE (sub_last == NULL, 0))
2753             return REG_ESPACE;
2754           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2755                                 bkref_str_idx);
2756           if (err == REG_NOMATCH)
2757             continue;
2758         }
2759     }
2760   return REG_NOERROR;
2761 }
2762
2763 /* Helper functions for get_subexp().  */
2764
2765 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2766    If it can arrive, register the sub expression expressed with SUB_TOP
2767    and SUB_LAST.  */
2768
2769 static reg_errcode_t
2770 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2771      re_match_context_t *mctx;
2772      const re_sub_match_top_t *sub_top;
2773      re_sub_match_last_t *sub_last;
2774      int bkref_node, bkref_str;
2775 {
2776   reg_errcode_t err;
2777   int to_idx;
2778   /* Can the subexpression arrive the back reference?  */
2779   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2780                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2781   if (err != REG_NOERROR)
2782     return err;
2783   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2784                              sub_last->str_idx);
2785   if (BE (err != REG_NOERROR, 0))
2786     return err;
2787   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2788   return clean_state_log_if_needed (mctx, to_idx);
2789 }
2790
2791 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2792    Search '(' if FL_OPEN, or search ')' otherwise.
2793    TODO: This function isn't efficient...
2794          Because there might be more than one nodes whose types are
2795          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2796          nodes.
2797          E.g. RE: (a){2}  */
2798
2799 static int
2800 find_subexp_node (dfa, nodes, subexp_idx, type)
2801      const re_dfa_t *dfa;
2802      const re_node_set *nodes;
2803      int subexp_idx, type;
2804 {
2805   int cls_idx;
2806   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2807     {
2808       int cls_node = nodes->elems[cls_idx];
2809       const re_token_t *node = dfa->nodes + cls_node;
2810       if (node->type == type
2811           && node->opr.idx == subexp_idx)
2812         return cls_node;
2813     }
2814   return -1;
2815 }
2816
2817 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2818    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2819    heavily reused.
2820    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2821
2822 static reg_errcode_t
2823 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2824                type)
2825      re_match_context_t *mctx;
2826      state_array_t *path;
2827      int top_node, top_str, last_node, last_str, type;
2828 {
2829   re_dfa_t *const dfa = mctx->dfa;
2830   reg_errcode_t err;
2831   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2832   re_dfastate_t *cur_state = NULL;
2833   re_node_set *cur_nodes, next_nodes;
2834   re_dfastate_t **backup_state_log;
2835   unsigned int context;
2836
2837   subexp_num = dfa->nodes[top_node].opr.idx;
2838   /* Extend the buffer if we need.  */
2839   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2840     {
2841       re_dfastate_t **new_array;
2842       int old_alloc = path->alloc;
2843       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2844       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2845       if (new_array == NULL)
2846         {
2847           path->alloc = old_alloc;
2848           return REG_ESPACE;
2849         }
2850       path->array = new_array;
2851       memset (new_array + old_alloc, '\0',
2852               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2853     }
2854
2855   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2856
2857   /* Temporary modify MCTX.  */
2858   backup_state_log = mctx->state_log;
2859   backup_cur_idx = mctx->input.cur_idx;
2860   mctx->state_log = path->array;
2861   mctx->input.cur_idx = str_idx;
2862
2863   /* Setup initial node set.  */
2864   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2865   if (str_idx == top_str)
2866     {
2867       err = re_node_set_init_1 (&next_nodes, top_node);
2868       if (BE (err != REG_NOERROR, 0))
2869         return err;
2870       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2871       if (BE (err != REG_NOERROR, 0))
2872         {
2873           re_node_set_free (&next_nodes);
2874           return err;
2875         }
2876     }
2877   else
2878     {
2879       cur_state = mctx->state_log[str_idx];
2880       if (cur_state && cur_state->has_backref)
2881         {
2882           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2883           if (BE ( err != REG_NOERROR, 0))
2884             return err;
2885         }
2886       else
2887         re_node_set_init_empty (&next_nodes);
2888     }
2889   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2890     {
2891       if (next_nodes.nelem)
2892         {
2893           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2894                                     subexp_num, type);
2895           if (BE ( err != REG_NOERROR, 0))
2896             {
2897               re_node_set_free (&next_nodes);
2898               return err;
2899             }
2900         }
2901       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2902       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2903         {
2904           re_node_set_free (&next_nodes);
2905           return err;
2906         }
2907       mctx->state_log[str_idx] = cur_state;
2908     }
2909
2910   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2911     {
2912       re_node_set_empty (&next_nodes);
2913       if (mctx->state_log[str_idx + 1])
2914         {
2915           err = re_node_set_merge (&next_nodes,
2916                                    &mctx->state_log[str_idx + 1]->nodes);
2917           if (BE (err != REG_NOERROR, 0))
2918             {
2919               re_node_set_free (&next_nodes);
2920               return err;
2921             }
2922         }
2923       if (cur_state)
2924         {
2925           err = check_arrival_add_next_nodes (mctx, str_idx,
2926                                               &cur_state->nodes, &next_nodes);
2927           if (BE (err != REG_NOERROR, 0))
2928             {
2929               re_node_set_free (&next_nodes);
2930               return err;
2931             }
2932         }
2933       ++str_idx;
2934       if (next_nodes.nelem)
2935         {
2936           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2937           if (BE (err != REG_NOERROR, 0))
2938             {
2939               re_node_set_free (&next_nodes);
2940               return err;
2941             }
2942           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2943                                     subexp_num, type);
2944           if (BE ( err != REG_NOERROR, 0))
2945             {
2946               re_node_set_free (&next_nodes);
2947               return err;
2948             }
2949         }
2950       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2951       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2952       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2953         {
2954           re_node_set_free (&next_nodes);
2955           return err;
2956         }
2957       mctx->state_log[str_idx] = cur_state;
2958       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2959     }
2960   re_node_set_free (&next_nodes);
2961   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2962                : &mctx->state_log[last_str]->nodes);
2963   path->next_idx = str_idx;
2964
2965   /* Fix MCTX.  */
2966   mctx->state_log = backup_state_log;
2967   mctx->input.cur_idx = backup_cur_idx;
2968
2969   /* Then check the current node set has the node LAST_NODE.  */
2970   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2971     return REG_NOERROR;
2972
2973   return REG_NOMATCH;
2974 }
2975
2976 /* Helper functions for check_arrival.  */
2977
2978 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2979    to NEXT_NODES.
2980    TODO: This function is similar to the functions transit_state*(),
2981          however this function has many additional works.
2982          Can't we unify them?  */
2983
2984 static reg_errcode_t
2985 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
2986      re_match_context_t *mctx;
2987      int str_idx;
2988      re_node_set *cur_nodes, *next_nodes;
2989 {
2990   re_dfa_t *const dfa = mctx->dfa;
2991   int result;
2992   int cur_idx;
2993   reg_errcode_t err;
2994   re_node_set union_set;
2995   re_node_set_init_empty (&union_set);
2996   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2997     {
2998       int naccepted = 0;
2999       int cur_node = cur_nodes->elems[cur_idx];
3000       re_token_type_t type = dfa->nodes[cur_node].type;
3001       if (IS_EPSILON_NODE (type))
3002         continue;
3003 #ifdef RE_ENABLE_I18N
3004       /* If the node may accept `multi byte'.  */
3005       if (ACCEPT_MB_NODE (type))
3006         {
3007           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3008                                                str_idx);
3009           if (naccepted > 1)
3010             {
3011               re_dfastate_t *dest_state;
3012               int next_node = dfa->nexts[cur_node];
3013               int next_idx = str_idx + naccepted;
3014               dest_state = mctx->state_log[next_idx];
3015               re_node_set_empty (&union_set);
3016               if (dest_state)
3017                 {
3018                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3019                   if (BE (err != REG_NOERROR, 0))
3020                     {
3021                       re_node_set_free (&union_set);
3022                       return err;
3023                     }
3024                 }
3025               result = re_node_set_insert (&union_set, next_node);
3026               if (BE (result < 0, 0))
3027                 {
3028                   re_node_set_free (&union_set);
3029                   return REG_ESPACE;
3030                 }
3031               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3032                                                             &union_set);
3033               if (BE (mctx->state_log[next_idx] == NULL
3034                       && err != REG_NOERROR, 0))
3035                 {
3036                   re_node_set_free (&union_set);
3037                   return err;
3038                 }
3039             }
3040         }
3041 #endif /* RE_ENABLE_I18N */
3042       if (naccepted
3043           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3044         {
3045           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3046           if (BE (result < 0, 0))
3047             {
3048               re_node_set_free (&union_set);
3049               return REG_ESPACE;
3050             }
3051         }
3052     }
3053   re_node_set_free (&union_set);
3054   return REG_NOERROR;
3055 }
3056
3057 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3058    CUR_NODES, however exclude the nodes which are:
3059     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3060     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3061 */
3062
3063 static reg_errcode_t
3064 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3065      re_dfa_t *dfa;
3066      re_node_set *cur_nodes;
3067      int ex_subexp, type;
3068 {
3069   reg_errcode_t err;
3070   int idx, outside_node;
3071   re_node_set new_nodes;
3072 #ifdef DEBUG
3073   assert (cur_nodes->nelem);
3074 #endif
3075   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3076   if (BE (err != REG_NOERROR, 0))
3077     return err;
3078   /* Create a new node set NEW_NODES with the nodes which are epsilon
3079      closures of the node in CUR_NODES.  */
3080
3081   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3082     {
3083       int cur_node = cur_nodes->elems[idx];
3084       re_node_set *eclosure = dfa->eclosures + cur_node;
3085       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3086       if (outside_node == -1)
3087         {
3088           /* There are no problematic nodes, just merge them.  */
3089           err = re_node_set_merge (&new_nodes, eclosure);
3090           if (BE (err != REG_NOERROR, 0))
3091             {
3092               re_node_set_free (&new_nodes);
3093               return err;
3094             }
3095         }
3096       else
3097         {
3098           /* There are problematic nodes, re-calculate incrementally.  */
3099           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3100                                               ex_subexp, type);
3101           if (BE (err != REG_NOERROR, 0))
3102             {
3103               re_node_set_free (&new_nodes);
3104               return err;
3105             }
3106         }
3107     }
3108   re_node_set_free (cur_nodes);
3109   *cur_nodes = new_nodes;
3110   return REG_NOERROR;
3111 }
3112
3113 /* Helper function for check_arrival_expand_ecl.
3114    Check incrementally the epsilon closure of TARGET, and if it isn't
3115    problematic append it to DST_NODES.  */
3116
3117 static reg_errcode_t
3118 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3119      re_dfa_t *dfa;
3120      int target, ex_subexp, type;
3121      re_node_set *dst_nodes;
3122 {
3123   int cur_node;
3124   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3125     {
3126       int err;
3127
3128       if (dfa->nodes[cur_node].type == type
3129           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3130         {
3131           if (type == OP_CLOSE_SUBEXP)
3132             {
3133               err = re_node_set_insert (dst_nodes, cur_node);
3134               if (BE (err == -1, 0))
3135                 return REG_ESPACE;
3136             }
3137           break;
3138         }
3139       err = re_node_set_insert (dst_nodes, cur_node);
3140       if (BE (err == -1, 0))
3141         return REG_ESPACE;
3142       if (dfa->edests[cur_node].nelem == 0)
3143         break;
3144       if (dfa->edests[cur_node].nelem == 2)
3145         {
3146           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3147                                               dfa->edests[cur_node].elems[1],
3148                                               ex_subexp, type);
3149           if (BE (err != REG_NOERROR, 0))
3150             return err;
3151         }
3152       cur_node = dfa->edests[cur_node].elems[0];
3153     }
3154   return REG_NOERROR;
3155 }
3156
3157
3158 /* For all the back references in the current state, calculate the
3159    destination of the back references by the appropriate entry
3160    in MCTX->BKREF_ENTS.  */
3161
3162 static reg_errcode_t
3163 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3164                     type)
3165      re_match_context_t *mctx;
3166      int cur_str, subexp_num, type;
3167      re_node_set *cur_nodes;
3168 {
3169   re_dfa_t *const dfa = mctx->dfa;
3170   reg_errcode_t err;
3171   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3172   struct re_backref_cache_entry *ent;
3173
3174   if (cache_idx_start == -1)
3175     return REG_NOERROR;
3176
3177  restart:
3178   ent = mctx->bkref_ents + cache_idx_start;
3179   do
3180     {
3181       int to_idx, next_node;
3182
3183       /* Is this entry ENT is appropriate?  */
3184       if (!re_node_set_contains (cur_nodes, ent->node))
3185         continue; /* No.  */
3186
3187       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3188       /* Calculate the destination of the back reference, and append it
3189          to MCTX->STATE_LOG.  */
3190       if (to_idx == cur_str)
3191         {
3192           /* The backreference did epsilon transit, we must re-check all the
3193              node in the current state.  */
3194           re_node_set new_dests;
3195           reg_errcode_t err2, err3;
3196           next_node = dfa->edests[ent->node].elems[0];
3197           if (re_node_set_contains (cur_nodes, next_node))
3198             continue;
3199           err = re_node_set_init_1 (&new_dests, next_node);
3200           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3201           err3 = re_node_set_merge (cur_nodes, &new_dests);
3202           re_node_set_free (&new_dests);
3203           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3204                   || err3 != REG_NOERROR, 0))
3205             {
3206               err = (err != REG_NOERROR ? err
3207                      : (err2 != REG_NOERROR ? err2 : err3));
3208               return err;
3209             }
3210           /* TODO: It is still inefficient...  */
3211           goto restart;
3212         }
3213       else
3214         {
3215           re_node_set union_set;
3216           next_node = dfa->nexts[ent->node];
3217           if (mctx->state_log[to_idx])
3218             {
3219               int ret;
3220               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3221                                         next_node))
3222                 continue;
3223               err = re_node_set_init_copy (&union_set,
3224                                            &mctx->state_log[to_idx]->nodes);
3225               ret = re_node_set_insert (&union_set, next_node);
3226               if (BE (err != REG_NOERROR || ret < 0, 0))
3227                 {
3228                   re_node_set_free (&union_set);
3229                   err = err != REG_NOERROR ? err : REG_ESPACE;
3230                   return err;
3231                 }
3232             }
3233           else
3234             {
3235               err = re_node_set_init_1 (&union_set, next_node);
3236               if (BE (err != REG_NOERROR, 0))
3237                 return err;
3238             }
3239           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3240           re_node_set_free (&union_set);
3241           if (BE (mctx->state_log[to_idx] == NULL
3242                   && err != REG_NOERROR, 0))
3243             return err;
3244         }
3245     }
3246   while (ent++->more);
3247   return REG_NOERROR;
3248 }
3249
3250 /* Build transition table for the state.
3251    Return the new table if succeeded, otherwise return NULL.  */
3252
3253 static re_dfastate_t **
3254 build_trtable (dfa, state)
3255     re_dfa_t *dfa;
3256     re_dfastate_t *state;
3257 {
3258   reg_errcode_t err;
3259   int i, j, ch;
3260   unsigned int elem, mask;
3261   int dests_node_malloced = 0, dest_states_malloced = 0;
3262   int ndests; /* Number of the destination states from `state'.  */
3263   re_dfastate_t **trtable;
3264   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3265   re_node_set follows, *dests_node;
3266   bitset *dests_ch;
3267   bitset acceptable;
3268
3269   /* We build DFA states which corresponds to the destination nodes
3270      from `state'.  `dests_node[i]' represents the nodes which i-th
3271      destination state contains, and `dests_ch[i]' represents the
3272      characters which i-th destination state accepts.  */
3273 #ifdef _LIBC
3274   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3275     dests_node = (re_node_set *)
3276                  alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3277   else
3278 #endif
3279     {
3280       dests_node = (re_node_set *)
3281                    malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3282       if (BE (dests_node == NULL, 0))
3283         return NULL;
3284       dests_node_malloced = 1;
3285     }
3286   dests_ch = (bitset *) (dests_node + SBC_MAX);
3287
3288   /* Initialize transiton table.  */
3289   state->word_trtable = 0;
3290
3291   /* At first, group all nodes belonging to `state' into several
3292      destinations.  */
3293   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3294   if (BE (ndests <= 0, 0))
3295     {
3296       if (dests_node_malloced)
3297         free (dests_node);
3298       /* Return NULL in case of an error, trtable otherwise.  */
3299       if (ndests == 0)
3300         {
3301           state->trtable = (re_dfastate_t **)
3302             calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3303           return state->trtable;
3304         }
3305       return NULL;
3306     }
3307
3308   err = re_node_set_alloc (&follows, ndests + 1);
3309   if (BE (err != REG_NOERROR, 0))
3310     goto out_free;
3311
3312 #ifdef _LIBC
3313   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3314                          + ndests * 3 * sizeof (re_dfastate_t *)))
3315     dest_states = (re_dfastate_t **)
3316                   alloca (ndests * 3 * sizeof (re_dfastate_t *));
3317   else
3318 #endif
3319     {
3320       dest_states = (re_dfastate_t **)
3321                     malloc (ndests * 3 * sizeof (re_dfastate_t *));
3322       if (BE (dest_states == NULL, 0))
3323         {
3324 out_free:
3325           if (dest_states_malloced)
3326             free (dest_states);
3327           re_node_set_free (&follows);
3328           for (i = 0; i < ndests; ++i)
3329             re_node_set_free (dests_node + i);
3330           if (dests_node_malloced)
3331             free (dests_node);
3332           return NULL;
3333         }
3334       dest_states_malloced = 1;
3335     }
3336   dest_states_word = dest_states + ndests;
3337   dest_states_nl = dest_states_word + ndests;
3338   bitset_empty (acceptable);
3339
3340   /* Then build the states for all destinations.  */
3341   for (i = 0; i < ndests; ++i)
3342     {
3343       int next_node;
3344       re_node_set_empty (&follows);
3345       /* Merge the follows of this destination states.  */
3346       for (j = 0; j < dests_node[i].nelem; ++j)
3347         {
3348           next_node = dfa->nexts[dests_node[i].elems[j]];
3349           if (next_node != -1)
3350             {
3351               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3352               if (BE (err != REG_NOERROR, 0))
3353                 goto out_free;
3354             }
3355         }
3356       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3357       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3358         goto out_free;
3359       /* If the new state has context constraint,
3360          build appropriate states for these contexts.  */
3361       if (dest_states[i]->has_constraint)
3362         {
3363           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3364                                                           CONTEXT_WORD);
3365           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3366             goto out_free;
3367
3368           if (dest_states[i] != dest_states_word[i]
3369               && dfa->mb_cur_max > 1)
3370             state->word_trtable = 1;
3371
3372           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3373                                                         CONTEXT_NEWLINE);
3374           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3375             goto out_free;
3376         }
3377       else
3378         {
3379           dest_states_word[i] = dest_states[i];
3380           dest_states_nl[i] = dest_states[i];
3381         }
3382       bitset_merge (acceptable, dests_ch[i]);
3383     }
3384
3385   if (!BE (state->word_trtable, 0))
3386     {
3387       /* We don't care about whether the following character is a word
3388          character, or we are in a single-byte character set so we can
3389          discern by looking at the character code: allocate a
3390          256-entry transition table.  */
3391       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3392       if (BE (trtable == NULL, 0))
3393         goto out_free;
3394
3395       /* For all characters ch...:  */
3396       for (i = 0; i < BITSET_UINTS; ++i)
3397         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3398              elem;
3399              mask <<= 1, elem >>= 1, ++ch)
3400           if (BE (elem & 1, 0))
3401             {
3402               /* There must be exactly one destination which accepts
3403                  character ch.  See group_nodes_into_DFAstates.  */
3404               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3405                 ;
3406
3407               /* j-th destination accepts the word character ch.  */
3408               if (dfa->word_char[i] & mask)
3409                 trtable[ch] = dest_states_word[j];
3410               else
3411                 trtable[ch] = dest_states[j];
3412             }
3413     }
3414   else
3415     {
3416       /* We care about whether the following character is a word
3417          character, and we are in a multi-byte character set: discern
3418          by looking at the character code: build two 256-entry
3419          transition tables, one starting at trtable[0] and one
3420          starting at trtable[SBC_MAX].  */
3421       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3422                                            2 * SBC_MAX);
3423       if (BE (trtable == NULL, 0))
3424         goto out_free;
3425
3426       /* For all characters ch...:  */
3427       for (i = 0; i < BITSET_UINTS; ++i)
3428         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3429              elem;
3430              mask <<= 1, elem >>= 1, ++ch)
3431           if (BE (elem & 1, 0))
3432             {
3433               /* There must be exactly one destination which accepts
3434                  character ch.  See group_nodes_into_DFAstates.  */
3435               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3436                 ;
3437
3438               /* j-th destination accepts the word character ch.  */
3439               trtable[ch] = dest_states[j];
3440               trtable[ch + SBC_MAX] = dest_states_word[j];
3441             }
3442     }
3443
3444   /* new line */
3445   if (bitset_contain (acceptable, NEWLINE_CHAR))
3446     {
3447       /* The current state accepts newline character.  */
3448       for (j = 0; j < ndests; ++j)
3449         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3450           {
3451             /* k-th destination accepts newline character.  */
3452             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3453             if (state->word_trtable)
3454               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3455             /* There must be only one destination which accepts
3456                newline.  See group_nodes_into_DFAstates.  */
3457             break;
3458           }
3459     }
3460
3461   if (dest_states_malloced)
3462     free (dest_states);
3463
3464   re_node_set_free (&follows);
3465   for (i = 0; i < ndests; ++i)
3466     re_node_set_free (dests_node + i);
3467
3468   if (dests_node_malloced)
3469     free (dests_node);
3470
3471   state->trtable = trtable;
3472   return trtable;
3473 }
3474
3475 /* Group all nodes belonging to STATE into several destinations.
3476    Then for all destinations, set the nodes belonging to the destination
3477    to DESTS_NODE[i] and set the characters accepted by the destination
3478    to DEST_CH[i].  This function return the number of destinations.  */
3479
3480 static int
3481 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3482     re_dfa_t *dfa;
3483     const re_dfastate_t *state;
3484     re_node_set *dests_node;
3485     bitset *dests_ch;
3486 {
3487   reg_errcode_t err;
3488   int result;
3489   int i, j, k;
3490   int ndests; /* Number of the destinations from `state'.  */
3491   bitset accepts; /* Characters a node can accept.  */
3492   const re_node_set *cur_nodes = &state->nodes;
3493   bitset_empty (accepts);
3494   ndests = 0;
3495
3496   /* For all the nodes belonging to `state',  */
3497   for (i = 0; i < cur_nodes->nelem; ++i)
3498     {
3499       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3500       re_token_type_t type = node->type;
3501       unsigned int constraint = node->constraint;
3502
3503       /* Enumerate all single byte character this node can accept.  */
3504       if (type == CHARACTER)
3505         bitset_set (accepts, node->opr.c);
3506       else if (type == SIMPLE_BRACKET)
3507         {
3508           bitset_merge (accepts, node->opr.sbcset);
3509         }
3510       else if (type == OP_PERIOD)
3511         {
3512 #ifdef RE_ENABLE_I18N
3513           if (dfa->mb_cur_max > 1)
3514             bitset_merge (accepts, dfa->sb_char);
3515           else
3516 #endif
3517             bitset_set_all (accepts);
3518           if (!(dfa->syntax & RE_DOT_NEWLINE))
3519             bitset_clear (accepts, '\n');
3520           if (dfa->syntax & RE_DOT_NOT_NULL)
3521             bitset_clear (accepts, '\0');
3522         }
3523 #ifdef RE_ENABLE_I18N
3524       else if (type == OP_UTF8_PERIOD)
3525         {
3526           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3527           if (!(dfa->syntax & RE_DOT_NEWLINE))
3528             bitset_clear (accepts, '\n');
3529           if (dfa->syntax & RE_DOT_NOT_NULL)
3530             bitset_clear (accepts, '\0');
3531         }
3532 #endif
3533       else
3534         continue;
3535
3536       /* Check the `accepts' and sift the characters which are not
3537          match it the context.  */
3538       if (constraint)
3539         {
3540           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3541             {
3542               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3543               bitset_empty (accepts);
3544               if (accepts_newline)
3545                 bitset_set (accepts, NEWLINE_CHAR);
3546               else
3547                 continue;
3548             }
3549           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3550             {
3551               bitset_empty (accepts);
3552               continue;
3553             }
3554
3555           if (constraint & NEXT_WORD_CONSTRAINT)
3556             {
3557               unsigned int any_set = 0;
3558               if (type == CHARACTER && !node->word_char)
3559                 {
3560                   bitset_empty (accepts);
3561                   continue;
3562                 }
3563 #ifdef RE_ENABLE_I18N
3564               if (dfa->mb_cur_max > 1)
3565                 for (j = 0; j < BITSET_UINTS; ++j)
3566                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3567               else
3568 #endif
3569                 for (j = 0; j < BITSET_UINTS; ++j)
3570                   any_set |= (accepts[j] &= dfa->word_char[j]);
3571               if (!any_set)
3572                 continue;
3573             }
3574           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3575             {
3576               unsigned int any_set = 0;
3577               if (type == CHARACTER && node->word_char)
3578                 {
3579                   bitset_empty (accepts);
3580                   continue;
3581                 }
3582 #ifdef RE_ENABLE_I18N
3583               if (dfa->mb_cur_max > 1)
3584                 for (j = 0; j < BITSET_UINTS; ++j)
3585                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3586               else
3587 #endif
3588                 for (j = 0; j < BITSET_UINTS; ++j)
3589                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3590               if (!any_set)
3591                 continue;
3592             }
3593         }
3594
3595       /* Then divide `accepts' into DFA states, or create a new
3596          state.  Above, we make sure that accepts is not empty.  */
3597       for (j = 0; j < ndests; ++j)
3598         {
3599           bitset intersec; /* Intersection sets, see below.  */
3600           bitset remains;
3601           /* Flags, see below.  */
3602           int has_intersec, not_subset, not_consumed;
3603
3604           /* Optimization, skip if this state doesn't accept the character.  */
3605           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3606             continue;
3607
3608           /* Enumerate the intersection set of this state and `accepts'.  */
3609           has_intersec = 0;
3610           for (k = 0; k < BITSET_UINTS; ++k)
3611             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3612           /* And skip if the intersection set is empty.  */
3613           if (!has_intersec)
3614             continue;
3615
3616           /* Then check if this state is a subset of `accepts'.  */
3617           not_subset = not_consumed = 0;
3618           for (k = 0; k < BITSET_UINTS; ++k)
3619             {
3620               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3621               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3622             }
3623
3624           /* If this state isn't a subset of `accepts', create a
3625              new group state, which has the `remains'. */
3626           if (not_subset)
3627             {
3628               bitset_copy (dests_ch[ndests], remains);
3629               bitset_copy (dests_ch[j], intersec);
3630               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3631               if (BE (err != REG_NOERROR, 0))
3632                 goto error_return;
3633               ++ndests;
3634             }
3635
3636           /* Put the position in the current group. */
3637           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3638           if (BE (result < 0, 0))
3639             goto error_return;
3640
3641           /* If all characters are consumed, go to next node. */
3642           if (!not_consumed)
3643             break;
3644         }
3645       /* Some characters remain, create a new group. */
3646       if (j == ndests)
3647         {
3648           bitset_copy (dests_ch[ndests], accepts);
3649           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3650           if (BE (err != REG_NOERROR, 0))
3651             goto error_return;
3652           ++ndests;
3653           bitset_empty (accepts);
3654         }
3655     }
3656   return ndests;
3657  error_return:
3658   for (j = 0; j < ndests; ++j)
3659     re_node_set_free (dests_node + j);
3660   return -1;
3661 }
3662
3663 #ifdef RE_ENABLE_I18N
3664 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3665    Return the number of the bytes the node accepts.
3666    STR_IDX is the current index of the input string.
3667
3668    This function handles the nodes which can accept one character, or
3669    one collating element like '.', '[a-z]', opposite to the other nodes
3670    can only accept one byte.  */
3671
3672 static int
3673 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3674     re_dfa_t *dfa;
3675     int node_idx, str_idx;
3676     const re_string_t *input;
3677 {
3678   const re_token_t *node = dfa->nodes + node_idx;
3679   int char_len, elem_len;
3680   int i;
3681
3682   if (BE (node->type == OP_UTF8_PERIOD, 0))
3683     {
3684       unsigned char c = re_string_byte_at (input, str_idx), d;
3685       if (BE (c < 0xc2, 1))
3686         return 0;
3687
3688       if (str_idx + 2 > input->len)
3689         return 0;
3690
3691       d = re_string_byte_at (input, str_idx + 1);
3692       if (c < 0xe0)
3693         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3694       else if (c < 0xf0)
3695         {
3696           char_len = 3;
3697           if (c == 0xe0 && d < 0xa0)
3698             return 0;
3699         }
3700       else if (c < 0xf8)
3701         {
3702           char_len = 4;
3703           if (c == 0xf0 && d < 0x90)
3704             return 0;
3705         }
3706       else if (c < 0xfc)
3707         {
3708           char_len = 5;
3709           if (c == 0xf8 && d < 0x88)
3710             return 0;
3711         }
3712       else if (c < 0xfe)
3713         {
3714           char_len = 6;
3715           if (c == 0xfc && d < 0x84)
3716             return 0;
3717         }
3718       else
3719         return 0;
3720
3721       if (str_idx + char_len > input->len)
3722         return 0;
3723
3724       for (i = 1; i < char_len; ++i)
3725         {
3726           d = re_string_byte_at (input, str_idx + i);
3727           if (d < 0x80 || d > 0xbf)
3728             return 0;
3729         }
3730       return char_len;
3731     }
3732
3733   char_len = re_string_char_size_at (input, str_idx);
3734   if (node->type == OP_PERIOD)
3735     {
3736       if (char_len <= 1)
3737         return 0;
3738       /* FIXME: I don't think this if is needed, as both '\n'
3739          and '\0' are char_len == 1.  */
3740       /* '.' accepts any one character except the following two cases.  */
3741       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3742            re_string_byte_at (input, str_idx) == '\n') ||
3743           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3744            re_string_byte_at (input, str_idx) == '\0'))
3745         return 0;
3746       return char_len;
3747     }
3748
3749   elem_len = re_string_elem_size_at (input, str_idx);
3750   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3751     return 0;
3752
3753   if (node->type == COMPLEX_BRACKET)
3754     {
3755       const re_charset_t *cset = node->opr.mbcset;
3756 # ifdef _LIBC
3757       const unsigned char *pin = ((char *) re_string_get_buffer (input)
3758                                   + str_idx);
3759       int j;
3760       uint32_t nrules;
3761 # endif /* _LIBC */
3762       int match_len = 0;
3763       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3764                     ? re_string_wchar_at (input, str_idx) : 0);
3765
3766       /* match with multibyte character?  */
3767       for (i = 0; i < cset->nmbchars; ++i)
3768         if (wc == cset->mbchars[i])
3769           {
3770             match_len = char_len;
3771             goto check_node_accept_bytes_match;
3772           }
3773       /* match with character_class?  */
3774       for (i = 0; i < cset->nchar_classes; ++i)
3775         {
3776           wctype_t wt = cset->char_classes[i];
3777           if (__iswctype (wc, wt))
3778             {
3779               match_len = char_len;
3780               goto check_node_accept_bytes_match;
3781             }
3782         }
3783
3784 # ifdef _LIBC
3785       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3786       if (nrules != 0)
3787         {
3788           unsigned int in_collseq = 0;
3789           const int32_t *table, *indirect;
3790           const unsigned char *weights, *extra;
3791           const char *collseqwc;
3792           int32_t idx;
3793           /* This #include defines a local function!  */
3794 #  include <locale/weight.h>
3795
3796           /* match with collating_symbol?  */
3797           if (cset->ncoll_syms)
3798             extra = (const unsigned char *)
3799               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3800           for (i = 0; i < cset->ncoll_syms; ++i)
3801             {
3802               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3803               /* Compare the length of input collating element and
3804                  the length of current collating element.  */
3805               if (*coll_sym != elem_len)
3806                 continue;
3807               /* Compare each bytes.  */
3808               for (j = 0; j < *coll_sym; j++)
3809                 if (pin[j] != coll_sym[1 + j])
3810                   break;
3811               if (j == *coll_sym)
3812                 {
3813                   /* Match if every bytes is equal.  */
3814                   match_len = j;
3815                   goto check_node_accept_bytes_match;
3816                 }
3817             }
3818
3819           if (cset->nranges)
3820             {
3821               if (elem_len <= char_len)
3822                 {
3823                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3824                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3825                 }
3826               else
3827                 in_collseq = find_collation_sequence_value (pin, elem_len);
3828             }
3829           /* match with range expression?  */
3830           for (i = 0; i < cset->nranges; ++i)
3831             if (cset->range_starts[i] <= in_collseq
3832                 && in_collseq <= cset->range_ends[i])
3833               {
3834                 match_len = elem_len;
3835                 goto check_node_accept_bytes_match;
3836               }
3837
3838           /* match with equivalence_class?  */
3839           if (cset->nequiv_classes)
3840             {
3841               const unsigned char *cp = pin;
3842               table = (const int32_t *)
3843                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3844               weights = (const unsigned char *)
3845                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3846               extra = (const unsigned char *)
3847                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3848               indirect = (const int32_t *)
3849                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3850               idx = findidx (&cp);
3851               if (idx > 0)
3852                 for (i = 0; i < cset->nequiv_classes; ++i)
3853                   {
3854                     int32_t equiv_class_idx = cset->equiv_classes[i];
3855                     size_t weight_len = weights[idx];
3856                     if (weight_len == weights[equiv_class_idx])
3857                       {
3858                         int cnt = 0;
3859                         while (cnt <= weight_len
3860                                && (weights[equiv_class_idx + 1 + cnt]
3861                                    == weights[idx + 1 + cnt]))
3862                           ++cnt;
3863                         if (cnt > weight_len)
3864                           {
3865                             match_len = elem_len;
3866                             goto check_node_accept_bytes_match;
3867                           }
3868                       }
3869                   }
3870             }
3871         }
3872       else
3873 # endif /* _LIBC */
3874         {
3875           /* match with range expression?  */
3876 #if __GNUC__ >= 2
3877           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3878 #else
3879           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3880           cmp_buf[2] = wc;
3881 #endif
3882           for (i = 0; i < cset->nranges; ++i)
3883             {
3884               cmp_buf[0] = cset->range_starts[i];
3885               cmp_buf[4] = cset->range_ends[i];
3886               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3887                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3888                 {
3889                   match_len = char_len;
3890                   goto check_node_accept_bytes_match;
3891                 }
3892             }
3893         }
3894     check_node_accept_bytes_match:
3895       if (!cset->non_match)
3896         return match_len;
3897       else
3898         {
3899           if (match_len > 0)
3900             return 0;
3901           else
3902             return (elem_len > char_len) ? elem_len : char_len;
3903         }
3904     }
3905   return 0;
3906 }
3907
3908 # ifdef _LIBC
3909 static unsigned int
3910 find_collation_sequence_value (mbs, mbs_len)
3911     const unsigned char *mbs;
3912     size_t mbs_len;
3913 {
3914   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3915   if (nrules == 0)
3916     {
3917       if (mbs_len == 1)
3918         {
3919           /* No valid character.  Match it as a single byte character.  */
3920           const unsigned char *collseq = (const unsigned char *)
3921             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3922           return collseq[mbs[0]];
3923         }
3924       return UINT_MAX;
3925     }
3926   else
3927     {
3928       int32_t idx;
3929       const unsigned char *extra = (const unsigned char *)
3930         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3931       int32_t extrasize = (const unsigned char *)
3932         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3933
3934       for (idx = 0; idx < extrasize;)
3935         {
3936           int mbs_cnt, found = 0;
3937           int32_t elem_mbs_len;
3938           /* Skip the name of collating element name.  */
3939           idx = idx + extra[idx] + 1;
3940           elem_mbs_len = extra[idx++];
3941           if (mbs_len == elem_mbs_len)
3942             {
3943               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3944                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3945                   break;
3946               if (mbs_cnt == elem_mbs_len)
3947                 /* Found the entry.  */
3948                 found = 1;
3949             }
3950           /* Skip the byte sequence of the collating element.  */
3951           idx += elem_mbs_len;
3952           /* Adjust for the alignment.  */
3953           idx = (idx + 3) & ~3;
3954           /* Skip the collation sequence value.  */
3955           idx += sizeof (uint32_t);
3956           /* Skip the wide char sequence of the collating element.  */
3957           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3958           /* If we found the entry, return the sequence value.  */
3959           if (found)
3960             return *(uint32_t *) (extra + idx);
3961           /* Skip the collation sequence value.  */
3962           idx += sizeof (uint32_t);
3963         }
3964       return UINT_MAX;
3965     }
3966 }
3967 # endif /* _LIBC */
3968 #endif /* RE_ENABLE_I18N */
3969
3970 /* Check whether the node accepts the byte which is IDX-th
3971    byte of the INPUT.  */
3972
3973 static int
3974 check_node_accept (mctx, node, idx)
3975     const re_match_context_t *mctx;
3976     const re_token_t *node;
3977     int idx;
3978 {
3979   re_dfa_t *const dfa = mctx->dfa;
3980   unsigned char ch;
3981   if (node->constraint)
3982     {
3983       /* The node has constraints.  Check whether the current context
3984          satisfies the constraints.  */
3985       unsigned int context = re_string_context_at (&mctx->input, idx,
3986                                                    mctx->eflags);
3987       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3988         return 0;
3989     }
3990   ch = re_string_byte_at (&mctx->input, idx);
3991   switch (node->type)
3992     {
3993     case CHARACTER:
3994       return node->opr.c == ch;
3995     case SIMPLE_BRACKET:
3996       return bitset_contain (node->opr.sbcset, ch);
3997 #ifdef RE_ENABLE_I18N
3998     case OP_UTF8_PERIOD:
3999       if (ch >= 0x80)
4000         return 0;
4001       /* FALLTHROUGH */
4002 #endif
4003     case OP_PERIOD:
4004       return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
4005                || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
4006     default:
4007       return 0;
4008     }
4009 }
4010
4011 /* Extend the buffers, if the buffers have run out.  */
4012
4013 static reg_errcode_t
4014 extend_buffers (mctx)
4015      re_match_context_t *mctx;
4016 {
4017   reg_errcode_t ret;
4018   re_string_t *pstr = &mctx->input;
4019
4020   /* Double the lengthes of the buffers.  */
4021   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4022   if (BE (ret != REG_NOERROR, 0))
4023     return ret;
4024
4025   if (mctx->state_log != NULL)
4026     {
4027       /* And double the length of state_log.  */
4028       /* XXX We have no indication of the size of this buffer.  If this
4029          allocation fail we have no indication that the state_log array
4030          does not have the right size.  */
4031       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4032                                               pstr->bufs_len + 1);
4033       if (BE (new_array == NULL, 0))
4034         return REG_ESPACE;
4035       mctx->state_log = new_array;
4036     }
4037
4038   /* Then reconstruct the buffers.  */
4039   if (pstr->icase)
4040     {
4041 #ifdef RE_ENABLE_I18N
4042       if (pstr->mb_cur_max > 1)
4043         {
4044           ret = build_wcs_upper_buffer (pstr);
4045           if (BE (ret != REG_NOERROR, 0))
4046             return ret;
4047         }
4048       else
4049 #endif /* RE_ENABLE_I18N  */
4050         build_upper_buffer (pstr);
4051     }
4052   else
4053     {
4054 #ifdef RE_ENABLE_I18N
4055       if (pstr->mb_cur_max > 1)
4056         build_wcs_buffer (pstr);
4057       else
4058 #endif /* RE_ENABLE_I18N  */
4059         {
4060           if (pstr->trans != NULL)
4061             re_string_translate_buffer (pstr);
4062         }
4063     }
4064   return REG_NOERROR;
4065 }
4066
4067 \f
4068 /* Functions for matching context.  */
4069
4070 /* Initialize MCTX.  */
4071
4072 static reg_errcode_t
4073 match_ctx_init (mctx, eflags, n)
4074     re_match_context_t *mctx;
4075     int eflags, n;
4076 {
4077   mctx->eflags = eflags;
4078   mctx->match_last = -1;
4079   if (n > 0)
4080     {
4081       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4082       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4083       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4084         return REG_ESPACE;
4085     }
4086   /* Already zero-ed by the caller.
4087      else
4088        mctx->bkref_ents = NULL;
4089      mctx->nbkref_ents = 0;
4090      mctx->nsub_tops = 0;  */
4091   mctx->abkref_ents = n;
4092   mctx->max_mb_elem_len = 1;
4093   mctx->asub_tops = n;
4094   return REG_NOERROR;
4095 }
4096
4097 /* Clean the entries which depend on the current input in MCTX.
4098    This function must be invoked when the matcher changes the start index
4099    of the input, or changes the input string.  */
4100
4101 static void
4102 match_ctx_clean (mctx)
4103     re_match_context_t *mctx;
4104 {
4105   match_ctx_free_subtops (mctx);
4106   mctx->nsub_tops = 0;
4107   mctx->nbkref_ents = 0;
4108 }
4109
4110 /* Free all the memory associated with MCTX.  */
4111
4112 static void
4113 match_ctx_free (mctx)
4114     re_match_context_t *mctx;
4115 {
4116   match_ctx_free_subtops (mctx);
4117   re_free (mctx->sub_tops);
4118   re_free (mctx->bkref_ents);
4119 }
4120
4121 /* Free all the memory associated with MCTX->SUB_TOPS.  */
4122
4123 static void
4124 match_ctx_free_subtops (mctx)
4125      re_match_context_t *mctx;
4126 {
4127   int st_idx;
4128   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4129     {
4130       int sl_idx;
4131       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4132       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4133         {
4134           re_sub_match_last_t *last = top->lasts[sl_idx];
4135           re_free (last->path.array);
4136           re_free (last);
4137         }
4138       re_free (top->lasts);
4139       if (top->path)
4140         {
4141           re_free (top->path->array);
4142           re_free (top->path);
4143         }
4144       free (top);
4145     }
4146 }
4147
4148 /* Add a new backreference entry to MCTX.
4149    Note that we assume that caller never call this function with duplicate
4150    entry, and call with STR_IDX which isn't smaller than any existing entry.
4151 */
4152
4153 static reg_errcode_t
4154 match_ctx_add_entry (mctx, node, str_idx, from, to)
4155      re_match_context_t *mctx;
4156      int node, str_idx, from, to;
4157 {
4158   if (mctx->nbkref_ents >= mctx->abkref_ents)
4159     {
4160       struct re_backref_cache_entry* new_entry;
4161       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4162                               mctx->abkref_ents * 2);
4163       if (BE (new_entry == NULL, 0))
4164         {
4165           re_free (mctx->bkref_ents);
4166           return REG_ESPACE;
4167         }
4168       mctx->bkref_ents = new_entry;
4169       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4170               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4171       mctx->abkref_ents *= 2;
4172     }
4173   if (mctx->nbkref_ents > 0
4174       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4175     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4176
4177   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4178   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4179   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4180   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4181   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4182   if (mctx->max_mb_elem_len < to - from)
4183     mctx->max_mb_elem_len = to - from;
4184   return REG_NOERROR;
4185 }
4186
4187 /* Search for the first entry which has the same str_idx, or -1 if none is
4188    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4189
4190 static int
4191 search_cur_bkref_entry (mctx, str_idx)
4192      re_match_context_t *mctx;
4193      int str_idx;
4194 {
4195   int left, right, mid, last;
4196   last = right = mctx->nbkref_ents;
4197   for (left = 0; left < right;)
4198     {
4199       mid = (left + right) / 2;
4200       if (mctx->bkref_ents[mid].str_idx < str_idx)
4201         left = mid + 1;
4202       else
4203         right = mid;
4204     }
4205   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4206     return left;
4207   else
4208     return -1;
4209 }
4210
4211 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4212    at STR_IDX.  */
4213
4214 static reg_errcode_t
4215 match_ctx_add_subtop (mctx, node, str_idx)
4216      re_match_context_t *mctx;
4217      int node, str_idx;
4218 {
4219 #ifdef DEBUG
4220   assert (mctx->sub_tops != NULL);
4221   assert (mctx->asub_tops > 0);
4222 #endif
4223   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4224     {
4225       int new_asub_tops = mctx->asub_tops * 2;
4226       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4227                                                    re_sub_match_top_t *,
4228                                                    new_asub_tops);
4229       if (BE (new_array == NULL, 0))
4230         return REG_ESPACE;
4231       mctx->sub_tops = new_array;
4232       mctx->asub_tops = new_asub_tops;
4233     }
4234   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4235   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4236     return REG_ESPACE;
4237   mctx->sub_tops[mctx->nsub_tops]->node = node;
4238   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4239   return REG_NOERROR;
4240 }
4241
4242 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4243    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4244
4245 static re_sub_match_last_t *
4246 match_ctx_add_sublast (subtop, node, str_idx)
4247      re_sub_match_top_t *subtop;
4248      int node, str_idx;
4249 {
4250   re_sub_match_last_t *new_entry;
4251   if (BE (subtop->nlasts == subtop->alasts, 0))
4252     {
4253       int new_alasts = 2 * subtop->alasts + 1;
4254       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4255                                                     re_sub_match_last_t *,
4256                                                     new_alasts);
4257       if (BE (new_array == NULL, 0))
4258         return NULL;
4259       subtop->lasts = new_array;
4260       subtop->alasts = new_alasts;
4261     }
4262   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4263   if (BE (new_entry != NULL, 1))
4264     {
4265       subtop->lasts[subtop->nlasts] = new_entry;
4266       new_entry->node = node;
4267       new_entry->str_idx = str_idx;
4268       ++subtop->nlasts;
4269     }
4270   return new_entry;
4271 }
4272
4273 static void
4274 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4275     re_sift_context_t *sctx;
4276     re_dfastate_t **sifted_sts, **limited_sts;
4277     int last_node, last_str_idx;
4278 {
4279   sctx->sifted_states = sifted_sts;
4280   sctx->limited_states = limited_sts;
4281   sctx->last_node = last_node;
4282   sctx->last_str_idx = last_str_idx;
4283   re_node_set_init_empty (&sctx->limits);
4284 }