posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static void match_ctx_free_subtops (re_match_context_t *mctx)
  26      internal_function;
  27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  28                                           int str_idx, int from, int to)
  29      internal_function;
  30 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  31      internal_function;
  32 static void match_ctx_clear_flag (re_match_context_t *mctx) internal_function;
  33 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  34                                            int str_idx) internal_function;
  35 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  36                                                    int node, int str_idx)
  37      internal_function;
  38 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  39                            re_dfastate_t **limited_sts, int last_node,
  40                            int last_str_idx, int check_subexp)
  41      internal_function;
  42 static reg_errcode_t re_search_internal (const regex_t *preg,
  43                                          const char *string, int length,
  44                                          int start, int range, int stop,
  45                                          size_t nmatch, regmatch_t pmatch[],
  46                                          int eflags) internal_function;
  47 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  48                              const char *string1, int length1,
  49                              const char *string2, int length2,
  50                              int start, int range, struct re_registers *regs,
  51                              int stop, int ret_len) internal_function;
  52 static int re_search_stub (struct re_pattern_buffer *bufp,
  53                            const char *string, int length, int start,
  54                            int range, int stop, struct re_registers *regs,
  55                            int ret_len) internal_function;
  56 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  57                               int nregs, int regs_allocated) internal_function;
  58 static inline re_dfastate_t *acquire_init_state_context
  59      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  60      __attribute ((always_inline)) internal_function;
  61 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  62      internal_function;
  63 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  64                            int *p_match_first)
  65      internal_function;
  66 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  67                                     unsigned int context) internal_function;
  68 static int check_halt_state_context (const re_match_context_t *mctx,
  69                                      const re_dfastate_t *state, int idx)
  70      internal_function;
  71 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  72                          regmatch_t *prev_idx_match, int cur_node,
  73                          int cur_idx, int nmatch) internal_function;
  74 static int proceed_next_node (const re_match_context_t *mctx,
  75                               int nregs, regmatch_t *regs,
  76                               int *pidx, int node, re_node_set *eps_via_nodes,
  77                               struct re_fail_stack_t *fs) internal_function;
  78 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  79                                       int str_idx, int *dests, int nregs,
  80                                       regmatch_t *regs,
  81                                       re_node_set *eps_via_nodes) internal_function;
  82 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  83                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  84 static reg_errcode_t set_regs (const regex_t *preg,
  85                                const re_match_context_t *mctx,
  86                                size_t nmatch, regmatch_t *pmatch,
  87                                int fl_backtrack) internal_function;
  88 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  89
  90 #ifdef RE_ENABLE_I18N
  91 static int sift_states_iter_mb (const re_match_context_t *mctx,
  92                                 re_sift_context_t *sctx,
  93                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  94 #endif /* RE_ENABLE_I18N */
  95 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  96                                            re_sift_context_t *sctx) internal_function;
  97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  98                                               re_sift_context_t *sctx,
  99                                               int str_idx,
 100                                               re_node_set *dest_nodes) internal_function;
 101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 102                                             re_node_set *dest_nodes,
 103                                             const re_node_set *candidates) internal_function;
 104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 105                                             re_node_set *dest_nodes,
 106                                             const re_node_set *and_nodes) internal_function;
 107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 108                              int dst_node, int dst_idx, int src_node,
 109                              int src_idx) internal_function;
 110 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 111                                       int limit, re_node_set *eclosures,
 112                                       int subexp_idx, int node, int str_idx) internal_function;
 113 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 114                                           re_node_set *dest_nodes,
 115                                           const re_node_set *candidates,
 116                                           re_node_set *limits,
 117                                           struct re_backref_cache_entry *bkref_ents,
 118                                           int str_idx) internal_function;
 119 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 120                                         re_sift_context_t *sctx,
 121                                         int str_idx, re_node_set *dest_nodes) internal_function;
 122 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 123                                                 int next_state_log_idx) internal_function;
 124 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 125                                         re_dfastate_t **src, int num) internal_function;
 126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 127                                          re_match_context_t *mctx) internal_function;
 128 static re_dfastate_t *transit_state (reg_errcode_t *err,
 129                                      re_match_context_t *mctx,
 130                                      re_dfastate_t *state) internal_function;
 131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 132                                             re_match_context_t *mctx,
 133                                             re_dfastate_t *next_state) internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate) internal_function;
 141 #endif
 142 #ifdef RE_ENABLE_I18N
 143 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 144                                        re_dfastate_t *pstate) internal_function;
 145 #endif /* RE_ENABLE_I18N */
 146 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 147                                           const re_node_set *nodes) internal_function;
 148 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 149                                  int bkref_node, int bkref_str_idx) internal_function;
 150 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 151                                      const re_sub_match_top_t *sub_top,
 152                                      re_sub_match_last_t *sub_last,
 153                                      int bkref_node, int bkref_str) internal_function;
 154 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 155                              int subexp_idx, int type) internal_function;
 156 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 157                                     state_array_t *path, int top_node,
 158                                     int top_str, int last_node, int last_str,
 159                                     int type) internal_function;
 160 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 161                                                    int str_idx,
 162                                                    re_node_set *cur_nodes,
 163                                                    re_node_set *next_nodes) internal_function;
 164 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 165                                                re_node_set *cur_nodes,
 166                                                int ex_subexp, int type) internal_function;
 167 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 168                                                    re_node_set *dst_nodes,
 169                                                    int target, int ex_subexp,
 170                                                    int type) internal_function;
 171 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 172                                          re_node_set *cur_nodes, int cur_str,
 173                                          int last_str, int subexp_num,
 174                                          int type) internal_function;
 175 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
 176                                       re_dfastate_t *state) internal_function;
 177 #ifdef RE_ENABLE_I18N
 178 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 179                                     const re_string_t *input, int idx) internal_function;
 180 # ifdef _LIBC
 181 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 182                                                    size_t name_len) internal_function;
 183 # endif /* _LIBC */
 184 #endif /* RE_ENABLE_I18N */
 185 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 186                                        const re_dfastate_t *state,
 187                                        re_node_set *states_node,
 188                                        bitset *states_ch) internal_function;
 189 static int check_node_accept (const re_match_context_t *mctx,
 190                               const re_token_t *node, int idx) internal_function;
 191 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 192 \f
 193 /* Entry point for POSIX code.  */
 194
 195 /* regexec searches for a given pattern, specified by PREG, in the
 196    string STRING.
 197
 198    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 199    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 200    least NMATCH elements, and we set them to the offsets of the
 201    corresponding matched substrings.
 202
 203    EFLAGS specifies `execution flags' which affect matching: if
 204    REG_NOTBOL is set, then ^ does not match at the beginning of the
 205    string; if REG_NOTEOL is set, then $ does not match at the end.
 206
 207    We return 0 if we find a match and REG_NOMATCH if not.  */
 208
 209 int
 210 regexec (preg, string, nmatch, pmatch, eflags)
 211     const regex_t *__restrict preg;
 212     const char *__restrict string;
 213     size_t nmatch;
 214     regmatch_t pmatch[];
 215     int eflags;
 216 {
 217   reg_errcode_t err;
 218   int start, length;
 219   if (eflags & REG_STARTEND)
 220     {
 221       start = pmatch[0].rm_so;
 222       length = pmatch[0].rm_eo;
 223     }
 224   else
 225     {
 226       start = 0;
 227       length = strlen (string);
 228     }
 229   if (preg->no_sub)
 230     err = re_search_internal (preg, string, length, start, length - start,
 231                               length, 0, NULL, eflags);
 232   else
 233     err = re_search_internal (preg, string, length, start, length - start,
 234                               length, nmatch, pmatch, eflags);
 235   return err != REG_NOERROR;
 236 }
 237 #ifdef _LIBC
 238 weak_alias (__regexec, regexec)
 239 #endif
 240
 241 /* Entry points for GNU code.  */
 242
 243 /* re_match, re_search, re_match_2, re_search_2
 244
 245    The former two functions operate on STRING with length LENGTH,
 246    while the later two operate on concatenation of STRING1 and STRING2
 247    with lengths LENGTH1 and LENGTH2, respectively.
 248
 249    re_match() matches the compiled pattern in BUFP against the string,
 250    starting at index START.
 251
 252    re_search() first tries matching at index START, then it tries to match
 253    starting from index START + 1, and so on.  The last start position tried
 254    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 255    way as re_match().)
 256
 257    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 258    the first STOP characters of the concatenation of the strings should be
 259    concerned.
 260
 261    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 262    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 263    computed relative to the concatenation, not relative to the individual
 264    strings.)
 265
 266    On success, re_match* functions return the length of the match, re_search*
 267    return the position of the start of the match.  Return value -1 means no
 268    match was found and -2 indicates an internal error.  */
 269
 270 int
 271 re_match (bufp, string, length, start, regs)
 272     struct re_pattern_buffer *bufp;
 273     const char *string;
 274     int length, start;
 275     struct re_registers *regs;
 276 {
 277   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 278 }
 279 #ifdef _LIBC
 280 weak_alias (__re_match, re_match)
 281 #endif
 282
 283 int
 284 re_search (bufp, string, length, start, range, regs)
 285     struct re_pattern_buffer *bufp;
 286     const char *string;
 287     int length, start, range;
 288     struct re_registers *regs;
 289 {
 290   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 291 }
 292 #ifdef _LIBC
 293 weak_alias (__re_search, re_search)
 294 #endif
 295
 296 int
 297 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 298     struct re_pattern_buffer *bufp;
 299     const char *string1, *string2;
 300     int length1, length2, start, stop;
 301     struct re_registers *regs;
 302 {
 303   return re_search_2_stub (bufp, string1, length1, string2, length2,
 304                            start, 0, regs, stop, 1);
 305 }
 306 #ifdef _LIBC
 307 weak_alias (__re_match_2, re_match_2)
 308 #endif
 309
 310 int
 311 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 312     struct re_pattern_buffer *bufp;
 313     const char *string1, *string2;
 314     int length1, length2, start, range, stop;
 315     struct re_registers *regs;
 316 {
 317   return re_search_2_stub (bufp, string1, length1, string2, length2,
 318                            start, range, regs, stop, 0);
 319 }
 320 #ifdef _LIBC
 321 weak_alias (__re_search_2, re_search_2)
 322 #endif
 323
 324 static int
 325 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 326                   stop, ret_len)
 327     struct re_pattern_buffer *bufp;
 328     const char *string1, *string2;
 329     int length1, length2, start, range, stop, ret_len;
 330     struct re_registers *regs;
 331 {
 332   const char *str;
 333   int rval;
 334   int len = length1 + length2;
 335   int free_str = 0;
 336
 337   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 338     return -2;
 339
 340   /* Concatenate the strings.  */
 341   if (length2 > 0)
 342     if (length1 > 0)
 343       {
 344         char *s = re_malloc (char, len);
 345
 346         if (BE (s == NULL, 0))
 347           return -2;
 348         memcpy (s, string1, length1);
 349         memcpy (s + length1, string2, length2);
 350         str = s;
 351         free_str = 1;
 352       }
 353     else
 354       str = string2;
 355   else
 356     str = string1;
 357
 358   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 359                          ret_len);
 360   if (free_str)
 361     re_free ((char *) str);
 362   return rval;
 363 }
 364
 365 /* The parameters have the same meaning as those of re_search.
 366    Additional parameters:
 367    If RET_LEN is nonzero the length of the match is returned (re_match style);
 368    otherwise the position of the match is returned.  */
 369
 370 static int
 371 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 372     struct re_pattern_buffer *bufp;
 373     const char *string;
 374     int length, start, range, stop, ret_len;
 375     struct re_registers *regs;
 376 {
 377   reg_errcode_t result;
 378   regmatch_t *pmatch;
 379   int nregs, rval;
 380   int eflags = 0;
 381
 382   /* Check for out-of-range.  */
 383   if (BE (start < 0 || start > length, 0))
 384     return -1;
 385   if (BE (start + range > length, 0))
 386     range = length - start;
 387   else if (BE (start + range < 0, 0))
 388     range = -start;
 389
 390   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 391   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 392
 393   /* Compile fastmap if we haven't yet.  */
 394   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 395     re_compile_fastmap (bufp);
 396
 397   if (BE (bufp->no_sub, 0))
 398     regs = NULL;
 399
 400   /* We need at least 1 register.  */
 401   if (regs == NULL)
 402     nregs = 1;
 403   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 404                regs->num_regs < bufp->re_nsub + 1, 0))
 405     {
 406       nregs = regs->num_regs;
 407       if (BE (nregs < 1, 0))
 408         {
 409           /* Nothing can be copied to regs.  */
 410           regs = NULL;
 411           nregs = 1;
 412         }
 413     }
 414   else
 415     nregs = bufp->re_nsub + 1;
 416   pmatch = re_malloc (regmatch_t, nregs);
 417   if (BE (pmatch == NULL, 0))
 418     return -2;
 419
 420   result = re_search_internal (bufp, string, length, start, range, stop,
 421                                nregs, pmatch, eflags);
 422
 423   rval = 0;
 424
 425   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 426   if (result != REG_NOERROR)
 427     rval = -1;
 428   else if (regs != NULL)
 429     {
 430       /* If caller wants register contents data back, copy them.  */
 431       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 432                                            bufp->regs_allocated);
 433       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 434         rval = -2;
 435     }
 436
 437   if (BE (rval == 0, 1))
 438     {
 439       if (ret_len)
 440         {
 441           assert (pmatch[0].rm_so == start);
 442           rval = pmatch[0].rm_eo - start;
 443         }
 444       else
 445         rval = pmatch[0].rm_so;
 446     }
 447   re_free (pmatch);
 448   return rval;
 449 }
 450
 451 static unsigned
 452 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 453     struct re_registers *regs;
 454     regmatch_t *pmatch;
 455     int nregs, regs_allocated;
 456 {
 457   int rval = REGS_REALLOCATE;
 458   int i;
 459   int need_regs = nregs + 1;
 460   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 461      uses.  */
 462
 463   /* Have the register data arrays been allocated?  */
 464   if (regs_allocated == REGS_UNALLOCATED)
 465     { /* No.  So allocate them with malloc.  */
 466       regs->start = re_malloc (regoff_t, need_regs);
 467       regs->end = re_malloc (regoff_t, need_regs);
 468       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 469         return REGS_UNALLOCATED;
 470       regs->num_regs = need_regs;
 471     }
 472   else if (regs_allocated == REGS_REALLOCATE)
 473     { /* Yes.  If we need more elements than were already
 474          allocated, reallocate them.  If we need fewer, just
 475          leave it alone.  */
 476       if (BE (need_regs > regs->num_regs, 0))
 477         {
 478           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 479           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 480           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 481             return REGS_UNALLOCATED;
 482           regs->start = new_start;
 483           regs->end = new_end;
 484           regs->num_regs = need_regs;
 485         }
 486     }
 487   else
 488     {
 489       assert (regs_allocated == REGS_FIXED);
 490       /* This function may not be called with REGS_FIXED and nregs too big.  */
 491       assert (regs->num_regs >= nregs);
 492       rval = REGS_FIXED;
 493     }
 494
 495   /* Copy the regs.  */
 496   for (i = 0; i < nregs; ++i)
 497     {
 498       regs->start[i] = pmatch[i].rm_so;
 499       regs->end[i] = pmatch[i].rm_eo;
 500     }
 501   for ( ; i < regs->num_regs; ++i)
 502     regs->start[i] = regs->end[i] = -1;
 503
 504   return rval;
 505 }
 506
 507 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 508    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 509    this memory for recording register information.  STARTS and ENDS
 510    must be allocated using the malloc library routine, and must each
 511    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 512
 513    If NUM_REGS == 0, then subsequent matches should allocate their own
 514    register data.
 515
 516    Unless this function is called, the first search or match using
 517    PATTERN_BUFFER will allocate its own register data, without
 518    freeing the old data.  */
 519
 520 void
 521 re_set_registers (bufp, regs, num_regs, starts, ends)
 522     struct re_pattern_buffer *bufp;
 523     struct re_registers *regs;
 524     unsigned num_regs;
 525     regoff_t *starts, *ends;
 526 {
 527   if (num_regs)
 528     {
 529       bufp->regs_allocated = REGS_REALLOCATE;
 530       regs->num_regs = num_regs;
 531       regs->start = starts;
 532       regs->end = ends;
 533     }
 534   else
 535     {
 536       bufp->regs_allocated = REGS_UNALLOCATED;
 537       regs->num_regs = 0;
 538       regs->start = regs->end = (regoff_t *) 0;
 539     }
 540 }
 541 #ifdef _LIBC
 542 weak_alias (__re_set_registers, re_set_registers)
 543 #endif
 544 \f
 545 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 546    them unless specifically requested.  */
 547
 548 #if defined _REGEX_RE_COMP || defined _LIBC
 549 int
 550 # ifdef _LIBC
 551 weak_function
 552 # endif
 553 re_exec (s)
 554      const char *s;
 555 {
 556   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 557 }
 558 #endif /* _REGEX_RE_COMP */
 559 \f
 560 static re_node_set empty_set;
 561
 562 /* Internal entry point.  */
 563
 564 /* Searches for a compiled pattern PREG in the string STRING, whose
 565    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 566    mingings with regexec.  START, and RANGE have the same meanings
 567    with re_search.
 568    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 569    otherwise return the error code.
 570    Note: We assume front end functions already check ranges.
 571    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 572
 573 static reg_errcode_t
 574 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 575                     eflags)
 576     const regex_t *preg;
 577     const char *string;
 578     int length, start, range, stop, eflags;
 579     size_t nmatch;
 580     regmatch_t pmatch[];
 581 {
 582   reg_errcode_t err;
 583   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 584   int left_lim, right_lim, incr;
 585   int fl_longest_match, match_first, match_last = -1;
 586   int fast_translate, sb;
 587 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 588   re_match_context_t mctx = { .dfa = dfa };
 589 #else
 590   re_match_context_t mctx;
 591 #endif
 592   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
 593                     && range && !preg->can_be_null) ? preg->fastmap : NULL);
 594
 595 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 596   memset (&mctx, '\0', sizeof (re_match_context_t));
 597   mctx.dfa = dfa;
 598 #endif
 599
 600   /* Check if the DFA haven't been compiled.  */
 601   if (BE (preg->used == 0 || dfa->init_state == NULL
 602           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 603           || dfa->init_state_begbuf == NULL, 0))
 604     return REG_NOMATCH;
 605
 606 #ifdef DEBUG
 607   /* We assume front-end functions already check them.  */
 608   assert (start + range >= 0 && start + range <= length);
 609 #endif
 610
 611   /* If initial states with non-begbuf contexts have no elements,
 612      the regex must be anchored.  If preg->newline_anchor is set,
 613      we'll never use init_state_nl, so do not check it.  */
 614   if (dfa->init_state->nodes.nelem == 0
 615       && dfa->init_state_word->nodes.nelem == 0
 616       && (dfa->init_state_nl->nodes.nelem == 0
 617           || !preg->newline_anchor))
 618     {
 619       if (start != 0 && start + range != 0)
 620         return REG_NOMATCH;
 621       start = range = 0;
 622     }
 623
 624   re_node_set_init_empty (&empty_set);
 625
 626   /* We must check the longest matching, if nmatch > 0.  */
 627   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 628
 629   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 630                             preg->translate, preg->syntax & RE_ICASE, dfa);
 631   if (BE (err != REG_NOERROR, 0))
 632     goto free_return;
 633   mctx.input.stop = stop;
 634   mctx.input.raw_stop = stop;
 635   mctx.input.newline_anchor = preg->newline_anchor;
 636
 637   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 638   if (BE (err != REG_NOERROR, 0))
 639     goto free_return;
 640
 641   /* We will log all the DFA states through which the dfa pass,
 642      if nmatch > 1, or this dfa has "multibyte node", which is a
 643      back-reference or a node which can accept multibyte character or
 644      multi character collating element.  */
 645   if (nmatch > 1 || dfa->has_mb_node)
 646     {
 647       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 648       if (BE (mctx.state_log == NULL, 0))
 649         {
 650           err = REG_ESPACE;
 651           goto free_return;
 652         }
 653     }
 654   else
 655     mctx.state_log = NULL;
 656
 657   match_first = start;
 658   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 659                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 660
 661   /* Check incrementally whether of not the input string match.  */
 662   incr = (range < 0) ? -1 : 1;
 663   left_lim = (range < 0) ? start + range : start;
 664   right_lim = (range < 0) ? start : start + range;
 665   sb = dfa->mb_cur_max == 1;
 666   fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
 667
 668   for (;;)
 669     {
 670       /* At first get the current byte from input string.  */
 671       if (fastmap)
 672         {
 673           if (BE (fast_translate, 1))
 674             {
 675               unsigned RE_TRANSLATE_TYPE t
 676                 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 677               if (BE (range >= 0, 1))
 678                 {
 679                   if (BE (t != NULL, 0))
 680                     {
 681                       while (BE (match_first < right_lim, 1)
 682                              && !fastmap[t[(unsigned char) string[match_first]]])
 683                         ++match_first;
 684                     }
 685                   else
 686                     {
 687                       while (BE (match_first < right_lim, 1)
 688                              && !fastmap[(unsigned char) string[match_first]])
 689                         ++match_first;
 690                     }
 691                   if (BE (match_first == right_lim, 0))
 692                     {
 693                       int ch = match_first >= length
 694                                ? 0 : (unsigned char) string[match_first];
 695                       if (!fastmap[t ? t[ch] : ch])
 696                         break;
 697                     }
 698                 }
 699               else
 700                 {
 701                   while (match_first >= left_lim)
 702                     {
 703                       int ch = match_first >= length
 704                                ? 0 : (unsigned char) string[match_first];
 705                       if (fastmap[t ? t[ch] : ch])
 706                         break;
 707                       --match_first;
 708                     }
 709                   if (match_first < left_lim)
 710                     break;
 711                 }
 712             }
 713           else
 714             {
 715               int ch;
 716
 717               do
 718                 {
 719                   /* In this case, we can't determine easily the current byte,
 720                      since it might be a component byte of a multibyte
 721                      character.  Then we use the constructed buffer
 722                      instead.  */
 723                   /* If MATCH_FIRST is out of the valid range, reconstruct the
 724                      buffers.  */
 725                   if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
 726                       <= match_first
 727                       || match_first < mctx.input.raw_mbs_idx)
 728                     {
 729                       err = re_string_reconstruct (&mctx.input, match_first,
 730                                                    eflags);
 731                       if (BE (err != REG_NOERROR, 0))
 732                         goto free_return;
 733                     }
 734                   /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 735                      Note that MATCH_FIRST must not be smaller than 0.  */
 736                   ch = ((match_first >= length) ? 0
 737                        : re_string_byte_at (&mctx.input,
 738                                             match_first
 739                                             - mctx.input.raw_mbs_idx));
 740                   if (fastmap[ch])
 741                     break;
 742                   match_first += incr;
 743                 }
 744               while (match_first >= left_lim && match_first <= right_lim);
 745               if (! fastmap[ch])
 746                 break;
 747             }
 748         }
 749
 750       /* Reconstruct the buffers so that the matcher can assume that
 751          the matching starts from the beginning of the buffer.  */
 752       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 753       if (BE (err != REG_NOERROR, 0))
 754         goto free_return;
 755 #ifdef RE_ENABLE_I18N
 756      /* Eliminate it when it is a component of a multibyte character
 757          and isn't the head of a multibyte character.  */
 758       if (sb || re_string_first_byte (&mctx.input, 0))
 759 #endif
 760         {
 761           /* It seems to be appropriate one, then use the matcher.  */
 762           /* We assume that the matching starts from 0.  */
 763           mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 764           match_last = check_matching (&mctx, fl_longest_match,
 765                                        range >= 0 ? &match_first : NULL);
 766           if (match_last != -1)
 767             {
 768               if (BE (match_last == -2, 0))
 769                 {
 770                   err = REG_ESPACE;
 771                   goto free_return;
 772                 }
 773               else
 774                 {
 775                   mctx.match_last = match_last;
 776                   if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 777                     {
 778                       re_dfastate_t *pstate = mctx.state_log[match_last];
 779                       mctx.last_node = check_halt_state_context (&mctx, pstate,
 780                                                                  match_last);
 781                     }
 782                   if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 783                       || dfa->nbackref)
 784                     {
 785                       err = prune_impossible_nodes (&mctx);
 786                       if (err == REG_NOERROR)
 787                         break;
 788                       if (BE (err != REG_NOMATCH, 0))
 789                         goto free_return;
 790                       match_last = -1;
 791                     }
 792                   else
 793                     break; /* We found a match.  */
 794                 }
 795             }
 796           match_ctx_clean (&mctx);
 797         }
 798       /* Update counter.  */
 799       match_first += incr;
 800       if (match_first < left_lim || right_lim < match_first)
 801         break;
 802     }
 803
 804   /* Set pmatch[] if we need.  */
 805   if (match_last != -1 && nmatch > 0)
 806     {
 807       int reg_idx;
 808
 809       /* Initialize registers.  */
 810       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 811         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 812
 813       /* Set the points where matching start/end.  */
 814       pmatch[0].rm_so = 0;
 815       pmatch[0].rm_eo = mctx.match_last;
 816
 817       if (!preg->no_sub && nmatch > 1)
 818         {
 819           err = set_regs (preg, &mctx, nmatch, pmatch,
 820                           dfa->has_plural_match && dfa->nbackref > 0);
 821           if (BE (err != REG_NOERROR, 0))
 822             goto free_return;
 823         }
 824
 825       /* At last, add the offset to the each registers, since we slided
 826          the buffers so that we could assume that the matching starts
 827          from 0.  */
 828       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 829         if (pmatch[reg_idx].rm_so != -1)
 830           {
 831 #ifdef RE_ENABLE_I18N
 832             if (BE (mctx.input.offsets_needed != 0, 0))
 833               {
 834                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 835                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 836                 else
 837                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 838                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 839                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 840                 else
 841                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 842               }
 843 #else
 844             assert (mctx.input.offsets_needed == 0);
 845 #endif
 846             pmatch[reg_idx].rm_so += match_first;
 847             pmatch[reg_idx].rm_eo += match_first;
 848           }
 849     }
 850   err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
 851  free_return:
 852   re_free (mctx.state_log);
 853   if (dfa->nbackref)
 854     match_ctx_free (&mctx);
 855   re_string_destruct (&mctx.input);
 856   return err;
 857 }
 858
 859 static reg_errcode_t
 860 prune_impossible_nodes (mctx)
 861      re_match_context_t *mctx;
 862 {
 863   re_dfa_t *const dfa = mctx->dfa;
 864   int halt_node, match_last;
 865   reg_errcode_t ret;
 866   re_dfastate_t **sifted_states;
 867   re_dfastate_t **lim_states = NULL;
 868   re_sift_context_t sctx;
 869 #ifdef DEBUG
 870   assert (mctx->state_log != NULL);
 871 #endif
 872   match_last = mctx->match_last;
 873   halt_node = mctx->last_node;
 874   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 875   if (BE (sifted_states == NULL, 0))
 876     {
 877       ret = REG_ESPACE;
 878       goto free_return;
 879     }
 880   if (dfa->nbackref)
 881     {
 882       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 883       if (BE (lim_states == NULL, 0))
 884         {
 885           ret = REG_ESPACE;
 886           goto free_return;
 887         }
 888       while (1)
 889         {
 890           memset (lim_states, '\0',
 891                   sizeof (re_dfastate_t *) * (match_last + 1));
 892           match_ctx_clear_flag (mctx);
 893           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 894                          match_last, 0);
 895           ret = sift_states_backward (mctx, &sctx);
 896           re_node_set_free (&sctx.limits);
 897           if (BE (ret != REG_NOERROR, 0))
 898               goto free_return;
 899           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 900             break;
 901           do
 902             {
 903               --match_last;
 904               if (match_last < 0)
 905                 {
 906                   ret = REG_NOMATCH;
 907                   goto free_return;
 908                 }
 909             } while (mctx->state_log[match_last] == NULL
 910                      || !mctx->state_log[match_last]->halt);
 911           halt_node = check_halt_state_context (mctx,
 912                                                 mctx->state_log[match_last],
 913                                                 match_last);
 914         }
 915       ret = merge_state_array (dfa, sifted_states, lim_states,
 916                                match_last + 1);
 917       re_free (lim_states);
 918       lim_states = NULL;
 919       if (BE (ret != REG_NOERROR, 0))
 920         goto free_return;
 921     }
 922   else
 923     {
 924       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 925                      match_last, 0);
 926       ret = sift_states_backward (mctx, &sctx);
 927       re_node_set_free (&sctx.limits);
 928       if (BE (ret != REG_NOERROR, 0))
 929         goto free_return;
 930     }
 931   re_free (mctx->state_log);
 932   mctx->state_log = sifted_states;
 933   sifted_states = NULL;
 934   mctx->last_node = halt_node;
 935   mctx->match_last = match_last;
 936   ret = REG_NOERROR;
 937  free_return:
 938   re_free (sifted_states);
 939   re_free (lim_states);
 940   return ret;
 941 }
 942
 943 /* Acquire an initial state and return it.
 944    We must select appropriate initial state depending on the context,
 945    since initial states may have constraints like "\<", "^", etc..  */
 946
 947 static inline re_dfastate_t *
 948 acquire_init_state_context (err, mctx, idx)
 949      reg_errcode_t *err;
 950      const re_match_context_t *mctx;
 951      int idx;
 952 {
 953   re_dfa_t *const dfa = mctx->dfa;
 954   if (dfa->init_state->has_constraint)
 955     {
 956       unsigned int context;
 957       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 958       if (IS_WORD_CONTEXT (context))
 959         return dfa->init_state_word;
 960       else if (IS_ORDINARY_CONTEXT (context))
 961         return dfa->init_state;
 962       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
 963         return dfa->init_state_begbuf;
 964       else if (IS_NEWLINE_CONTEXT (context))
 965         return dfa->init_state_nl;
 966       else if (IS_BEGBUF_CONTEXT (context))
 967         {
 968           /* It is relatively rare case, then calculate on demand.  */
 969           return re_acquire_state_context (err, dfa,
 970                                            dfa->init_state->entrance_nodes,
 971                                            context);
 972         }
 973       else
 974         /* Must not happen?  */
 975         return dfa->init_state;
 976     }
 977   else
 978     return dfa->init_state;
 979 }
 980
 981 /* Check whether the regular expression match input string INPUT or not,
 982    and return the index where the matching end, return -1 if not match,
 983    or return -2 in case of an error.
 984    FL_LONGEST_MATCH means we want the POSIX longest matching.
 985    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
 986    next place where we may want to try matching.
 987    Note that the matcher assume that the maching starts from the current
 988    index of the buffer.  */
 989
 990 static int
 991 check_matching (mctx, fl_longest_match, p_match_first)
 992     re_match_context_t *mctx;
 993     int fl_longest_match;
 994     int *p_match_first;
 995 {
 996   re_dfa_t *const dfa = mctx->dfa;
 997   reg_errcode_t err;
 998   int match = 0;
 999   int match_last = -1;
1000   int cur_str_idx = re_string_cur_idx (&mctx->input);
1001   re_dfastate_t *cur_state;
1002   int at_init_state = p_match_first != NULL;
1003   int next_start_idx = cur_str_idx;
1004
1005   err = REG_NOERROR;
1006   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1007   /* An initial state must not be NULL (invalid).  */
1008   if (BE (cur_state == NULL, 0))
1009     {
1010       assert (err == REG_ESPACE);
1011       return -2;
1012     }
1013
1014   if (mctx->state_log != NULL)
1015     {
1016       mctx->state_log[cur_str_idx] = cur_state;
1017
1018       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1019          later.  E.g. Processing back references.  */
1020       if (BE (dfa->nbackref, 0))
1021         {
1022           at_init_state = 0;
1023           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1024           if (BE (err != REG_NOERROR, 0))
1025             return err;
1026
1027           if (cur_state->has_backref)
1028             {
1029               err = transit_state_bkref (mctx, &cur_state->nodes);
1030               if (BE (err != REG_NOERROR, 0))
1031                 return err;
1032             }
1033         }
1034     }
1035
1036   /* If the RE accepts NULL string.  */
1037   if (BE (cur_state->halt, 0))
1038     {
1039       if (!cur_state->has_constraint
1040           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1041         {
1042           if (!fl_longest_match)
1043             return cur_str_idx;
1044           else
1045             {
1046               match_last = cur_str_idx;
1047               match = 1;
1048             }
1049         }
1050     }
1051
1052   while (!re_string_eoi (&mctx->input))
1053     {
1054       re_dfastate_t *old_state = cur_state;
1055       cur_state = transit_state (&err, mctx, cur_state);
1056       if (mctx->state_log != NULL)
1057         cur_state = merge_state_with_log (&err, mctx, cur_state);
1058
1059       if (cur_state == NULL)
1060         {
1061           /* Reached the invalid state or an error.  Try to recover a valid
1062              state using the state log, if available and if we have not
1063              already found a valid (even if not the longest) match.  */
1064           if (BE (err != REG_NOERROR, 0))
1065             return -2;
1066
1067           if (mctx->state_log == NULL
1068               || (match && !fl_longest_match)
1069               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1070             break;
1071         }
1072
1073       if (at_init_state)
1074         {
1075           if (old_state == cur_state)
1076             next_start_idx = re_string_cur_idx (&mctx->input);
1077           else
1078             at_init_state = 0;
1079         }
1080
1081       if (cur_state->halt)
1082         {
1083           /* Reached a halt state.
1084              Check the halt state can satisfy the current context.  */
1085           if (!cur_state->has_constraint
1086               || check_halt_state_context (mctx, cur_state,
1087                                            re_string_cur_idx (&mctx->input)))
1088             {
1089               /* We found an appropriate halt state.  */
1090               match_last = re_string_cur_idx (&mctx->input);
1091               match = 1;
1092               if (!fl_longest_match)
1093                 break;
1094             }
1095         }
1096    }
1097
1098   if (match_last == -1 && p_match_first)
1099     *p_match_first += next_start_idx;
1100
1101   return match_last;
1102 }
1103
1104 /* Check NODE match the current context.  */
1105
1106 static int check_halt_node_context (dfa, node, context)
1107     const re_dfa_t *dfa;
1108     int node;
1109     unsigned int context;
1110 {
1111   re_token_type_t type = dfa->nodes[node].type;
1112   unsigned int constraint = dfa->nodes[node].constraint;
1113   if (type != END_OF_RE)
1114     return 0;
1115   if (!constraint)
1116     return 1;
1117   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1118     return 0;
1119   return 1;
1120 }
1121
1122 /* Check the halt state STATE match the current context.
1123    Return 0 if not match, if the node, STATE has, is a halt node and
1124    match the context, return the node.  */
1125
1126 static int
1127 check_halt_state_context (mctx, state, idx)
1128     const re_match_context_t *mctx;
1129     const re_dfastate_t *state;
1130     int idx;
1131 {
1132   int i;
1133   unsigned int context;
1134 #ifdef DEBUG
1135   assert (state->halt);
1136 #endif
1137   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1138   for (i = 0; i < state->nodes.nelem; ++i)
1139     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1140       return state->nodes.elems[i];
1141   return 0;
1142 }
1143
1144 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1145    corresponding to the DFA).
1146    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1147    of errors.  */
1148
1149 static int
1150 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1151     const re_match_context_t *mctx;
1152     regmatch_t *regs;
1153     int nregs, *pidx, node;
1154     re_node_set *eps_via_nodes;
1155     struct re_fail_stack_t *fs;
1156 {
1157   re_dfa_t *const dfa = mctx->dfa;
1158   int i, err, dest_node;
1159   dest_node = -1;
1160   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1161     {
1162       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1163       int ndest, dest_nodes[2];
1164       err = re_node_set_insert (eps_via_nodes, node);
1165       if (BE (err < 0, 0))
1166         return -2;
1167       /* Pick up valid destinations.  */
1168       for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1169         {
1170           int candidate = dfa->edests[node].elems[i];
1171           if (!re_node_set_contains (cur_nodes, candidate))
1172             continue;
1173           dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1174           dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1175           ++ndest;
1176         }
1177       if (ndest <= 1)
1178         return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1179       /* In order to avoid infinite loop like "(a*)*".  */
1180       if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1181         return dest_nodes[1];
1182       if (fs != NULL
1183           && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1184                               eps_via_nodes))
1185         return -2;
1186       return dest_nodes[0];
1187     }
1188   else
1189     {
1190       int naccepted = 0;
1191       re_token_type_t type = dfa->nodes[node].type;
1192
1193 #ifdef RE_ENABLE_I18N
1194       if (ACCEPT_MB_NODE (type))
1195         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1196       else
1197 #endif /* RE_ENABLE_I18N */
1198       if (type == OP_BACK_REF)
1199         {
1200           int subexp_idx = dfa->nodes[node].opr.idx;
1201           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1202           if (fs != NULL)
1203             {
1204               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1205                 return -1;
1206               else if (naccepted)
1207                 {
1208                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1209                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1210                               naccepted) != 0)
1211                     return -1;
1212                 }
1213             }
1214
1215           if (naccepted == 0)
1216             {
1217               err = re_node_set_insert (eps_via_nodes, node);
1218               if (BE (err < 0, 0))
1219                 return -2;
1220               dest_node = dfa->edests[node].elems[0];
1221               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1222                                         dest_node))
1223                 return dest_node;
1224             }
1225         }
1226
1227       if (naccepted != 0
1228           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1229         {
1230           dest_node = dfa->nexts[node];
1231           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1232           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1233                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1234                                                dest_node)))
1235             return -1;
1236           re_node_set_empty (eps_via_nodes);
1237           return dest_node;
1238         }
1239     }
1240   return -1;
1241 }
1242
1243 static reg_errcode_t
1244 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1245      struct re_fail_stack_t *fs;
1246      int str_idx, *dests, nregs;
1247      regmatch_t *regs;
1248      re_node_set *eps_via_nodes;
1249 {
1250   reg_errcode_t err;
1251   int num = fs->num++;
1252   if (fs->num == fs->alloc)
1253     {
1254       struct re_fail_stack_ent_t *new_array;
1255       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1256                                        * fs->alloc * 2));
1257       if (new_array == NULL)
1258         return REG_ESPACE;
1259       fs->alloc *= 2;
1260       fs->stack = new_array;
1261     }
1262   fs->stack[num].idx = str_idx;
1263   fs->stack[num].node = dests[1];
1264   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1265   if (fs->stack[num].regs == NULL)
1266     return REG_ESPACE;
1267   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1268   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1269   return err;
1270 }
1271
1272 static int
1273 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1274      struct re_fail_stack_t *fs;
1275      int *pidx, nregs;
1276      regmatch_t *regs;
1277      re_node_set *eps_via_nodes;
1278 {
1279   int num = --fs->num;
1280   assert (num >= 0);
1281   *pidx = fs->stack[num].idx;
1282   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1283   re_node_set_free (eps_via_nodes);
1284   re_free (fs->stack[num].regs);
1285   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1286   return fs->stack[num].node;
1287 }
1288
1289 /* Set the positions where the subexpressions are starts/ends to registers
1290    PMATCH.
1291    Note: We assume that pmatch[0] is already set, and
1292    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1293
1294 static reg_errcode_t
1295 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1296      const regex_t *preg;
1297      const re_match_context_t *mctx;
1298      size_t nmatch;
1299      regmatch_t *pmatch;
1300      int fl_backtrack;
1301 {
1302   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1303   int idx, cur_node, real_nmatch;
1304   re_node_set eps_via_nodes;
1305   struct re_fail_stack_t *fs;
1306   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1307   regmatch_t *prev_idx_match;
1308
1309 #ifdef DEBUG
1310   assert (nmatch > 1);
1311   assert (mctx->state_log != NULL);
1312 #endif
1313   if (fl_backtrack)
1314     {
1315       fs = &fs_body;
1316       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1317       if (fs->stack == NULL)
1318         return REG_ESPACE;
1319     }
1320   else
1321     fs = NULL;
1322
1323   cur_node = dfa->init_node;
1324   real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1325   re_node_set_init_empty (&eps_via_nodes);
1326
1327   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1328   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1329
1330   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1331     {
1332       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1333
1334       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1335         {
1336           int reg_idx;
1337           if (fs)
1338             {
1339               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1340                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1341                   break;
1342               if (reg_idx == nmatch)
1343                 {
1344                   re_node_set_free (&eps_via_nodes);
1345                   return free_fail_stack_return (fs);
1346                 }
1347               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1348                                          &eps_via_nodes);
1349             }
1350           else
1351             {
1352               re_node_set_free (&eps_via_nodes);
1353               return REG_NOERROR;
1354             }
1355         }
1356
1357       /* Proceed to next node.  */
1358       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1359                                     &eps_via_nodes, fs);
1360
1361       if (BE (cur_node < 0, 0))
1362         {
1363           if (BE (cur_node == -2, 0))
1364             {
1365               re_node_set_free (&eps_via_nodes);
1366               free_fail_stack_return (fs);
1367               return REG_ESPACE;
1368             }
1369           if (fs)
1370             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1371                                        &eps_via_nodes);
1372           else
1373             {
1374               re_node_set_free (&eps_via_nodes);
1375               return REG_NOMATCH;
1376             }
1377         }
1378     }
1379   re_node_set_free (&eps_via_nodes);
1380   return free_fail_stack_return (fs);
1381 }
1382
1383 static reg_errcode_t
1384 free_fail_stack_return (fs)
1385      struct re_fail_stack_t *fs;
1386 {
1387   if (fs)
1388     {
1389       int fs_idx;
1390       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1391         {
1392           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1393           re_free (fs->stack[fs_idx].regs);
1394         }
1395       re_free (fs->stack);
1396     }
1397   return REG_NOERROR;
1398 }
1399
1400 static void
1401 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1402      re_dfa_t *dfa;
1403      regmatch_t *pmatch, *prev_idx_match;
1404      int cur_node, cur_idx, nmatch;
1405 {
1406   int type = dfa->nodes[cur_node].type;
1407   if (type == OP_OPEN_SUBEXP)
1408     {
1409       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1410
1411       /* We are at the first node of this sub expression.  */
1412       if (reg_num < nmatch)
1413         {
1414           pmatch[reg_num].rm_so = cur_idx;
1415           pmatch[reg_num].rm_eo = -1;
1416         }
1417     }
1418   else if (type == OP_CLOSE_SUBEXP)
1419     {
1420       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1421       if (reg_num < nmatch)
1422         {
1423           /* We are at the last node of this sub expression.  */
1424           if (pmatch[reg_num].rm_so < cur_idx)
1425             {
1426               pmatch[reg_num].rm_eo = cur_idx;
1427               /* This is a non-empty match or we are not inside an optional
1428                  subexpression.  Accept this right away.  */
1429               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1430             }
1431           else
1432             {
1433               if (dfa->nodes[cur_node].opt_subexp
1434                   && prev_idx_match[reg_num].rm_so != -1)
1435                 /* We transited through an empty match for an optional
1436                    subexpression, like (a?)*, and this is not the subexp's
1437                    first match.  Copy back the old content of the registers
1438                    so that matches of an inner subexpression are undone as
1439                    well, like in ((a?))*.  */
1440                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1441               else
1442                 /* We completed a subexpression, but it may be part of
1443                    an optional one, so do not update PREV_IDX_MATCH.  */
1444                 pmatch[reg_num].rm_eo = cur_idx;
1445             }
1446         }
1447     }
1448 }
1449
1450 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1451    and sift the nodes in each states according to the following rules.
1452    Updated state_log will be wrote to STATE_LOG.
1453
1454    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1455      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1456         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1457         the LAST_NODE, we throw away the node `a'.
1458      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1459         string `s' and transit to `b':
1460         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1461            away the node `a'.
1462         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1463             thrown away, we throw away the node `a'.
1464      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1465         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1466            node `a'.
1467         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1468             we throw away the node `a'.  */
1469
1470 #define STATE_NODE_CONTAINS(state,node) \
1471   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1472
1473 static reg_errcode_t
1474 sift_states_backward (mctx, sctx)
1475      re_match_context_t *mctx;
1476      re_sift_context_t *sctx;
1477 {
1478   re_dfa_t *const dfa = mctx->dfa;
1479   reg_errcode_t err;
1480   int null_cnt = 0;
1481   int str_idx = sctx->last_str_idx;
1482   re_node_set cur_dest;
1483   re_node_set *cur_src; /* Points the state_log[str_idx]->nodes  */
1484
1485 #ifdef DEBUG
1486   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1487 #endif
1488   cur_src = &mctx->state_log[str_idx]->nodes;
1489
1490   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1491      transit to the last_node and the last_node itself.  */
1492   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1493   if (BE (err != REG_NOERROR, 0))
1494     return err;
1495   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1496   if (BE (err != REG_NOERROR, 0))
1497     goto free_return;
1498
1499   /* Then check each states in the state_log.  */
1500   while (str_idx > 0)
1501     {
1502       int i, ret;
1503       /* Update counters.  */
1504       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1505       if (null_cnt > mctx->max_mb_elem_len)
1506         {
1507           memset (sctx->sifted_states, '\0',
1508                   sizeof (re_dfastate_t *) * str_idx);
1509           re_node_set_free (&cur_dest);
1510           return REG_NOERROR;
1511         }
1512       re_node_set_empty (&cur_dest);
1513       --str_idx;
1514       cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1515                  : &mctx->state_log[str_idx]->nodes);
1516
1517       /* Then build the next sifted state.
1518          We build the next sifted state on `cur_dest', and update
1519          `sifted_states[str_idx]' with `cur_dest'.
1520          Note:
1521          `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1522          `cur_src' points the node_set of the old `state_log[str_idx]'.  */
1523       for (i = 0; i < cur_src->nelem; i++)
1524         {
1525           int prev_node = cur_src->elems[i];
1526           int naccepted = 0;
1527           re_token_type_t type = dfa->nodes[prev_node].type;
1528
1529           if (IS_EPSILON_NODE (type))
1530             continue;
1531 #ifdef RE_ENABLE_I18N
1532           /* If the node may accept `multi byte'.  */
1533           if (ACCEPT_MB_NODE (type))
1534             naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1535                                              str_idx, sctx->last_str_idx);
1536
1537 #endif /* RE_ENABLE_I18N */
1538           /* We don't check backreferences here.
1539              See update_cur_sifted_state().  */
1540
1541           if (!naccepted
1542               && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1543               && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1544                                       dfa->nexts[prev_node]))
1545             naccepted = 1;
1546
1547           if (naccepted == 0)
1548             continue;
1549
1550           if (sctx->limits.nelem)
1551             {
1552               int to_idx = str_idx + naccepted;
1553               if (check_dst_limits (mctx, &sctx->limits,
1554                                     dfa->nexts[prev_node], to_idx,
1555                                     prev_node, str_idx))
1556                 continue;
1557             }
1558           ret = re_node_set_insert (&cur_dest, prev_node);
1559           if (BE (ret == -1, 0))
1560             {
1561               err = REG_ESPACE;
1562               goto free_return;
1563             }
1564         }
1565
1566       /* Add all the nodes which satisfy the following conditions:
1567          - It can epsilon transit to a node in CUR_DEST.
1568          - It is in CUR_SRC.
1569          And update state_log.  */
1570       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1571       if (BE (err != REG_NOERROR, 0))
1572         goto free_return;
1573     }
1574   err = REG_NOERROR;
1575  free_return:
1576   re_node_set_free (&cur_dest);
1577   return err;
1578 }
1579
1580 /* Helper functions.  */
1581
1582 static reg_errcode_t
1583 clean_state_log_if_needed (mctx, next_state_log_idx)
1584     re_match_context_t *mctx;
1585     int next_state_log_idx;
1586 {
1587   int top = mctx->state_log_top;
1588
1589   if (next_state_log_idx >= mctx->input.bufs_len
1590       || (next_state_log_idx >= mctx->input.valid_len
1591           && mctx->input.valid_len < mctx->input.len))
1592     {
1593       reg_errcode_t err;
1594       err = extend_buffers (mctx);
1595       if (BE (err != REG_NOERROR, 0))
1596         return err;
1597     }
1598
1599   if (top < next_state_log_idx)
1600     {
1601       memset (mctx->state_log + top + 1, '\0',
1602               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1603       mctx->state_log_top = next_state_log_idx;
1604     }
1605   return REG_NOERROR;
1606 }
1607
1608 static reg_errcode_t
1609 merge_state_array (dfa, dst, src, num)
1610      re_dfa_t *dfa;
1611      re_dfastate_t **dst;
1612      re_dfastate_t **src;
1613      int num;
1614 {
1615   int st_idx;
1616   reg_errcode_t err;
1617   for (st_idx = 0; st_idx < num; ++st_idx)
1618     {
1619       if (dst[st_idx] == NULL)
1620         dst[st_idx] = src[st_idx];
1621       else if (src[st_idx] != NULL)
1622         {
1623           re_node_set merged_set;
1624           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1625                                         &src[st_idx]->nodes);
1626           if (BE (err != REG_NOERROR, 0))
1627             return err;
1628           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1629           re_node_set_free (&merged_set);
1630           if (BE (err != REG_NOERROR, 0))
1631             return err;
1632         }
1633     }
1634   return REG_NOERROR;
1635 }
1636
1637 static reg_errcode_t
1638 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1639      re_match_context_t *mctx;
1640      re_sift_context_t *sctx;
1641      int str_idx;
1642      re_node_set *dest_nodes;
1643 {
1644   re_dfa_t *const dfa = mctx->dfa;
1645   reg_errcode_t err;
1646   const re_node_set *candidates;
1647   candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1648                 : &mctx->state_log[str_idx]->nodes);
1649
1650   /* At first, add the nodes which can epsilon transit to a node in
1651      DEST_NODE.  */
1652   if (dest_nodes->nelem)
1653     {
1654       err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1655       if (BE (err != REG_NOERROR, 0))
1656         return err;
1657     }
1658
1659   /* Then, check the limitations in the current sift_context.  */
1660   if (dest_nodes->nelem && sctx->limits.nelem)
1661     {
1662       err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1663                                  mctx->bkref_ents, str_idx);
1664       if (BE (err != REG_NOERROR, 0))
1665         return err;
1666     }
1667
1668   /* Update state_log.  */
1669   sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1670   if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1671     return err;
1672
1673   if ((mctx->state_log[str_idx] != NULL
1674        && mctx->state_log[str_idx]->has_backref))
1675     {
1676       err = sift_states_bkref (mctx, sctx, str_idx, dest_nodes);
1677       if (BE (err != REG_NOERROR, 0))
1678         return err;
1679     }
1680   return REG_NOERROR;
1681 }
1682
1683 static reg_errcode_t
1684 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1685      re_dfa_t *dfa;
1686      re_node_set *dest_nodes;
1687      const re_node_set *candidates;
1688 {
1689   reg_errcode_t err;
1690   int src_idx;
1691   re_node_set src_copy;
1692
1693   err = re_node_set_init_copy (&src_copy, dest_nodes);
1694   if (BE (err != REG_NOERROR, 0))
1695     return err;
1696   for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1697     {
1698       err = re_node_set_add_intersect (dest_nodes, candidates,
1699                                        dfa->inveclosures
1700                                        + src_copy.elems[src_idx]);
1701       if (BE (err != REG_NOERROR, 0))
1702         {
1703           re_node_set_free (&src_copy);
1704           return err;
1705         }
1706     }
1707   re_node_set_free (&src_copy);
1708   return REG_NOERROR;
1709 }
1710
1711 static reg_errcode_t
1712 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1713      re_dfa_t *dfa;
1714      int node;
1715      re_node_set *dest_nodes;
1716      const re_node_set *candidates;
1717 {
1718     int ecl_idx;
1719     reg_errcode_t err;
1720     re_node_set *inv_eclosure = dfa->inveclosures + node;
1721     re_node_set except_nodes;
1722     re_node_set_init_empty (&except_nodes);
1723     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1724       {
1725         int cur_node = inv_eclosure->elems[ecl_idx];
1726         if (cur_node == node)
1727           continue;
1728         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1729           {
1730             int edst1 = dfa->edests[cur_node].elems[0];
1731             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1732                          ? dfa->edests[cur_node].elems[1] : -1);
1733             if ((!re_node_set_contains (inv_eclosure, edst1)
1734                  && re_node_set_contains (dest_nodes, edst1))
1735                 || (edst2 > 0
1736                     && !re_node_set_contains (inv_eclosure, edst2)
1737                     && re_node_set_contains (dest_nodes, edst2)))
1738               {
1739                 err = re_node_set_add_intersect (&except_nodes, candidates,
1740                                                  dfa->inveclosures + cur_node);
1741                 if (BE (err != REG_NOERROR, 0))
1742                   {
1743                     re_node_set_free (&except_nodes);
1744                     return err;
1745                   }
1746               }
1747           }
1748       }
1749     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1750       {
1751         int cur_node = inv_eclosure->elems[ecl_idx];
1752         if (!re_node_set_contains (&except_nodes, cur_node))
1753           {
1754             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1755             re_node_set_remove_at (dest_nodes, idx);
1756           }
1757       }
1758     re_node_set_free (&except_nodes);
1759     return REG_NOERROR;
1760 }
1761
1762 static int
1763 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1764      re_match_context_t *mctx;
1765      re_node_set *limits;
1766      int dst_node, dst_idx, src_node, src_idx;
1767 {
1768   re_dfa_t *const dfa = mctx->dfa;
1769   int lim_idx, src_pos, dst_pos;
1770
1771   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1772     {
1773       int subexp_idx;
1774       struct re_backref_cache_entry *ent;
1775       ent = mctx->bkref_ents + limits->elems[lim_idx];
1776       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1777
1778       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1779                                            dfa->eclosures + dst_node,
1780                                            subexp_idx, dst_node, dst_idx);
1781       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1782                                            dfa->eclosures + src_node,
1783                                            subexp_idx, src_node, src_idx);
1784
1785       /* In case of:
1786          <src> <dst> ( <subexp> )
1787          ( <subexp> ) <src> <dst>
1788          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1789       if (src_pos == dst_pos)
1790         continue; /* This is unrelated limitation.  */
1791       else
1792         return 1;
1793     }
1794   return 0;
1795 }
1796
1797 static int
1798 check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node,
1799                            str_idx)
1800      re_match_context_t *mctx;
1801      re_node_set *eclosures;
1802      int limit, subexp_idx, from_node, str_idx;
1803 {
1804   re_dfa_t *const dfa = mctx->dfa;
1805   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1806   int node_idx;
1807
1808   /* If we are outside the range of the subexpression, return -1 or 1.  */
1809   if (str_idx < lim->subexp_from)
1810     return -1;
1811
1812   if (lim->subexp_to < str_idx)
1813     return 1;
1814
1815   /* If we are within the subexpression, return 0.  */
1816   if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
1817     return 0;
1818
1819   /* Else, we are on the boundary: examine the nodes on the epsilon
1820      closure.  */
1821   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1822     {
1823       int node = eclosures->elems[node_idx];
1824       switch (dfa->nodes[node].type)
1825         {
1826         case OP_BACK_REF:
1827           {
1828             int bi = search_cur_bkref_entry (mctx, str_idx);
1829             for (; bi < mctx->nbkref_ents; ++bi)
1830               {
1831                 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
1832                 int dst, cpos;
1833
1834                 /* If this backreference goes beyond the point we're
1835                    examining, don't go any further.  */
1836                 if (ent->str_idx > str_idx)
1837                   break;
1838
1839                 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1840                   continue;
1841
1842                 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1843                    OP_CLOSE_SUBEXP cases below.  But, if the
1844                    destination node is the same node as the source
1845                    node, don't recurse because it would cause an
1846                    infinite loop: a regex that exhibits this behavior
1847                    is ()\1*\1*  */
1848                 dst = dfa->edests[node].elems[0];
1849                 if (dst == from_node)
1850                   {
1851                     if (str_idx == lim->subexp_from)
1852                       return -1;
1853                     else /* if (str_idx == lim->subexp_to) */
1854                       return 0;
1855                   }
1856
1857                 cpos = check_dst_limits_calc_pos (mctx, limit,
1858                                                   dfa->eclosures + dst,
1859                                                   subexp_idx, dst,
1860                                                   str_idx);
1861
1862                 if (cpos == -1 && str_idx == lim->subexp_from)
1863                   return -1;
1864
1865                 if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
1866                   return 0;
1867               }
1868               break;
1869             }
1870
1871         case OP_OPEN_SUBEXP:
1872           if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
1873             return -1;
1874           break;
1875
1876         case OP_CLOSE_SUBEXP:
1877           if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
1878             return 0;
1879           break;
1880
1881         default:
1882             break;
1883         }
1884     }
1885
1886   if (str_idx == lim->subexp_to)
1887     return 1;
1888   else
1889     return 0;
1890 }
1891
1892 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1893    which are against limitations from DEST_NODES. */
1894
1895 static reg_errcode_t
1896 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1897      re_dfa_t *dfa;
1898      re_node_set *dest_nodes;
1899      const re_node_set *candidates;
1900      re_node_set *limits;
1901      struct re_backref_cache_entry *bkref_ents;
1902      int str_idx;
1903 {
1904   reg_errcode_t err;
1905   int node_idx, lim_idx;
1906
1907   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1908     {
1909       int subexp_idx;
1910       struct re_backref_cache_entry *ent;
1911       ent = bkref_ents + limits->elems[lim_idx];
1912
1913       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1914         continue; /* This is unrelated limitation.  */
1915
1916       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1917       if (ent->subexp_to == str_idx)
1918         {
1919           int ops_node = -1;
1920           int cls_node = -1;
1921           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1922             {
1923               int node = dest_nodes->elems[node_idx];
1924               re_token_type_t type = dfa->nodes[node].type;
1925               if (type == OP_OPEN_SUBEXP
1926                   && subexp_idx == dfa->nodes[node].opr.idx)
1927                 ops_node = node;
1928               else if (type == OP_CLOSE_SUBEXP
1929                        && subexp_idx == dfa->nodes[node].opr.idx)
1930                 cls_node = node;
1931             }
1932
1933           /* Check the limitation of the open subexpression.  */
1934           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
1935           if (ops_node >= 0)
1936             {
1937               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1938                                            candidates);
1939               if (BE (err != REG_NOERROR, 0))
1940                 return err;
1941             }
1942
1943           /* Check the limitation of the close subexpression.  */
1944           if (cls_node >= 0)
1945             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1946               {
1947                 int node = dest_nodes->elems[node_idx];
1948                 if (!re_node_set_contains (dfa->inveclosures + node,
1949                                            cls_node)
1950                     && !re_node_set_contains (dfa->eclosures + node,
1951                                               cls_node))
1952                   {
1953                     /* It is against this limitation.
1954                        Remove it form the current sifted state.  */
1955                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1956                                                  candidates);
1957                     if (BE (err != REG_NOERROR, 0))
1958                       return err;
1959                     --node_idx;
1960                   }
1961               }
1962         }
1963       else /* (ent->subexp_to != str_idx)  */
1964         {
1965           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1966             {
1967               int node = dest_nodes->elems[node_idx];
1968               re_token_type_t type = dfa->nodes[node].type;
1969               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1970                 {
1971                   if (subexp_idx != dfa->nodes[node].opr.idx)
1972                     continue;
1973                   if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1974                       || (type == OP_OPEN_SUBEXP))
1975                     {
1976                       /* It is against this limitation.
1977                          Remove it form the current sifted state.  */
1978                       err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1979                                                    candidates);
1980                       if (BE (err != REG_NOERROR, 0))
1981                         return err;
1982                     }
1983                 }
1984             }
1985         }
1986     }
1987   return REG_NOERROR;
1988 }
1989
1990 static reg_errcode_t
1991 sift_states_bkref (mctx, sctx, str_idx, dest_nodes)
1992      re_match_context_t *mctx;
1993      re_sift_context_t *sctx;
1994      int str_idx;
1995      re_node_set *dest_nodes;
1996 {
1997   re_dfa_t *const dfa = mctx->dfa;
1998   reg_errcode_t err;
1999   int node_idx, node;
2000   re_sift_context_t local_sctx;
2001   const re_node_set *candidates;
2002   candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
2003                 : &mctx->state_log[str_idx]->nodes);
2004   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2005
2006   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2007     {
2008       int cur_bkref_idx = re_string_cur_idx (&mctx->input);
2009       re_token_type_t type;
2010       node = candidates->elems[node_idx];
2011       type = dfa->nodes[node].type;
2012       if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
2013         continue;
2014       /* Avoid infinite loop for the REs like "()\1+".  */
2015       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2016         continue;
2017       if (type == OP_BACK_REF)
2018         {
2019           int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2020           for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2021             {
2022               int disabled_idx, subexp_len, to_idx, dst_node;
2023               struct re_backref_cache_entry *entry;
2024               entry = mctx->bkref_ents + enabled_idx;
2025               if (entry->str_idx > str_idx)
2026                 break;
2027               if (entry->node != node)
2028                   continue;
2029               subexp_len = entry->subexp_to - entry->subexp_from;
2030               to_idx = str_idx + subexp_len;
2031               dst_node = (subexp_len ? dfa->nexts[node]
2032                           : dfa->edests[node].elems[0]);
2033
2034               if (to_idx > sctx->last_str_idx
2035                   || sctx->sifted_states[to_idx] == NULL
2036                   || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
2037                                            dst_node)
2038                   || check_dst_limits (mctx, &sctx->limits, node,
2039                                        str_idx, dst_node, to_idx))
2040                 continue;
2041                 {
2042                   re_dfastate_t *cur_state;
2043                   entry->flag = 0;
2044                   for (disabled_idx = enabled_idx + 1;
2045                        disabled_idx < mctx->nbkref_ents; ++disabled_idx)
2046                     {
2047                       struct re_backref_cache_entry *entry2;
2048                       entry2 = mctx->bkref_ents + disabled_idx;
2049                       if (entry2->str_idx > str_idx)
2050                         break;
2051                       entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
2052                     }
2053
2054                   if (local_sctx.sifted_states == NULL)
2055                     {
2056                       local_sctx = *sctx;
2057                       err = re_node_set_init_copy (&local_sctx.limits,
2058                                                    &sctx->limits);
2059                       if (BE (err != REG_NOERROR, 0))
2060                         goto free_return;
2061                     }
2062                   local_sctx.last_node = node;
2063                   local_sctx.last_str_idx = str_idx;
2064                   err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2065                   if (BE (err < 0, 0))
2066                     {
2067                       err = REG_ESPACE;
2068                       goto free_return;
2069                     }
2070                   cur_state = local_sctx.sifted_states[str_idx];
2071                   err = sift_states_backward (mctx, &local_sctx);
2072                   if (BE (err != REG_NOERROR, 0))
2073                     goto free_return;
2074                   if (sctx->limited_states != NULL)
2075                     {
2076                       err = merge_state_array (dfa, sctx->limited_states,
2077                                                local_sctx.sifted_states,
2078                                                str_idx + 1);
2079                       if (BE (err != REG_NOERROR, 0))
2080                         goto free_return;
2081                     }
2082                   local_sctx.sifted_states[str_idx] = cur_state;
2083                   re_node_set_remove (&local_sctx.limits, enabled_idx);
2084                   /* We must not use the variable entry here, since
2085                      mctx->bkref_ents might be realloced.  */
2086                   mctx->bkref_ents[enabled_idx].flag = 1;
2087                 }
2088             }
2089           enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2090           for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2091             {
2092               struct re_backref_cache_entry *entry;
2093               entry = mctx->bkref_ents + enabled_idx;
2094               if (entry->str_idx > str_idx)
2095                 break;
2096               if (entry->node == node)
2097                 entry->flag = 0;
2098             }
2099         }
2100     }
2101   err = REG_NOERROR;
2102  free_return:
2103   if (local_sctx.sifted_states != NULL)
2104     {
2105       re_node_set_free (&local_sctx.limits);
2106     }
2107
2108   return err;
2109 }
2110
2111
2112 #ifdef RE_ENABLE_I18N
2113 static int
2114 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2115     const re_match_context_t *mctx;
2116     re_sift_context_t *sctx;
2117     int node_idx, str_idx, max_str_idx;
2118 {
2119   re_dfa_t *const dfa = mctx->dfa;
2120   int naccepted;
2121   /* Check the node can accept `multi byte'.  */
2122   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2123   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2124       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2125                             dfa->nexts[node_idx]))
2126     /* The node can't accept the `multi byte', or the
2127        destination was already thrown away, then the node
2128        could't accept the current input `multi byte'.   */
2129     naccepted = 0;
2130   /* Otherwise, it is sure that the node could accept
2131      `naccepted' bytes input.  */
2132   return naccepted;
2133 }
2134 #endif /* RE_ENABLE_I18N */
2135
2136 \f
2137 /* Functions for state transition.  */
2138
2139 /* Return the next state to which the current state STATE will transit by
2140    accepting the current input byte, and update STATE_LOG if necessary.
2141    If STATE can accept a multibyte char/collating element/back reference
2142    update the destination of STATE_LOG.  */
2143
2144 static re_dfastate_t *
2145 transit_state (err, mctx, state)
2146      reg_errcode_t *err;
2147      re_match_context_t *mctx;
2148      re_dfastate_t *state;
2149 {
2150   re_dfa_t *const dfa = mctx->dfa;
2151   re_dfastate_t **trtable;
2152   unsigned char ch;
2153
2154   if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
2155       || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
2156           && mctx->input.valid_len < mctx->input.len))
2157     {
2158       *err = extend_buffers (mctx);
2159       if (BE (*err != REG_NOERROR, 0))
2160         return NULL;
2161     }
2162
2163 #ifdef RE_ENABLE_I18N
2164       /* If the current state can accept multibyte.  */
2165       if (state->accept_mb)
2166         {
2167           *err = transit_state_mb (mctx, state);
2168           if (BE (*err != REG_NOERROR, 0))
2169             return NULL;
2170         }
2171 #endif /* RE_ENABLE_I18N */
2172
2173   /* Then decide the next state with the single byte.  */
2174   if (1)
2175     {
2176       /* Use transition table  */
2177       ch = re_string_fetch_byte (&mctx->input);
2178       trtable = state->trtable;
2179       if (trtable == NULL)
2180         {
2181           trtable = build_trtable (dfa, state);
2182           if (trtable == NULL)
2183             {
2184               *err = REG_ESPACE;
2185               return NULL;
2186             }
2187         }
2188       if (BE (state->word_trtable, 0))
2189         {
2190           unsigned int context;
2191           context
2192             = re_string_context_at (&mctx->input,
2193                                     re_string_cur_idx (&mctx->input) - 1,
2194                                     mctx->eflags);
2195           if (IS_WORD_CONTEXT (context))
2196             return trtable[ch + SBC_MAX];
2197           else
2198             return trtable[ch];
2199         }
2200       else
2201         return trtable[ch];
2202     }
2203 #if 0
2204   else
2205     /* don't use transition table  */
2206     return transit_state_sb (err, mctx, state);
2207 #endif
2208 }
2209
2210 /* Update the state_log if we need */
2211 re_dfastate_t *
2212 merge_state_with_log (err, mctx, next_state)
2213      reg_errcode_t *err;
2214      re_match_context_t *mctx;
2215      re_dfastate_t *next_state;
2216 {
2217   re_dfa_t *const dfa = mctx->dfa;
2218   int cur_idx = re_string_cur_idx (&mctx->input);
2219
2220   if (cur_idx > mctx->state_log_top)
2221     {
2222       mctx->state_log[cur_idx] = next_state;
2223       mctx->state_log_top = cur_idx;
2224     }
2225   else if (mctx->state_log[cur_idx] == 0)
2226     {
2227       mctx->state_log[cur_idx] = next_state;
2228     }
2229   else
2230     {
2231       re_dfastate_t *pstate;
2232       unsigned int context;
2233       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2234       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2235          the destination of a multibyte char/collating element/
2236          back reference.  Then the next state is the union set of
2237          these destinations and the results of the transition table.  */
2238       pstate = mctx->state_log[cur_idx];
2239       log_nodes = pstate->entrance_nodes;
2240       if (next_state != NULL)
2241         {
2242           table_nodes = next_state->entrance_nodes;
2243           *err = re_node_set_init_union (&next_nodes, table_nodes,
2244                                              log_nodes);
2245           if (BE (*err != REG_NOERROR, 0))
2246             return NULL;
2247         }
2248       else
2249         next_nodes = *log_nodes;
2250       /* Note: We already add the nodes of the initial state,
2251          then we don't need to add them here.  */
2252
2253       context = re_string_context_at (&mctx->input,
2254                                       re_string_cur_idx (&mctx->input) - 1,
2255                                       mctx->eflags);
2256       next_state = mctx->state_log[cur_idx]
2257         = re_acquire_state_context (err, dfa, &next_nodes, context);
2258       /* We don't need to check errors here, since the return value of
2259          this function is next_state and ERR is already set.  */
2260
2261       if (table_nodes != NULL)
2262         re_node_set_free (&next_nodes);
2263     }
2264
2265   if (BE (dfa->nbackref, 0) && next_state != NULL)
2266     {
2267       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2268          later.  We must check them here, since the back references in the
2269          next state might use them.  */
2270       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2271                                         cur_idx);
2272       if (BE (*err != REG_NOERROR, 0))
2273         return NULL;
2274
2275       /* If the next state has back references.  */
2276       if (next_state->has_backref)
2277         {
2278           *err = transit_state_bkref (mctx, &next_state->nodes);
2279           if (BE (*err != REG_NOERROR, 0))
2280             return NULL;
2281           next_state = mctx->state_log[cur_idx];
2282         }
2283     }
2284
2285   return next_state;
2286 }
2287
2288 /* Skip bytes in the input that correspond to part of a
2289    multi-byte match, then look in the log for a state
2290    from which to restart matching.  */
2291 re_dfastate_t *
2292 find_recover_state (err, mctx)
2293      reg_errcode_t *err;
2294      re_match_context_t *mctx;
2295 {
2296   re_dfastate_t *cur_state = NULL;
2297   do
2298     {
2299       int max = mctx->state_log_top;
2300       int cur_str_idx = re_string_cur_idx (&mctx->input);
2301
2302       do
2303         {
2304           if (++cur_str_idx > max)
2305             return NULL;
2306           re_string_skip_bytes (&mctx->input, 1);
2307         }
2308       while (mctx->state_log[cur_str_idx] == NULL);
2309
2310       cur_state = merge_state_with_log (err, mctx, NULL);
2311     }
2312   while (err == REG_NOERROR && cur_state == NULL);
2313   return cur_state;
2314 }
2315
2316 /* Helper functions for transit_state.  */
2317
2318 /* From the node set CUR_NODES, pick up the nodes whose types are
2319    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2320    expression. And register them to use them later for evaluating the
2321    correspoding back references.  */
2322
2323 static reg_errcode_t
2324 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2325      re_match_context_t *mctx;
2326      re_node_set *cur_nodes;
2327      int str_idx;
2328 {
2329   re_dfa_t *const dfa = mctx->dfa;
2330   int node_idx;
2331   reg_errcode_t err;
2332
2333   /* TODO: This isn't efficient.
2334            Because there might be more than one nodes whose types are
2335            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2336            nodes.
2337            E.g. RE: (a){2}  */
2338   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2339     {
2340       int node = cur_nodes->elems[node_idx];
2341       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2342           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2343           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2344         {
2345           err = match_ctx_add_subtop (mctx, node, str_idx);
2346           if (BE (err != REG_NOERROR, 0))
2347             return err;
2348         }
2349     }
2350   return REG_NOERROR;
2351 }
2352
2353 #if 0
2354 /* Return the next state to which the current state STATE will transit by
2355    accepting the current input byte.  */
2356
2357 static re_dfastate_t *
2358 transit_state_sb (err, mctx, state)
2359      reg_errcode_t *err;
2360      re_match_context_t *mctx;
2361      re_dfastate_t *state;
2362 {
2363   re_dfa_t *const dfa = mctx->dfa;
2364   re_node_set next_nodes;
2365   re_dfastate_t *next_state;
2366   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2367   unsigned int context;
2368
2369   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2370   if (BE (*err != REG_NOERROR, 0))
2371     return NULL;
2372   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2373     {
2374       int cur_node = state->nodes.elems[node_cnt];
2375       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2376         {
2377           *err = re_node_set_merge (&next_nodes,
2378                                     dfa->eclosures + dfa->nexts[cur_node]);
2379           if (BE (*err != REG_NOERROR, 0))
2380             {
2381               re_node_set_free (&next_nodes);
2382               return NULL;
2383             }
2384         }
2385     }
2386   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2387   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2388   /* We don't need to check errors here, since the return value of
2389      this function is next_state and ERR is already set.  */
2390
2391   re_node_set_free (&next_nodes);
2392   re_string_skip_bytes (&mctx->input, 1);
2393   return next_state;
2394 }
2395 #endif
2396
2397 #ifdef RE_ENABLE_I18N
2398 static reg_errcode_t
2399 transit_state_mb (mctx, pstate)
2400     re_match_context_t *mctx;
2401     re_dfastate_t *pstate;
2402 {
2403   re_dfa_t *const dfa = mctx->dfa;
2404   reg_errcode_t err;
2405   int i;
2406
2407   for (i = 0; i < pstate->nodes.nelem; ++i)
2408     {
2409       re_node_set dest_nodes, *new_nodes;
2410       int cur_node_idx = pstate->nodes.elems[i];
2411       int naccepted = 0, dest_idx;
2412       unsigned int context;
2413       re_dfastate_t *dest_state;
2414
2415       if (dfa->nodes[cur_node_idx].constraint)
2416         {
2417           context = re_string_context_at (&mctx->input,
2418                                           re_string_cur_idx (&mctx->input),
2419                                           mctx->eflags);
2420           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2421                                            context))
2422             continue;
2423         }
2424
2425       /* How many bytes the node can accept?  */
2426       if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2427         naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2428                                              re_string_cur_idx (&mctx->input));
2429       if (naccepted == 0)
2430         continue;
2431
2432       /* The node can accepts `naccepted' bytes.  */
2433       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2434       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2435                                : mctx->max_mb_elem_len);
2436       err = clean_state_log_if_needed (mctx, dest_idx);
2437       if (BE (err != REG_NOERROR, 0))
2438         return err;
2439 #ifdef DEBUG
2440       assert (dfa->nexts[cur_node_idx] != -1);
2441 #endif
2442       /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2443          then we use pstate->nodes.elems[i] instead.  */
2444       new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2445
2446       dest_state = mctx->state_log[dest_idx];
2447       if (dest_state == NULL)
2448         dest_nodes = *new_nodes;
2449       else
2450         {
2451           err = re_node_set_init_union (&dest_nodes,
2452                                         dest_state->entrance_nodes, new_nodes);
2453           if (BE (err != REG_NOERROR, 0))
2454             return err;
2455         }
2456       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2457       mctx->state_log[dest_idx]
2458         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2459       if (dest_state != NULL)
2460         re_node_set_free (&dest_nodes);
2461       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2462         return err;
2463     }
2464   return REG_NOERROR;
2465 }
2466 #endif /* RE_ENABLE_I18N */
2467
2468 static reg_errcode_t
2469 transit_state_bkref (mctx, nodes)
2470     re_match_context_t *mctx;
2471     const re_node_set *nodes;
2472 {
2473   re_dfa_t *const dfa = mctx->dfa;
2474   reg_errcode_t err;
2475   int i;
2476   int cur_str_idx = re_string_cur_idx (&mctx->input);
2477
2478   for (i = 0; i < nodes->nelem; ++i)
2479     {
2480       int dest_str_idx, prev_nelem, bkc_idx;
2481       int node_idx = nodes->elems[i];
2482       unsigned int context;
2483       const re_token_t *node = dfa->nodes + node_idx;
2484       re_node_set *new_dest_nodes;
2485
2486       /* Check whether `node' is a backreference or not.  */
2487       if (node->type != OP_BACK_REF)
2488         continue;
2489
2490       if (node->constraint)
2491         {
2492           context = re_string_context_at (&mctx->input, cur_str_idx,
2493                                           mctx->eflags);
2494           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2495             continue;
2496         }
2497
2498       /* `node' is a backreference.
2499          Check the substring which the substring matched.  */
2500       bkc_idx = mctx->nbkref_ents;
2501       err = get_subexp (mctx, node_idx, cur_str_idx);
2502       if (BE (err != REG_NOERROR, 0))
2503         goto free_return;
2504
2505       /* And add the epsilon closures (which is `new_dest_nodes') of
2506          the backreference to appropriate state_log.  */
2507 #ifdef DEBUG
2508       assert (dfa->nexts[node_idx] != -1);
2509 #endif
2510       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2511         {
2512           int subexp_len;
2513           re_dfastate_t *dest_state;
2514           struct re_backref_cache_entry *bkref_ent;
2515           bkref_ent = mctx->bkref_ents + bkc_idx;
2516           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2517             continue;
2518           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2519           new_dest_nodes = (subexp_len == 0
2520                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2521                             : dfa->eclosures + dfa->nexts[node_idx]);
2522           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2523                           - bkref_ent->subexp_from);
2524           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2525                                           mctx->eflags);
2526           dest_state = mctx->state_log[dest_str_idx];
2527           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2528                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2529           /* Add `new_dest_node' to state_log.  */
2530           if (dest_state == NULL)
2531             {
2532               mctx->state_log[dest_str_idx]
2533                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2534                                             context);
2535               if (BE (mctx->state_log[dest_str_idx] == NULL
2536                       && err != REG_NOERROR, 0))
2537                 goto free_return;
2538             }
2539           else
2540             {
2541               re_node_set dest_nodes;
2542               err = re_node_set_init_union (&dest_nodes,
2543                                             dest_state->entrance_nodes,
2544                                             new_dest_nodes);
2545               if (BE (err != REG_NOERROR, 0))
2546                 {
2547                   re_node_set_free (&dest_nodes);
2548                   goto free_return;
2549                 }
2550               mctx->state_log[dest_str_idx]
2551                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2552               re_node_set_free (&dest_nodes);
2553               if (BE (mctx->state_log[dest_str_idx] == NULL
2554                       && err != REG_NOERROR, 0))
2555                 goto free_return;
2556             }
2557           /* We need to check recursively if the backreference can epsilon
2558              transit.  */
2559           if (subexp_len == 0
2560               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2561             {
2562               err = check_subexp_matching_top (mctx, new_dest_nodes,
2563                                                cur_str_idx);
2564               if (BE (err != REG_NOERROR, 0))
2565                 goto free_return;
2566               err = transit_state_bkref (mctx, new_dest_nodes);
2567               if (BE (err != REG_NOERROR, 0))
2568                 goto free_return;
2569             }
2570         }
2571     }
2572   err = REG_NOERROR;
2573  free_return:
2574   return err;
2575 }
2576
2577 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2578    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2579    Note that we might collect inappropriate candidates here.
2580    However, the cost of checking them strictly here is too high, then we
2581    delay these checking for prune_impossible_nodes().  */
2582
2583 static reg_errcode_t
2584 get_subexp (mctx, bkref_node, bkref_str_idx)
2585      re_match_context_t *mctx;
2586      int bkref_node, bkref_str_idx;
2587 {
2588   re_dfa_t *const dfa = mctx->dfa;
2589   int subexp_num, sub_top_idx;
2590   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2591   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2592   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2593   for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2594     {
2595       const struct re_backref_cache_entry *entry
2596         = &mctx->bkref_ents[cache_idx];
2597       if (entry->str_idx > bkref_str_idx)
2598         break;
2599       if (entry->node == bkref_node)
2600         return REG_NOERROR; /* We already checked it.  */
2601     }
2602   subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2603
2604   /* For each sub expression  */
2605   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2606     {
2607       reg_errcode_t err;
2608       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2609       re_sub_match_last_t *sub_last;
2610       int sub_last_idx, sl_str, bkref_str_off;
2611
2612       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2613         continue; /* It isn't related.  */
2614
2615       sl_str = sub_top->str_idx;
2616       bkref_str_off = bkref_str_idx;
2617       /* At first, check the last node of sub expressions we already
2618          evaluated.  */
2619       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2620         {
2621           int sl_str_diff;
2622           sub_last = sub_top->lasts[sub_last_idx];
2623           sl_str_diff = sub_last->str_idx - sl_str;
2624           /* The matched string by the sub expression match with the substring
2625              at the back reference?  */
2626           if (sl_str_diff > 0)
2627             {
2628               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2629                 {
2630                   /* Not enough chars for a successful match.  */
2631                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2632                     break;
2633
2634                   err = clean_state_log_if_needed (mctx,
2635                                                    bkref_str_off
2636                                                    + sl_str_diff);
2637                   if (BE (err != REG_NOERROR, 0))
2638                     return err;
2639                   buf = (const char *) re_string_get_buffer (&mctx->input);
2640                 }
2641               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2642                 break; /* We don't need to search this sub expression any more.  */
2643             }
2644           bkref_str_off += sl_str_diff;
2645           sl_str += sl_str_diff;
2646           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2647                                 bkref_str_idx);
2648
2649           /* Reload buf, since the preceding call might have reallocated
2650              the buffer.  */
2651           buf = (const char *) re_string_get_buffer (&mctx->input);
2652
2653           if (err == REG_NOMATCH)
2654             continue;
2655           if (BE (err != REG_NOERROR, 0))
2656             return err;
2657         }
2658
2659       if (sub_last_idx < sub_top->nlasts)
2660         continue;
2661       if (sub_last_idx > 0)
2662         ++sl_str;
2663       /* Then, search for the other last nodes of the sub expression.  */
2664       for (; sl_str <= bkref_str_idx; ++sl_str)
2665         {
2666           int cls_node, sl_str_off;
2667           const re_node_set *nodes;
2668           sl_str_off = sl_str - sub_top->str_idx;
2669           /* The matched string by the sub expression match with the substring
2670              at the back reference?  */
2671           if (sl_str_off > 0)
2672             {
2673               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2674                 {
2675                   /* If we are at the end of the input, we cannot match.  */
2676                   if (bkref_str_off >= mctx->input.len)
2677                     break;
2678
2679                   err = extend_buffers (mctx);
2680                   if (BE (err != REG_NOERROR, 0))
2681                     return err;
2682
2683                   buf = (const char *) re_string_get_buffer (&mctx->input);
2684                 }
2685               if (buf [bkref_str_off++] != buf[sl_str - 1])
2686                 break; /* We don't need to search this sub expression
2687                           any more.  */
2688             }
2689           if (mctx->state_log[sl_str] == NULL)
2690             continue;
2691           /* Does this state have a ')' of the sub expression?  */
2692           nodes = &mctx->state_log[sl_str]->nodes;
2693           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2694           if (cls_node == -1)
2695             continue; /* No.  */
2696           if (sub_top->path == NULL)
2697             {
2698               sub_top->path = calloc (sizeof (state_array_t),
2699                                       sl_str - sub_top->str_idx + 1);
2700               if (sub_top->path == NULL)
2701                 return REG_ESPACE;
2702             }
2703           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2704              in the current context?  */
2705           err = check_arrival (mctx, sub_top->path, sub_top->node,
2706                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2707           if (err == REG_NOMATCH)
2708               continue;
2709           if (BE (err != REG_NOERROR, 0))
2710               return err;
2711           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2712           if (BE (sub_last == NULL, 0))
2713             return REG_ESPACE;
2714           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2715                                 bkref_str_idx);
2716           if (err == REG_NOMATCH)
2717             continue;
2718         }
2719     }
2720   return REG_NOERROR;
2721 }
2722
2723 /* Helper functions for get_subexp().  */
2724
2725 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2726    If it can arrive, register the sub expression expressed with SUB_TOP
2727    and SUB_LAST.  */
2728
2729 static reg_errcode_t
2730 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2731      re_match_context_t *mctx;
2732      const re_sub_match_top_t *sub_top;
2733      re_sub_match_last_t *sub_last;
2734      int bkref_node, bkref_str;
2735 {
2736   reg_errcode_t err;
2737   int to_idx;
2738   /* Can the subexpression arrive the back reference?  */
2739   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2740                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2741   if (err != REG_NOERROR)
2742     return err;
2743   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2744                              sub_last->str_idx);
2745   if (BE (err != REG_NOERROR, 0))
2746     return err;
2747   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2748   return clean_state_log_if_needed (mctx, to_idx);
2749 }
2750
2751 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2752    Search '(' if FL_OPEN, or search ')' otherwise.
2753    TODO: This function isn't efficient...
2754          Because there might be more than one nodes whose types are
2755          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2756          nodes.
2757          E.g. RE: (a){2}  */
2758
2759 static int
2760 find_subexp_node (dfa, nodes, subexp_idx, type)
2761      const re_dfa_t *dfa;
2762      const re_node_set *nodes;
2763      int subexp_idx, type;
2764 {
2765   int cls_idx;
2766   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2767     {
2768       int cls_node = nodes->elems[cls_idx];
2769       const re_token_t *node = dfa->nodes + cls_node;
2770       if (node->type == type
2771           && node->opr.idx == subexp_idx)
2772         return cls_node;
2773     }
2774   return -1;
2775 }
2776
2777 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2778    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2779    heavily reused.
2780    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2781
2782 static reg_errcode_t
2783 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2784                type)
2785      re_match_context_t *mctx;
2786      state_array_t *path;
2787      int top_node, top_str, last_node, last_str, type;
2788 {
2789   re_dfa_t *const dfa = mctx->dfa;
2790   reg_errcode_t err;
2791   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2792   re_dfastate_t *cur_state = NULL;
2793   re_node_set *cur_nodes, next_nodes;
2794   re_dfastate_t **backup_state_log;
2795   unsigned int context;
2796
2797   subexp_num = dfa->nodes[top_node].opr.idx;
2798   /* Extend the buffer if we need.  */
2799   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2800     {
2801       re_dfastate_t **new_array;
2802       int old_alloc = path->alloc;
2803       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2804       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2805       if (new_array == NULL)
2806         {
2807           path->alloc = old_alloc;
2808           return REG_ESPACE;
2809         }
2810       path->array = new_array;
2811       memset (new_array + old_alloc, '\0',
2812               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2813     }
2814
2815   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2816
2817   /* Temporary modify MCTX.  */
2818   backup_state_log = mctx->state_log;
2819   backup_cur_idx = mctx->input.cur_idx;
2820   mctx->state_log = path->array;
2821   mctx->input.cur_idx = str_idx;
2822
2823   /* Setup initial node set.  */
2824   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2825   if (str_idx == top_str)
2826     {
2827       err = re_node_set_init_1 (&next_nodes, top_node);
2828       if (BE (err != REG_NOERROR, 0))
2829         return err;
2830       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2831       if (BE (err != REG_NOERROR, 0))
2832         {
2833           re_node_set_free (&next_nodes);
2834           return err;
2835         }
2836     }
2837   else
2838     {
2839       cur_state = mctx->state_log[str_idx];
2840       if (cur_state && cur_state->has_backref)
2841         {
2842           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2843           if (BE ( err != REG_NOERROR, 0))
2844             return err;
2845         }
2846       else
2847         re_node_set_init_empty (&next_nodes);
2848     }
2849   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2850     {
2851       if (next_nodes.nelem)
2852         {
2853           err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
2854                                     subexp_num, type);
2855           if (BE ( err != REG_NOERROR, 0))
2856             {
2857               re_node_set_free (&next_nodes);
2858               return err;
2859             }
2860         }
2861       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2862       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2863         {
2864           re_node_set_free (&next_nodes);
2865           return err;
2866         }
2867       mctx->state_log[str_idx] = cur_state;
2868     }
2869
2870   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2871     {
2872       re_node_set_empty (&next_nodes);
2873       if (mctx->state_log[str_idx + 1])
2874         {
2875           err = re_node_set_merge (&next_nodes,
2876                                    &mctx->state_log[str_idx + 1]->nodes);
2877           if (BE (err != REG_NOERROR, 0))
2878             {
2879               re_node_set_free (&next_nodes);
2880               return err;
2881             }
2882         }
2883       if (cur_state)
2884         {
2885           err = check_arrival_add_next_nodes (mctx, str_idx,
2886                                               &cur_state->nodes, &next_nodes);
2887           if (BE (err != REG_NOERROR, 0))
2888             {
2889               re_node_set_free (&next_nodes);
2890               return err;
2891             }
2892         }
2893       ++str_idx;
2894       if (next_nodes.nelem)
2895         {
2896           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2897           if (BE (err != REG_NOERROR, 0))
2898             {
2899               re_node_set_free (&next_nodes);
2900               return err;
2901             }
2902           err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
2903                                     subexp_num, type);
2904           if (BE ( err != REG_NOERROR, 0))
2905             {
2906               re_node_set_free (&next_nodes);
2907               return err;
2908             }
2909         }
2910       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2911       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2912       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2913         {
2914           re_node_set_free (&next_nodes);
2915           return err;
2916         }
2917       mctx->state_log[str_idx] = cur_state;
2918       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2919     }
2920   re_node_set_free (&next_nodes);
2921   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2922                : &mctx->state_log[last_str]->nodes);
2923   path->next_idx = str_idx;
2924
2925   /* Fix MCTX.  */
2926   mctx->state_log = backup_state_log;
2927   mctx->input.cur_idx = backup_cur_idx;
2928
2929   /* Then check the current node set has the node LAST_NODE.  */
2930   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2931     return REG_NOERROR;
2932
2933   return REG_NOMATCH;
2934 }
2935
2936 /* Helper functions for check_arrival.  */
2937
2938 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2939    to NEXT_NODES.
2940    TODO: This function is similar to the functions transit_state*(),
2941          however this function has many additional works.
2942          Can't we unify them?  */
2943
2944 static reg_errcode_t
2945 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
2946      re_match_context_t *mctx;
2947      int str_idx;
2948      re_node_set *cur_nodes, *next_nodes;
2949 {
2950   re_dfa_t *const dfa = mctx->dfa;
2951   int cur_idx;
2952   reg_errcode_t err;
2953   re_node_set union_set;
2954   re_node_set_init_empty (&union_set);
2955   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2956     {
2957       int naccepted = 0;
2958       int cur_node = cur_nodes->elems[cur_idx];
2959       re_token_type_t type = dfa->nodes[cur_node].type;
2960       if (IS_EPSILON_NODE (type))
2961         continue;
2962 #ifdef RE_ENABLE_I18N
2963       /* If the node may accept `multi byte'.  */
2964       if (ACCEPT_MB_NODE (type))
2965         {
2966           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2967                                                str_idx);
2968           if (naccepted > 1)
2969             {
2970               re_dfastate_t *dest_state;
2971               int next_node = dfa->nexts[cur_node];
2972               int next_idx = str_idx + naccepted;
2973               dest_state = mctx->state_log[next_idx];
2974               re_node_set_empty (&union_set);
2975               if (dest_state)
2976                 {
2977                   err = re_node_set_merge (&union_set, &dest_state->nodes);
2978                   if (BE (err != REG_NOERROR, 0))
2979                     {
2980                       re_node_set_free (&union_set);
2981                       return err;
2982                     }
2983                 }
2984               err = re_node_set_insert (&union_set, next_node);
2985               if (BE (err < 0, 0))
2986                 {
2987                   re_node_set_free (&union_set);
2988                   return REG_ESPACE;
2989                 }
2990               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2991                                                             &union_set);
2992               if (BE (mctx->state_log[next_idx] == NULL
2993                       && err != REG_NOERROR, 0))
2994                 {
2995                   re_node_set_free (&union_set);
2996                   return err;
2997                 }
2998             }
2999         }
3000 #endif /* RE_ENABLE_I18N */
3001       if (naccepted
3002           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3003         {
3004           err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3005           if (BE (err < 0, 0))
3006             {
3007               re_node_set_free (&union_set);
3008               return REG_ESPACE;
3009             }
3010         }
3011     }
3012   re_node_set_free (&union_set);
3013   return REG_NOERROR;
3014 }
3015
3016 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3017    CUR_NODES, however exclude the nodes which are:
3018     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3019     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3020 */
3021
3022 static reg_errcode_t
3023 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3024      re_dfa_t *dfa;
3025      re_node_set *cur_nodes;
3026      int ex_subexp, type;
3027 {
3028   reg_errcode_t err;
3029   int idx, outside_node;
3030   re_node_set new_nodes;
3031 #ifdef DEBUG
3032   assert (cur_nodes->nelem);
3033 #endif
3034   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3035   if (BE (err != REG_NOERROR, 0))
3036     return err;
3037   /* Create a new node set NEW_NODES with the nodes which are epsilon
3038      closures of the node in CUR_NODES.  */
3039
3040   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3041     {
3042       int cur_node = cur_nodes->elems[idx];
3043       re_node_set *eclosure = dfa->eclosures + cur_node;
3044       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3045       if (outside_node == -1)
3046         {
3047           /* There are no problematic nodes, just merge them.  */
3048           err = re_node_set_merge (&new_nodes, eclosure);
3049           if (BE (err != REG_NOERROR, 0))
3050             {
3051               re_node_set_free (&new_nodes);
3052               return err;
3053             }
3054         }
3055       else
3056         {
3057           /* There are problematic nodes, re-calculate incrementally.  */
3058           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3059                                               ex_subexp, type);
3060           if (BE (err != REG_NOERROR, 0))
3061             {
3062               re_node_set_free (&new_nodes);
3063               return err;
3064             }
3065         }
3066     }
3067   re_node_set_free (cur_nodes);
3068   *cur_nodes = new_nodes;
3069   return REG_NOERROR;
3070 }
3071
3072 /* Helper function for check_arrival_expand_ecl.
3073    Check incrementally the epsilon closure of TARGET, and if it isn't
3074    problematic append it to DST_NODES.  */
3075
3076 static reg_errcode_t
3077 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3078      re_dfa_t *dfa;
3079      int target, ex_subexp, type;
3080      re_node_set *dst_nodes;
3081 {
3082   int cur_node;
3083   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3084     {
3085       int err;
3086
3087       if (dfa->nodes[cur_node].type == type
3088           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3089         {
3090           if (type == OP_CLOSE_SUBEXP)
3091             {
3092               err = re_node_set_insert (dst_nodes, cur_node);
3093               if (BE (err == -1, 0))
3094                 return REG_ESPACE;
3095             }
3096           break;
3097         }
3098       err = re_node_set_insert (dst_nodes, cur_node);
3099       if (BE (err == -1, 0))
3100         return REG_ESPACE;
3101       if (dfa->edests[cur_node].nelem == 0)
3102         break;
3103       if (dfa->edests[cur_node].nelem == 2)
3104         {
3105           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3106                                               dfa->edests[cur_node].elems[1],
3107                                               ex_subexp, type);
3108           if (BE (err != REG_NOERROR, 0))
3109             return err;
3110         }
3111       cur_node = dfa->edests[cur_node].elems[0];
3112     }
3113   return REG_NOERROR;
3114 }
3115
3116
3117 /* For all the back references in the current state, calculate the
3118    destination of the back references by the appropriate entry
3119    in MCTX->BKREF_ENTS.  */
3120
3121 static reg_errcode_t
3122 expand_bkref_cache (mctx, cur_nodes, cur_str, last_str, subexp_num,
3123                     type)
3124      re_match_context_t *mctx;
3125      int cur_str, last_str, subexp_num, type;
3126      re_node_set *cur_nodes;
3127 {
3128   re_dfa_t *const dfa = mctx->dfa;
3129   reg_errcode_t err;
3130   int cache_idx, cache_idx_start;
3131   /* The current state.  */
3132
3133   cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3134   for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3135     {
3136       int to_idx, next_node;
3137       struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3138       if (ent->str_idx > cur_str)
3139         break;
3140       /* Is this entry ENT is appropriate?  */
3141       if (!re_node_set_contains (cur_nodes, ent->node))
3142         continue; /* No.  */
3143
3144       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3145       /* Calculate the destination of the back reference, and append it
3146          to MCTX->STATE_LOG.  */
3147       if (to_idx == cur_str)
3148         {
3149           /* The backreference did epsilon transit, we must re-check all the
3150              node in the current state.  */
3151           re_node_set new_dests;
3152           reg_errcode_t err2, err3;
3153           next_node = dfa->edests[ent->node].elems[0];
3154           if (re_node_set_contains (cur_nodes, next_node))
3155             continue;
3156           err = re_node_set_init_1 (&new_dests, next_node);
3157           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3158           err3 = re_node_set_merge (cur_nodes, &new_dests);
3159           re_node_set_free (&new_dests);
3160           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3161                   || err3 != REG_NOERROR, 0))
3162             {
3163               err = (err != REG_NOERROR ? err
3164                      : (err2 != REG_NOERROR ? err2 : err3));
3165               return err;
3166             }
3167           /* TODO: It is still inefficient...  */
3168           cache_idx = cache_idx_start - 1;
3169           continue;
3170         }
3171       else
3172         {
3173           re_node_set union_set;
3174           next_node = dfa->nexts[ent->node];
3175           if (mctx->state_log[to_idx])
3176             {
3177               int ret;
3178               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3179                                         next_node))
3180                 continue;
3181               err = re_node_set_init_copy (&union_set,
3182                                            &mctx->state_log[to_idx]->nodes);
3183               ret = re_node_set_insert (&union_set, next_node);
3184               if (BE (err != REG_NOERROR || ret < 0, 0))
3185                 {
3186                   re_node_set_free (&union_set);
3187                   err = err != REG_NOERROR ? err : REG_ESPACE;
3188                   return err;
3189                 }
3190             }
3191           else
3192             {
3193               err = re_node_set_init_1 (&union_set, next_node);
3194               if (BE (err != REG_NOERROR, 0))
3195                 return err;
3196             }
3197           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3198           re_node_set_free (&union_set);
3199           if (BE (mctx->state_log[to_idx] == NULL
3200                   && err != REG_NOERROR, 0))
3201             return err;
3202         }
3203     }
3204   return REG_NOERROR;
3205 }
3206
3207 /* Build transition table for the state.
3208    Return the new table if succeeded, otherwise return NULL.  */
3209
3210 static re_dfastate_t **
3211 build_trtable (dfa, state)
3212     re_dfa_t *dfa;
3213     re_dfastate_t *state;
3214 {
3215   reg_errcode_t err;
3216   int i, j, ch;
3217   unsigned int elem, mask;
3218   int dests_node_malloced = 0, dest_states_malloced = 0;
3219   int ndests; /* Number of the destination states from `state'.  */
3220   re_dfastate_t **trtable;
3221   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3222   re_node_set follows, *dests_node;
3223   bitset *dests_ch;
3224   bitset acceptable;
3225
3226   /* We build DFA states which corresponds to the destination nodes
3227      from `state'.  `dests_node[i]' represents the nodes which i-th
3228      destination state contains, and `dests_ch[i]' represents the
3229      characters which i-th destination state accepts.  */
3230 #ifdef _LIBC
3231   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3232     dests_node = (re_node_set *)
3233                  alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3234   else
3235 #endif
3236     {
3237       dests_node = (re_node_set *)
3238                    malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3239       if (BE (dests_node == NULL, 0))
3240         return NULL;
3241       dests_node_malloced = 1;
3242     }
3243   dests_ch = (bitset *) (dests_node + SBC_MAX);
3244
3245   /* Initialize transiton table.  */
3246   state->word_trtable = 0;
3247
3248   /* At first, group all nodes belonging to `state' into several
3249      destinations.  */
3250   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3251   if (BE (ndests <= 0, 0))
3252     {
3253       if (dests_node_malloced)
3254         free (dests_node);
3255       /* Return NULL in case of an error, trtable otherwise.  */
3256       if (ndests == 0)
3257         {
3258           state->trtable = (re_dfastate_t **)
3259             calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3260           return state->trtable;
3261         }
3262       return NULL;
3263     }
3264
3265   err = re_node_set_alloc (&follows, ndests + 1);
3266   if (BE (err != REG_NOERROR, 0))
3267     goto out_free;
3268
3269 #ifdef _LIBC
3270   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3271                          + ndests * 3 * sizeof (re_dfastate_t *)))
3272     dest_states = (re_dfastate_t **)
3273                   alloca (ndests * 3 * sizeof (re_dfastate_t *));
3274   else
3275 #endif
3276     {
3277       dest_states = (re_dfastate_t **)
3278                     malloc (ndests * 3 * sizeof (re_dfastate_t *));
3279       if (BE (dest_states == NULL, 0))
3280         {
3281 out_free:
3282           if (dest_states_malloced)
3283             free (dest_states);
3284           re_node_set_free (&follows);
3285           for (i = 0; i < ndests; ++i)
3286             re_node_set_free (dests_node + i);
3287           if (dests_node_malloced)
3288             free (dests_node);
3289           return NULL;
3290         }
3291       dest_states_malloced = 1;
3292     }
3293   dest_states_word = dest_states + ndests;
3294   dest_states_nl = dest_states_word + ndests;
3295   bitset_empty (acceptable);
3296
3297   /* Then build the states for all destinations.  */
3298   for (i = 0; i < ndests; ++i)
3299     {
3300       int next_node;
3301       re_node_set_empty (&follows);
3302       /* Merge the follows of this destination states.  */
3303       for (j = 0; j < dests_node[i].nelem; ++j)
3304         {
3305           next_node = dfa->nexts[dests_node[i].elems[j]];
3306           if (next_node != -1)
3307             {
3308               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3309               if (BE (err != REG_NOERROR, 0))
3310                 goto out_free;
3311             }
3312         }
3313       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3314       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3315         goto out_free;
3316       /* If the new state has context constraint,
3317          build appropriate states for these contexts.  */
3318       if (dest_states[i]->has_constraint)
3319         {
3320           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3321                                                           CONTEXT_WORD);
3322           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3323             goto out_free;
3324
3325           if (dest_states[i] != dest_states_word[i]
3326               && dfa->mb_cur_max > 1)
3327             state->word_trtable = 1;
3328
3329           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3330                                                         CONTEXT_NEWLINE);
3331           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3332             goto out_free;
3333         }
3334       else
3335         {
3336           dest_states_word[i] = dest_states[i];
3337           dest_states_nl[i] = dest_states[i];
3338         }
3339       bitset_merge (acceptable, dests_ch[i]);
3340     }
3341
3342   if (!BE (state->word_trtable, 0))
3343     {
3344       /* We don't care about whether the following character is a word
3345          character, or we are in a single-byte character set so we can
3346          discern by looking at the character code: allocate a
3347          256-entry transition table.  */
3348       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3349       if (BE (trtable == NULL, 0))
3350         goto out_free;
3351
3352       /* For all characters ch...:  */
3353       for (i = 0; i < BITSET_UINTS; ++i)
3354         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3355              elem;
3356              mask <<= 1, elem >>= 1, ++ch)
3357           if (BE (elem & 1, 0))
3358             {
3359               /* There must be exactly one destination which accepts
3360                  character ch.  See group_nodes_into_DFAstates.  */
3361               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3362                 ;
3363
3364               /* j-th destination accepts the word character ch.  */
3365               if (dfa->word_char[i] & mask)
3366                 trtable[ch] = dest_states_word[j];
3367               else
3368                 trtable[ch] = dest_states[j];
3369             }
3370     }
3371   else
3372     {
3373       /* We care about whether the following character is a word
3374          character, and we are in a multi-byte character set: discern
3375          by looking at the character code: build two 256-entry
3376          transition tables, one starting at trtable[0] and one
3377          starting at trtable[SBC_MAX].  */
3378       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3379                                            2 * SBC_MAX);
3380       if (BE (trtable == NULL, 0))
3381         goto out_free;
3382
3383       /* For all characters ch...:  */
3384       for (i = 0; i < BITSET_UINTS; ++i)
3385         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3386              elem;
3387              mask <<= 1, elem >>= 1, ++ch)
3388           if (BE (elem & 1, 0))
3389             {
3390               /* There must be exactly one destination which accepts
3391                  character ch.  See group_nodes_into_DFAstates.  */
3392               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3393                 ;
3394
3395               /* j-th destination accepts the word character ch.  */
3396               trtable[ch] = dest_states[j];
3397               trtable[ch + SBC_MAX] = dest_states_word[j];
3398             }
3399     }
3400
3401   /* new line */
3402   if (bitset_contain (acceptable, NEWLINE_CHAR))
3403     {
3404       /* The current state accepts newline character.  */
3405       for (j = 0; j < ndests; ++j)
3406         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3407           {
3408             /* k-th destination accepts newline character.  */
3409             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3410             if (state->word_trtable)
3411               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3412             /* There must be only one destination which accepts
3413                newline.  See group_nodes_into_DFAstates.  */
3414             break;
3415           }
3416     }
3417
3418   if (dest_states_malloced)
3419     free (dest_states);
3420
3421   re_node_set_free (&follows);
3422   for (i = 0; i < ndests; ++i)
3423     re_node_set_free (dests_node + i);
3424
3425   if (dests_node_malloced)
3426     free (dests_node);
3427
3428   state->trtable = trtable;
3429   return trtable;
3430 }
3431
3432 /* Group all nodes belonging to STATE into several destinations.
3433    Then for all destinations, set the nodes belonging to the destination
3434    to DESTS_NODE[i] and set the characters accepted by the destination
3435    to DEST_CH[i].  This function return the number of destinations.  */
3436
3437 static int
3438 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3439     re_dfa_t *dfa;
3440     const re_dfastate_t *state;
3441     re_node_set *dests_node;
3442     bitset *dests_ch;
3443 {
3444   reg_errcode_t err;
3445   int i, j, k;
3446   int ndests; /* Number of the destinations from `state'.  */
3447   bitset accepts; /* Characters a node can accept.  */
3448   const re_node_set *cur_nodes = &state->nodes;
3449   bitset_empty (accepts);
3450   ndests = 0;
3451
3452   /* For all the nodes belonging to `state',  */
3453   for (i = 0; i < cur_nodes->nelem; ++i)
3454     {
3455       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3456       re_token_type_t type = node->type;
3457       unsigned int constraint = node->constraint;
3458
3459       /* Enumerate all single byte character this node can accept.  */
3460       if (type == CHARACTER)
3461         bitset_set (accepts, node->opr.c);
3462       else if (type == SIMPLE_BRACKET)
3463         {
3464           bitset_merge (accepts, node->opr.sbcset);
3465         }
3466       else if (type == OP_PERIOD)
3467         {
3468 #ifdef RE_ENABLE_I18N
3469           if (dfa->mb_cur_max > 1)
3470             bitset_merge (accepts, dfa->sb_char);
3471           else
3472 #endif
3473             bitset_set_all (accepts);
3474           if (!(dfa->syntax & RE_DOT_NEWLINE))
3475             bitset_clear (accepts, '\n');
3476           if (dfa->syntax & RE_DOT_NOT_NULL)
3477             bitset_clear (accepts, '\0');
3478         }
3479 #ifdef RE_ENABLE_I18N
3480       else if (type == OP_UTF8_PERIOD)
3481         {
3482           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3483           if (!(dfa->syntax & RE_DOT_NEWLINE))
3484             bitset_clear (accepts, '\n');
3485           if (dfa->syntax & RE_DOT_NOT_NULL)
3486             bitset_clear (accepts, '\0');
3487         }
3488 #endif
3489       else
3490         continue;
3491
3492       /* Check the `accepts' and sift the characters which are not
3493          match it the context.  */
3494       if (constraint)
3495         {
3496           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3497             {
3498               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3499               bitset_empty (accepts);
3500               if (accepts_newline)
3501                 bitset_set (accepts, NEWLINE_CHAR);
3502               else
3503                 continue;
3504             }
3505           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3506             {
3507               bitset_empty (accepts);
3508               continue;
3509             }
3510
3511           if (constraint & NEXT_WORD_CONSTRAINT)
3512             {
3513               unsigned int any_set = 0;
3514               if (type == CHARACTER && !node->word_char)
3515                 {
3516                   bitset_empty (accepts);
3517                   continue;
3518                 }
3519 #ifdef RE_ENABLE_I18N
3520               if (dfa->mb_cur_max > 1)
3521                 for (j = 0; j < BITSET_UINTS; ++j)
3522                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3523               else
3524 #endif
3525                 for (j = 0; j < BITSET_UINTS; ++j)
3526                   any_set |= (accepts[j] &= dfa->word_char[j]);
3527               if (!any_set)
3528                 continue;
3529             }
3530           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3531             {
3532               unsigned int any_set = 0;
3533               if (type == CHARACTER && node->word_char)
3534                 {
3535                   bitset_empty (accepts);
3536                   continue;
3537                 }
3538 #ifdef RE_ENABLE_I18N
3539               if (dfa->mb_cur_max > 1)
3540                 for (j = 0; j < BITSET_UINTS; ++j)
3541                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3542               else
3543 #endif
3544                 for (j = 0; j < BITSET_UINTS; ++j)
3545                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3546               if (!any_set)
3547                 continue;
3548             }
3549         }
3550
3551       /* Then divide `accepts' into DFA states, or create a new
3552          state.  Above, we make sure that accepts is not empty.  */
3553       for (j = 0; j < ndests; ++j)
3554         {
3555           bitset intersec; /* Intersection sets, see below.  */
3556           bitset remains;
3557           /* Flags, see below.  */
3558           int has_intersec, not_subset, not_consumed;
3559
3560           /* Optimization, skip if this state doesn't accept the character.  */
3561           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3562             continue;
3563
3564           /* Enumerate the intersection set of this state and `accepts'.  */
3565           has_intersec = 0;
3566           for (k = 0; k < BITSET_UINTS; ++k)
3567             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3568           /* And skip if the intersection set is empty.  */
3569           if (!has_intersec)
3570             continue;
3571
3572           /* Then check if this state is a subset of `accepts'.  */
3573           not_subset = not_consumed = 0;
3574           for (k = 0; k < BITSET_UINTS; ++k)
3575             {
3576               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3577               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3578             }
3579
3580           /* If this state isn't a subset of `accepts', create a
3581              new group state, which has the `remains'. */
3582           if (not_subset)
3583             {
3584               bitset_copy (dests_ch[ndests], remains);
3585               bitset_copy (dests_ch[j], intersec);
3586               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3587               if (BE (err != REG_NOERROR, 0))
3588                 goto error_return;
3589               ++ndests;
3590             }
3591
3592           /* Put the position in the current group. */
3593           err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3594           if (BE (err < 0, 0))
3595             goto error_return;
3596
3597           /* If all characters are consumed, go to next node. */
3598           if (!not_consumed)
3599             break;
3600         }
3601       /* Some characters remain, create a new group. */
3602       if (j == ndests)
3603         {
3604           bitset_copy (dests_ch[ndests], accepts);
3605           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3606           if (BE (err != REG_NOERROR, 0))
3607             goto error_return;
3608           ++ndests;
3609           bitset_empty (accepts);
3610         }
3611     }
3612   return ndests;
3613  error_return:
3614   for (j = 0; j < ndests; ++j)
3615     re_node_set_free (dests_node + j);
3616   return -1;
3617 }
3618
3619 #ifdef RE_ENABLE_I18N
3620 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3621    Return the number of the bytes the node accepts.
3622    STR_IDX is the current index of the input string.
3623
3624    This function handles the nodes which can accept one character, or
3625    one collating element like '.', '[a-z]', opposite to the other nodes
3626    can only accept one byte.  */
3627
3628 static int
3629 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3630     re_dfa_t *dfa;
3631     int node_idx, str_idx;
3632     const re_string_t *input;
3633 {
3634   const re_token_t *node = dfa->nodes + node_idx;
3635   int char_len, elem_len;
3636   int i;
3637
3638   if (BE (node->type == OP_UTF8_PERIOD, 0))
3639     {
3640       unsigned char c = re_string_byte_at (input, str_idx), d;
3641       if (BE (c < 0xc2, 1))
3642         return 0;
3643
3644       if (str_idx + 2 > input->len)
3645         return 0;
3646
3647       d = re_string_byte_at (input, str_idx + 1);
3648       if (c < 0xe0)
3649         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3650       else if (c < 0xf0)
3651         {
3652           char_len = 3;
3653           if (c == 0xe0 && d < 0xa0)
3654             return 0;
3655         }
3656       else if (c < 0xf8)
3657         {
3658           char_len = 4;
3659           if (c == 0xf0 && d < 0x90)
3660             return 0;
3661         }
3662       else if (c < 0xfc)
3663         {
3664           char_len = 5;
3665           if (c == 0xf8 && d < 0x88)
3666             return 0;
3667         }
3668       else if (c < 0xfe)
3669         {
3670           char_len = 6;
3671           if (c == 0xfc && d < 0x84)
3672             return 0;
3673         }
3674       else
3675         return 0;
3676
3677       if (str_idx + char_len > input->len)
3678         return 0;
3679
3680       for (i = 1; i < char_len; ++i)
3681         {
3682           d = re_string_byte_at (input, str_idx + i);
3683           if (d < 0x80 || d > 0xbf)
3684             return 0;
3685         }
3686       return char_len;
3687     }
3688
3689   char_len = re_string_char_size_at (input, str_idx);
3690   if (node->type == OP_PERIOD)
3691     {
3692       if (char_len <= 1)
3693         return 0;
3694       /* FIXME: I don't think this if is needed, as both '\n'
3695          and '\0' are char_len == 1.  */
3696       /* '.' accepts any one character except the following two cases.  */
3697       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3698            re_string_byte_at (input, str_idx) == '\n') ||
3699           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3700            re_string_byte_at (input, str_idx) == '\0'))
3701         return 0;
3702       return char_len;
3703     }
3704
3705   elem_len = re_string_elem_size_at (input, str_idx);
3706   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3707     return 0;
3708
3709   if (node->type == COMPLEX_BRACKET)
3710     {
3711       const re_charset_t *cset = node->opr.mbcset;
3712 # ifdef _LIBC
3713       const unsigned char *pin = ((char *) re_string_get_buffer (input)
3714                                   + str_idx);
3715       int j;
3716       uint32_t nrules;
3717 # endif /* _LIBC */
3718       int match_len = 0;
3719       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3720                     ? re_string_wchar_at (input, str_idx) : 0);
3721
3722       /* match with multibyte character?  */
3723       for (i = 0; i < cset->nmbchars; ++i)
3724         if (wc == cset->mbchars[i])
3725           {
3726             match_len = char_len;
3727             goto check_node_accept_bytes_match;
3728           }
3729       /* match with character_class?  */
3730       for (i = 0; i < cset->nchar_classes; ++i)
3731         {
3732           wctype_t wt = cset->char_classes[i];
3733           if (__iswctype (wc, wt))
3734             {
3735               match_len = char_len;
3736               goto check_node_accept_bytes_match;
3737             }
3738         }
3739
3740 # ifdef _LIBC
3741       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3742       if (nrules != 0)
3743         {
3744           unsigned int in_collseq = 0;
3745           const int32_t *table, *indirect;
3746           const unsigned char *weights, *extra;
3747           const char *collseqwc;
3748           int32_t idx;
3749           /* This #include defines a local function!  */
3750 #  include <locale/weight.h>
3751
3752           /* match with collating_symbol?  */
3753           if (cset->ncoll_syms)
3754             extra = (const unsigned char *)
3755               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3756           for (i = 0; i < cset->ncoll_syms; ++i)
3757             {
3758               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3759               /* Compare the length of input collating element and
3760                  the length of current collating element.  */
3761               if (*coll_sym != elem_len)
3762                 continue;
3763               /* Compare each bytes.  */
3764               for (j = 0; j < *coll_sym; j++)
3765                 if (pin[j] != coll_sym[1 + j])
3766                   break;
3767               if (j == *coll_sym)
3768                 {
3769                   /* Match if every bytes is equal.  */
3770                   match_len = j;
3771                   goto check_node_accept_bytes_match;
3772                 }
3773             }
3774
3775           if (cset->nranges)
3776             {
3777               if (elem_len <= char_len)
3778                 {
3779                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3780                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3781                 }
3782               else
3783                 in_collseq = find_collation_sequence_value (pin, elem_len);
3784             }
3785           /* match with range expression?  */
3786           for (i = 0; i < cset->nranges; ++i)
3787             if (cset->range_starts[i] <= in_collseq
3788                 && in_collseq <= cset->range_ends[i])
3789               {
3790                 match_len = elem_len;
3791                 goto check_node_accept_bytes_match;
3792               }
3793
3794           /* match with equivalence_class?  */
3795           if (cset->nequiv_classes)
3796             {
3797               const unsigned char *cp = pin;
3798               table = (const int32_t *)
3799                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3800               weights = (const unsigned char *)
3801                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3802               extra = (const unsigned char *)
3803                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3804               indirect = (const int32_t *)
3805                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3806               idx = findidx (&cp);
3807               if (idx > 0)
3808                 for (i = 0; i < cset->nequiv_classes; ++i)
3809                   {
3810                     int32_t equiv_class_idx = cset->equiv_classes[i];
3811                     size_t weight_len = weights[idx];
3812                     if (weight_len == weights[equiv_class_idx])
3813                       {
3814                         int cnt = 0;
3815                         while (cnt <= weight_len
3816                                && (weights[equiv_class_idx + 1 + cnt]
3817                                    == weights[idx + 1 + cnt]))
3818                           ++cnt;
3819                         if (cnt > weight_len)
3820                           {
3821                             match_len = elem_len;
3822                             goto check_node_accept_bytes_match;
3823                           }
3824                       }
3825                   }
3826             }
3827         }
3828       else
3829 # endif /* _LIBC */
3830         {
3831           /* match with range expression?  */
3832 #if __GNUC__ >= 2
3833           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3834 #else
3835           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3836           cmp_buf[2] = wc;
3837 #endif
3838           for (i = 0; i < cset->nranges; ++i)
3839             {
3840               cmp_buf[0] = cset->range_starts[i];
3841               cmp_buf[4] = cset->range_ends[i];
3842               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3843                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3844                 {
3845                   match_len = char_len;
3846                   goto check_node_accept_bytes_match;
3847                 }
3848             }
3849         }
3850     check_node_accept_bytes_match:
3851       if (!cset->non_match)
3852         return match_len;
3853       else
3854         {
3855           if (match_len > 0)
3856             return 0;
3857           else
3858             return (elem_len > char_len) ? elem_len : char_len;
3859         }
3860     }
3861   return 0;
3862 }
3863
3864 # ifdef _LIBC
3865 static unsigned int
3866 find_collation_sequence_value (mbs, mbs_len)
3867     const unsigned char *mbs;
3868     size_t mbs_len;
3869 {
3870   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3871   if (nrules == 0)
3872     {
3873       if (mbs_len == 1)
3874         {
3875           /* No valid character.  Match it as a single byte character.  */
3876           const unsigned char *collseq = (const unsigned char *)
3877             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3878           return collseq[mbs[0]];
3879         }
3880       return UINT_MAX;
3881     }
3882   else
3883     {
3884       int32_t idx;
3885       const unsigned char *extra = (const unsigned char *)
3886         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3887       int32_t extrasize = (const unsigned char *)
3888         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3889
3890       for (idx = 0; idx < extrasize;)
3891         {
3892           int mbs_cnt, found = 0;
3893           int32_t elem_mbs_len;
3894           /* Skip the name of collating element name.  */
3895           idx = idx + extra[idx] + 1;
3896           elem_mbs_len = extra[idx++];
3897           if (mbs_len == elem_mbs_len)
3898             {
3899               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3900                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3901                   break;
3902               if (mbs_cnt == elem_mbs_len)
3903                 /* Found the entry.  */
3904                 found = 1;
3905             }
3906           /* Skip the byte sequence of the collating element.  */
3907           idx += elem_mbs_len;
3908           /* Adjust for the alignment.  */
3909           idx = (idx + 3) & ~3;
3910           /* Skip the collation sequence value.  */
3911           idx += sizeof (uint32_t);
3912           /* Skip the wide char sequence of the collating element.  */
3913           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3914           /* If we found the entry, return the sequence value.  */
3915           if (found)
3916             return *(uint32_t *) (extra + idx);
3917           /* Skip the collation sequence value.  */
3918           idx += sizeof (uint32_t);
3919         }
3920       return UINT_MAX;
3921     }
3922 }
3923 # endif /* _LIBC */
3924 #endif /* RE_ENABLE_I18N */
3925
3926 /* Check whether the node accepts the byte which is IDX-th
3927    byte of the INPUT.  */
3928
3929 static int
3930 check_node_accept (mctx, node, idx)
3931     const re_match_context_t *mctx;
3932     const re_token_t *node;
3933     int idx;
3934 {
3935   re_dfa_t *const dfa = mctx->dfa;
3936   unsigned char ch;
3937   if (node->constraint)
3938     {
3939       /* The node has constraints.  Check whether the current context
3940          satisfies the constraints.  */
3941       unsigned int context = re_string_context_at (&mctx->input, idx,
3942                                                    mctx->eflags);
3943       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3944         return 0;
3945     }
3946   ch = re_string_byte_at (&mctx->input, idx);
3947   switch (node->type)
3948     {
3949     case CHARACTER:
3950       return node->opr.c == ch;
3951     case SIMPLE_BRACKET:
3952       return bitset_contain (node->opr.sbcset, ch);
3953 #ifdef RE_ENABLE_I18N
3954     case OP_UTF8_PERIOD:
3955       if (ch >= 0x80)
3956         return 0;
3957       /* FALLTHROUGH */
3958 #endif
3959     case OP_PERIOD:
3960       return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
3961                || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
3962     default:
3963       return 0;
3964     }
3965 }
3966
3967 /* Extend the buffers, if the buffers have run out.  */
3968
3969 static reg_errcode_t
3970 extend_buffers (mctx)
3971      re_match_context_t *mctx;
3972 {
3973   reg_errcode_t ret;
3974   re_string_t *pstr = &mctx->input;
3975
3976   /* Double the lengthes of the buffers.  */
3977   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3978   if (BE (ret != REG_NOERROR, 0))
3979     return ret;
3980
3981   if (mctx->state_log != NULL)
3982     {
3983       /* And double the length of state_log.  */
3984       /* XXX We have no indication of the size of this buffer.  If this
3985          allocation fail we have no indication that the state_log array
3986          does not have the right size.  */
3987       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3988                                               pstr->bufs_len + 1);
3989       if (BE (new_array == NULL, 0))
3990         return REG_ESPACE;
3991       mctx->state_log = new_array;
3992     }
3993
3994   /* Then reconstruct the buffers.  */
3995   if (pstr->icase)
3996     {
3997 #ifdef RE_ENABLE_I18N
3998       if (pstr->mb_cur_max > 1)
3999         {
4000           ret = build_wcs_upper_buffer (pstr);
4001           if (BE (ret != REG_NOERROR, 0))
4002             return ret;
4003         }
4004       else
4005 #endif /* RE_ENABLE_I18N  */
4006         build_upper_buffer (pstr);
4007     }
4008   else
4009     {
4010 #ifdef RE_ENABLE_I18N
4011       if (pstr->mb_cur_max > 1)
4012         build_wcs_buffer (pstr);
4013       else
4014 #endif /* RE_ENABLE_I18N  */
4015         {
4016           if (pstr->trans != NULL)
4017             re_string_translate_buffer (pstr);
4018         }
4019     }
4020   return REG_NOERROR;
4021 }
4022
4023 \f
4024 /* Functions for matching context.  */
4025
4026 /* Initialize MCTX.  */
4027
4028 static reg_errcode_t
4029 match_ctx_init (mctx, eflags, n)
4030     re_match_context_t *mctx;
4031     int eflags, n;
4032 {
4033   mctx->eflags = eflags;
4034   mctx->match_last = -1;
4035   if (n > 0)
4036     {
4037       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4038       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4039       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4040         return REG_ESPACE;
4041     }
4042   /* Already zero-ed by the caller.
4043      else
4044        mctx->bkref_ents = NULL;
4045      mctx->nbkref_ents = 0;
4046      mctx->nsub_tops = 0;  */
4047   mctx->abkref_ents = n;
4048   mctx->max_mb_elem_len = 1;
4049   mctx->asub_tops = n;
4050   return REG_NOERROR;
4051 }
4052
4053 /* Clean the entries which depend on the current input in MCTX.
4054    This function must be invoked when the matcher changes the start index
4055    of the input, or changes the input string.  */
4056
4057 static void
4058 match_ctx_clean (mctx)
4059     re_match_context_t *mctx;
4060 {
4061   match_ctx_free_subtops (mctx);
4062   mctx->nsub_tops = 0;
4063   mctx->nbkref_ents = 0;
4064 }
4065
4066 /* Free all the memory associated with MCTX.  */
4067
4068 static void
4069 match_ctx_free (mctx)
4070     re_match_context_t *mctx;
4071 {
4072   match_ctx_free_subtops (mctx);
4073   re_free (mctx->sub_tops);
4074   re_free (mctx->bkref_ents);
4075 }
4076
4077 /* Free all the memory associated with MCTX->SUB_TOPS.  */
4078
4079 static void
4080 match_ctx_free_subtops (mctx)
4081      re_match_context_t *mctx;
4082 {
4083   int st_idx;
4084   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4085     {
4086       int sl_idx;
4087       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4088       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4089         {
4090           re_sub_match_last_t *last = top->lasts[sl_idx];
4091           re_free (last->path.array);
4092           re_free (last);
4093         }
4094       re_free (top->lasts);
4095       if (top->path)
4096         {
4097           re_free (top->path->array);
4098           re_free (top->path);
4099         }
4100       free (top);
4101     }
4102 }
4103
4104 /* Add a new backreference entry to MCTX.
4105    Note that we assume that caller never call this function with duplicate
4106    entry, and call with STR_IDX which isn't smaller than any existing entry.
4107 */
4108
4109 static reg_errcode_t
4110 match_ctx_add_entry (mctx, node, str_idx, from, to)
4111      re_match_context_t *mctx;
4112      int node, str_idx, from, to;
4113 {
4114   if (mctx->nbkref_ents >= mctx->abkref_ents)
4115     {
4116       struct re_backref_cache_entry* new_entry;
4117       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4118                               mctx->abkref_ents * 2);
4119       if (BE (new_entry == NULL, 0))
4120         {
4121           re_free (mctx->bkref_ents);
4122           return REG_ESPACE;
4123         }
4124       mctx->bkref_ents = new_entry;
4125       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4126               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4127       mctx->abkref_ents *= 2;
4128     }
4129   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4130   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4131   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4132   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4133   mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
4134   if (mctx->max_mb_elem_len < to - from)
4135     mctx->max_mb_elem_len = to - from;
4136   return REG_NOERROR;
4137 }
4138
4139 /* Search for the first entry which has the same str_idx.
4140    Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4141
4142 static int
4143 search_cur_bkref_entry (mctx, str_idx)
4144      re_match_context_t *mctx;
4145      int str_idx;
4146 {
4147   int left, right, mid;
4148   right = mctx->nbkref_ents;
4149   for (left = 0; left < right;)
4150     {
4151       mid = (left + right) / 2;
4152       if (mctx->bkref_ents[mid].str_idx < str_idx)
4153         left = mid + 1;
4154       else
4155         right = mid;
4156     }
4157   return left;
4158 }
4159
4160 static void
4161 match_ctx_clear_flag (mctx)
4162      re_match_context_t *mctx;
4163 {
4164   int i;
4165   for (i = 0; i < mctx->nbkref_ents; ++i)
4166     mctx->bkref_ents[i].flag = 0;
4167 }
4168
4169 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4170    at STR_IDX.  */
4171
4172 static reg_errcode_t
4173 match_ctx_add_subtop (mctx, node, str_idx)
4174      re_match_context_t *mctx;
4175      int node, str_idx;
4176 {
4177 #ifdef DEBUG
4178   assert (mctx->sub_tops != NULL);
4179   assert (mctx->asub_tops > 0);
4180 #endif
4181   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4182     {
4183       int new_asub_tops = mctx->asub_tops * 2;
4184       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4185                                                    re_sub_match_top_t *,
4186                                                    new_asub_tops);
4187       if (BE (new_array == NULL, 0))
4188         return REG_ESPACE;
4189       mctx->sub_tops = new_array;
4190       mctx->asub_tops = new_asub_tops;
4191     }
4192   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4193   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4194     return REG_ESPACE;
4195   mctx->sub_tops[mctx->nsub_tops]->node = node;
4196   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4197   return REG_NOERROR;
4198 }
4199
4200 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4201    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4202
4203 static re_sub_match_last_t *
4204 match_ctx_add_sublast (subtop, node, str_idx)
4205      re_sub_match_top_t *subtop;
4206      int node, str_idx;
4207 {
4208   re_sub_match_last_t *new_entry;
4209   if (BE (subtop->nlasts == subtop->alasts, 0))
4210     {
4211       int new_alasts = 2 * subtop->alasts + 1;
4212       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4213                                                     re_sub_match_last_t *,
4214                                                     new_alasts);
4215       if (BE (new_array == NULL, 0))
4216         return NULL;
4217       subtop->lasts = new_array;
4218       subtop->alasts = new_alasts;
4219     }
4220   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4221   if (BE (new_entry != NULL, 1))
4222     {
4223       subtop->lasts[subtop->nlasts] = new_entry;
4224       new_entry->node = node;
4225       new_entry->str_idx = str_idx;
4226       ++subtop->nlasts;
4227     }
4228   return new_entry;
4229 }
4230
4231 static void
4232 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
4233                check_subexp)
4234     re_sift_context_t *sctx;
4235     re_dfastate_t **sifted_sts, **limited_sts;
4236     int last_node, last_str_idx, check_subexp;
4237 {
4238   sctx->sifted_states = sifted_sts;
4239   sctx->limited_states = limited_sts;
4240   sctx->last_node = last_node;
4241   sctx->last_str_idx = last_str_idx;
4242   sctx->check_subexp = check_subexp;
4243   sctx->cur_bkref = -1;
4244   sctx->cls_subexp_idx = -1;
4245   re_node_set_init_empty (&sctx->limits);
4246 }