]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regexec.c
Update.
[thirdparty/glibc.git] / posix / regexec.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
54e1cabc 2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
a9388965 21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
3ce12656
UD
22 re_string_t *input, int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static void match_ctx_free_subtops (re_match_context_t *mctx) internal_function;
a9388965 26static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
3ce12656
UD
27 int str_idx, int from, int to) internal_function;
28static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx) internal_function;
29static void match_ctx_clear_flag (re_match_context_t *mctx) internal_function;
6291ee3c 30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
3ce12656 31 int str_idx) internal_function;
6291ee3c 32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
3ce12656 33 int node, int str_idx) internal_function;
0742e48e 34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
15a7d175 35 re_dfastate_t **limited_sts, int last_node,
3ce12656 36 int last_str_idx, int check_subexp) internal_function;
a9388965 37static reg_errcode_t re_search_internal (const regex_t *preg,
15a7d175
UD
38 const char *string, int length,
39 int start, int range, int stop,
40 size_t nmatch, regmatch_t pmatch[],
3ce12656 41 int eflags) internal_function;
ac3d553b 42static int re_search_2_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
43 const char *string1, int length1,
44 const char *string2, int length2,
45 int start, int range, struct re_registers *regs,
3ce12656 46 int stop, int ret_len) internal_function;
ac3d553b 47static int re_search_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
48 const char *string, int length, int start,
49 int range, int stop, struct re_registers *regs,
3ce12656 50 int ret_len) internal_function;
ac3d553b 51static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
3ce12656 52 int nregs, int regs_allocated) internal_function;
bb3f4825
UD
53static inline re_dfastate_t *acquire_init_state_context
54 (reg_errcode_t *err, const regex_t *preg, const re_match_context_t *mctx,
3ce12656 55 int idx) __attribute ((always_inline)) internal_function;
6291ee3c 56static reg_errcode_t prune_impossible_nodes (const regex_t *preg,
3ce12656 57 re_match_context_t *mctx) internal_function;
612546c6 58static int check_matching (const regex_t *preg, re_match_context_t *mctx,
3ce12656 59 int fl_longest_match) internal_function;
3b0bdc72 60static int check_halt_node_context (const re_dfa_t *dfa, int node,
3ce12656 61 unsigned int context) internal_function;
3b0bdc72 62static int check_halt_state_context (const regex_t *preg,
15a7d175 63 const re_dfastate_t *state,
3ce12656 64 const re_match_context_t *mctx, int idx) internal_function;
81c64d40 65static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
3ce12656 66 int cur_idx, int nmatch) internal_function;
a3022b82 67static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs,
15a7d175
UD
68 const re_match_context_t *mctx,
69 int *pidx, int node, re_node_set *eps_via_nodes,
3ce12656 70 struct re_fail_stack_t *fs) internal_function;
1b2c2628 71static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
15a7d175
UD
72 int str_idx, int *dests, int nregs,
73 regmatch_t *regs,
3ce12656 74 re_node_set *eps_via_nodes) internal_function;
a3022b82 75static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
3ce12656 76 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
612546c6 77static reg_errcode_t set_regs (const regex_t *preg,
15a7d175
UD
78 const re_match_context_t *mctx,
79 size_t nmatch, regmatch_t *pmatch,
3ce12656
UD
80 int fl_backtrack) internal_function;
81static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
485d775d 82
434d3784 83#ifdef RE_ENABLE_I18N
612546c6 84static int sift_states_iter_mb (const regex_t *preg,
15a7d175
UD
85 const re_match_context_t *mctx,
86 re_sift_context_t *sctx,
3ce12656 87 int node_idx, int str_idx, int max_str_idx) internal_function;
434d3784 88#endif /* RE_ENABLE_I18N */
a9388965 89static reg_errcode_t sift_states_backward (const regex_t *preg,
15a7d175 90 re_match_context_t *mctx,
3ce12656 91 re_sift_context_t *sctx) internal_function;
0742e48e 92static reg_errcode_t update_cur_sifted_state (const regex_t *preg,
15a7d175
UD
93 re_match_context_t *mctx,
94 re_sift_context_t *sctx,
95 int str_idx,
3ce12656 96 re_node_set *dest_nodes) internal_function;
0742e48e 97static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
15a7d175 98 re_node_set *dest_nodes,
3ce12656 99 const re_node_set *candidates) internal_function;
0742e48e 100static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
15a7d175 101 re_node_set *dest_nodes,
3ce12656 102 const re_node_set *and_nodes) internal_function;
a3022b82 103static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits,
15a7d175 104 re_match_context_t *mctx, int dst_node,
3ce12656 105 int dst_idx, int src_node, int src_idx) internal_function;
a3022b82 106static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx,
15a7d175 107 int limit, re_node_set *eclosures,
3ce12656 108 int subexp_idx, int node, int str_idx) internal_function;
0742e48e 109static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
15a7d175
UD
110 re_node_set *dest_nodes,
111 const re_node_set *candidates,
112 re_node_set *limits,
113 struct re_backref_cache_entry *bkref_ents,
3ce12656 114 int str_idx) internal_function;
0742e48e 115static reg_errcode_t sift_states_bkref (const regex_t *preg,
15a7d175
UD
116 re_match_context_t *mctx,
117 re_sift_context_t *sctx,
3ce12656 118 int str_idx, re_node_set *dest_nodes) internal_function;
612546c6 119static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
3ce12656 120 int next_state_log_idx) internal_function;
0742e48e 121static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
3ce12656 122 re_dfastate_t **src, int num) internal_function;
a9388965 123static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
15a7d175 124 re_match_context_t *mctx,
3ce12656 125 re_dfastate_t *state) internal_function;
6291ee3c
UD
126static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa,
127 re_match_context_t *mctx,
128 re_node_set *cur_nodes,
3ce12656 129 int str_idx) internal_function;
c13c99fa 130#if 0
a9388965 131static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
15a7d175 132 re_dfastate_t *pstate,
3ce12656 133 re_match_context_t *mctx) internal_function;
c13c99fa 134#endif
434d3784 135#ifdef RE_ENABLE_I18N
a9388965 136static reg_errcode_t transit_state_mb (const regex_t *preg,
15a7d175 137 re_dfastate_t *pstate,
3ce12656 138 re_match_context_t *mctx) internal_function;
434d3784 139#endif /* RE_ENABLE_I18N */
a9388965 140static reg_errcode_t transit_state_bkref (const regex_t *preg,
fe9434bb 141 const re_node_set *nodes,
3ce12656 142 re_match_context_t *mctx) internal_function;
6291ee3c 143static reg_errcode_t get_subexp (const regex_t *preg, re_match_context_t *mctx,
3ce12656 144 int bkref_node, int bkref_str_idx) internal_function;
6291ee3c
UD
145static reg_errcode_t get_subexp_sub (const regex_t *preg,
146 re_match_context_t *mctx,
fe9434bb 147 const re_sub_match_top_t *sub_top,
6291ee3c 148 re_sub_match_last_t *sub_last,
3ce12656 149 int bkref_node, int bkref_str) internal_function;
fe9434bb 150static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
3ce12656 151 int subexp_idx, int type) internal_function;
54e1cabc 152static reg_errcode_t check_arrival (const regex_t *preg,
6291ee3c
UD
153 re_match_context_t *mctx,
154 state_array_t *path, int top_node,
155 int top_str, int last_node, int last_str,
3ce12656 156 int type) internal_function;
6291ee3c
UD
157static reg_errcode_t check_arrival_add_next_nodes (const regex_t *preg,
158 re_dfa_t *dfa,
159 re_match_context_t *mctx,
160 int str_idx,
161 re_node_set *cur_nodes,
3ce12656 162 re_node_set *next_nodes) internal_function;
6291ee3c
UD
163static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
164 re_node_set *cur_nodes,
3ce12656 165 int ex_subexp, int type) internal_function;
6291ee3c
UD
166static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
167 re_node_set *dst_nodes,
168 int target, int ex_subexp,
3ce12656 169 int type) internal_function;
6291ee3c
UD
170static reg_errcode_t expand_bkref_cache (const regex_t *preg,
171 re_match_context_t *mctx,
172 re_node_set *cur_nodes, int cur_str,
173 int last_str, int subexp_num,
3ce12656 174 int type) internal_function;
3b0bdc72 175static re_dfastate_t **build_trtable (const regex_t *dfa,
3ce12656 176 re_dfastate_t *state) internal_function;
434d3784 177#ifdef RE_ENABLE_I18N
3b0bdc72 178static int check_node_accept_bytes (const regex_t *preg, int node_idx,
3ce12656 179 const re_string_t *input, int idx) internal_function;
434d3784 180# ifdef _LIBC
c202c2c5 181static unsigned int find_collation_sequence_value (const unsigned char *mbs,
3ce12656 182 size_t name_len) internal_function;
434d3784
UD
183# endif /* _LIBC */
184#endif /* RE_ENABLE_I18N */
3b0bdc72 185static int group_nodes_into_DFAstates (const regex_t *dfa,
15a7d175
UD
186 const re_dfastate_t *state,
187 re_node_set *states_node,
3ce12656 188 bitset *states_ch) internal_function;
3b0bdc72 189static int check_node_accept (const regex_t *preg, const re_token_t *node,
3ce12656
UD
190 const re_match_context_t *mctx, int idx) internal_function;
191static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
3b0bdc72
UD
192\f
193/* Entry point for POSIX code. */
194
195/* regexec searches for a given pattern, specified by PREG, in the
196 string STRING.
197
198 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
199 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
200 least NMATCH elements, and we set them to the offsets of the
201 corresponding matched substrings.
202
203 EFLAGS specifies `execution flags' which affect matching: if
204 REG_NOTBOL is set, then ^ does not match at the beginning of the
205 string; if REG_NOTEOL is set, then $ does not match at the end.
206
207 We return 0 if we find a match and REG_NOMATCH if not. */
208
209int
210regexec (preg, string, nmatch, pmatch, eflags)
92b27c74
UD
211 const regex_t *__restrict preg;
212 const char *__restrict string;
3b0bdc72
UD
213 size_t nmatch;
214 regmatch_t pmatch[];
215 int eflags;
216{
a9388965 217 reg_errcode_t err;
3b0bdc72
UD
218 int length = strlen (string);
219 if (preg->no_sub)
ac3d553b 220 err = re_search_internal (preg, string, length, 0, length, length, 0,
15a7d175 221 NULL, eflags);
3b0bdc72 222 else
ac3d553b 223 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
15a7d175 224 pmatch, eflags);
a9388965 225 return err != REG_NOERROR;
3b0bdc72
UD
226}
227#ifdef _LIBC
228weak_alias (__regexec, regexec)
229#endif
230
231/* Entry points for GNU code. */
232
ac3d553b
UD
233/* re_match, re_search, re_match_2, re_search_2
234
235 The former two functions operate on STRING with length LENGTH,
236 while the later two operate on concatenation of STRING1 and STRING2
237 with lengths LENGTH1 and LENGTH2, respectively.
238
239 re_match() matches the compiled pattern in BUFP against the string,
240 starting at index START.
241
242 re_search() first tries matching at index START, then it tries to match
243 starting from index START + 1, and so on. The last start position tried
244 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
245 way as re_match().)
246
247 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
248 the first STOP characters of the concatenation of the strings should be
249 concerned.
250
251 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
252 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
253 computed relative to the concatenation, not relative to the individual
254 strings.)
255
256 On success, re_match* functions return the length of the match, re_search*
257 return the position of the start of the match. Return value -1 means no
258 match was found and -2 indicates an internal error. */
3b0bdc72
UD
259
260int
ac3d553b
UD
261re_match (bufp, string, length, start, regs)
262 struct re_pattern_buffer *bufp;
3b0bdc72
UD
263 const char *string;
264 int length, start;
265 struct re_registers *regs;
266{
ac3d553b 267 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
3b0bdc72
UD
268}
269#ifdef _LIBC
270weak_alias (__re_match, re_match)
271#endif
272
ac3d553b
UD
273int
274re_search (bufp, string, length, start, range, regs)
275 struct re_pattern_buffer *bufp;
276 const char *string;
277 int length, start, range;
278 struct re_registers *regs;
279{
280 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
281}
282#ifdef _LIBC
283weak_alias (__re_search, re_search)
284#endif
3b0bdc72
UD
285
286int
ac3d553b
UD
287re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
288 struct re_pattern_buffer *bufp;
289 const char *string1, *string2;
290 int length1, length2, start, stop;
291 struct re_registers *regs;
3b0bdc72 292{
ac3d553b 293 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 294 start, 0, regs, stop, 1);
3b0bdc72
UD
295}
296#ifdef _LIBC
297weak_alias (__re_match_2, re_match_2)
298#endif
299
3b0bdc72 300int
ac3d553b
UD
301re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
302 struct re_pattern_buffer *bufp;
303 const char *string1, *string2;
304 int length1, length2, start, range, stop;
305 struct re_registers *regs;
306{
307 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 308 start, range, regs, stop, 0);
ac3d553b
UD
309}
310#ifdef _LIBC
311weak_alias (__re_search_2, re_search_2)
312#endif
313
314static int
315re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
15a7d175 316 stop, ret_len)
ac3d553b
UD
317 struct re_pattern_buffer *bufp;
318 const char *string1, *string2;
319 int length1, length2, start, range, stop, ret_len;
320 struct re_registers *regs;
321{
322 const char *str;
323 int rval;
324 int len = length1 + length2;
325 int free_str = 0;
326
327 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
328 return -2;
329
330 /* Concatenate the strings. */
331 if (length2 > 0)
332 if (length1 > 0)
333 {
15a7d175
UD
334 char *s = re_malloc (char, len);
335
336 if (BE (s == NULL, 0))
337 return -2;
338 memcpy (s, string1, length1);
339 memcpy (s + length1, string2, length2);
340 str = s;
341 free_str = 1;
ac3d553b
UD
342 }
343 else
344 str = string2;
345 else
346 str = string1;
347
348 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
15a7d175 349 ret_len);
ac3d553b 350 if (free_str)
1b2c2628 351 re_free ((char *) str);
ac3d553b
UD
352 return rval;
353}
354
355/* The parameters have the same meaning as those of re_search.
356 Additional parameters:
357 If RET_LEN is nonzero the length of the match is returned (re_match style);
358 otherwise the position of the match is returned. */
359
360static int
361re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
362 struct re_pattern_buffer *bufp;
363 const char *string;
364 int length, start, range, stop, ret_len;
365 struct re_registers *regs;
3b0bdc72 366{
a9388965 367 reg_errcode_t result;
3b0bdc72 368 regmatch_t *pmatch;
ac3d553b
UD
369 int nregs, rval;
370 int eflags = 0;
371
372 /* Check for out-of-range. */
a877402c 373 if (BE (start < 0 || start > length, 0))
ac3d553b
UD
374 return -1;
375 if (BE (start + range > length, 0))
376 range = length - start;
a877402c
UD
377 else if (BE (start + range < 0, 0))
378 range = -start;
3b0bdc72
UD
379
380 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
381 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
382
ac3d553b
UD
383 /* Compile fastmap if we haven't yet. */
384 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
385 re_compile_fastmap (bufp);
386
387 if (BE (bufp->no_sub, 0))
388 regs = NULL;
3b0bdc72
UD
389
390 /* We need at least 1 register. */
ac3d553b
UD
391 if (regs == NULL)
392 nregs = 1;
393 else if (BE (bufp->regs_allocated == REGS_FIXED &&
15a7d175 394 regs->num_regs < bufp->re_nsub + 1, 0))
ac3d553b
UD
395 {
396 nregs = regs->num_regs;
397 if (BE (nregs < 1, 0))
15a7d175
UD
398 {
399 /* Nothing can be copied to regs. */
400 regs = NULL;
401 nregs = 1;
402 }
ac3d553b
UD
403 }
404 else
405 nregs = bufp->re_nsub + 1;
3b0bdc72 406 pmatch = re_malloc (regmatch_t, nregs);
bc15410e
UD
407 if (BE (pmatch == NULL, 0))
408 return -2;
3b0bdc72 409
ac3d553b 410 result = re_search_internal (bufp, string, length, start, range, stop,
15a7d175 411 nregs, pmatch, eflags);
3b0bdc72 412
ac3d553b
UD
413 rval = 0;
414
415 /* I hope we needn't fill ther regs with -1's when no match was found. */
416 if (result != REG_NOERROR)
417 rval = -1;
418 else if (regs != NULL)
3b0bdc72 419 {
ac3d553b
UD
420 /* If caller wants register contents data back, copy them. */
421 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
15a7d175 422 bufp->regs_allocated);
ac3d553b 423 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
15a7d175 424 rval = -2;
3b0bdc72
UD
425 }
426
ac3d553b 427 if (BE (rval == 0, 1))
3b0bdc72 428 {
ac3d553b 429 if (ret_len)
15a7d175
UD
430 {
431 assert (pmatch[0].rm_so == start);
432 rval = pmatch[0].rm_eo - start;
433 }
ac3d553b 434 else
15a7d175 435 rval = pmatch[0].rm_so;
3b0bdc72 436 }
3b0bdc72
UD
437 re_free (pmatch);
438 return rval;
439}
3b0bdc72 440
ac3d553b
UD
441static unsigned
442re_copy_regs (regs, pmatch, nregs, regs_allocated)
3b0bdc72 443 struct re_registers *regs;
ac3d553b
UD
444 regmatch_t *pmatch;
445 int nregs, regs_allocated;
3b0bdc72 446{
ac3d553b
UD
447 int rval = REGS_REALLOCATE;
448 int i;
449 int need_regs = nregs + 1;
450 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
451 uses. */
452
453 /* Have the register data arrays been allocated? */
454 if (regs_allocated == REGS_UNALLOCATED)
455 { /* No. So allocate them with malloc. */
456 regs->start = re_malloc (regoff_t, need_regs);
457 if (BE (regs->start == NULL, 0))
15a7d175 458 return REGS_UNALLOCATED;
ac3d553b
UD
459 regs->end = re_malloc (regoff_t, need_regs);
460 if (BE (regs->end == NULL, 0))
15a7d175
UD
461 {
462 re_free (regs->start);
463 return REGS_UNALLOCATED;
464 }
ac3d553b
UD
465 regs->num_regs = need_regs;
466 }
467 else if (regs_allocated == REGS_REALLOCATE)
468 { /* Yes. If we need more elements than were already
15a7d175
UD
469 allocated, reallocate them. If we need fewer, just
470 leave it alone. */
951d6408 471 if (BE (need_regs > regs->num_regs, 0))
15a7d175
UD
472 {
473 regs->start = re_realloc (regs->start, regoff_t, need_regs);
474 if (BE (regs->start == NULL, 0))
475 {
476 if (regs->end != NULL)
477 re_free (regs->end);
478 return REGS_UNALLOCATED;
479 }
480 regs->end = re_realloc (regs->end, regoff_t, need_regs);
481 if (BE (regs->end == NULL, 0))
482 {
483 re_free (regs->start);
484 return REGS_UNALLOCATED;
485 }
486 regs->num_regs = need_regs;
487 }
ac3d553b
UD
488 }
489 else
490 {
491 assert (regs_allocated == REGS_FIXED);
492 /* This function may not be called with REGS_FIXED and nregs too big. */
493 assert (regs->num_regs >= nregs);
494 rval = REGS_FIXED;
495 }
496
497 /* Copy the regs. */
498 for (i = 0; i < nregs; ++i)
499 {
500 regs->start[i] = pmatch[i].rm_so;
501 regs->end[i] = pmatch[i].rm_eo;
502 }
503 for ( ; i < regs->num_regs; ++i)
504 regs->start[i] = regs->end[i] = -1;
505
506 return rval;
3b0bdc72 507}
3b0bdc72
UD
508
509/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
510 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
511 this memory for recording register information. STARTS and ENDS
512 must be allocated using the malloc library routine, and must each
513 be at least NUM_REGS * sizeof (regoff_t) bytes long.
514
515 If NUM_REGS == 0, then subsequent matches should allocate their own
516 register data.
517
518 Unless this function is called, the first search or match using
519 PATTERN_BUFFER will allocate its own register data, without
520 freeing the old data. */
521
522void
523re_set_registers (bufp, regs, num_regs, starts, ends)
524 struct re_pattern_buffer *bufp;
525 struct re_registers *regs;
526 unsigned num_regs;
527 regoff_t *starts, *ends;
528{
529 if (num_regs)
530 {
531 bufp->regs_allocated = REGS_REALLOCATE;
532 regs->num_regs = num_regs;
533 regs->start = starts;
534 regs->end = ends;
535 }
536 else
537 {
538 bufp->regs_allocated = REGS_UNALLOCATED;
539 regs->num_regs = 0;
540 regs->start = regs->end = (regoff_t *) 0;
541 }
542}
543#ifdef _LIBC
544weak_alias (__re_set_registers, re_set_registers)
545#endif
546\f
547/* Entry points compatible with 4.2 BSD regex library. We don't define
548 them unless specifically requested. */
549
550#if defined _REGEX_RE_COMP || defined _LIBC
551int
552# ifdef _LIBC
553weak_function
554# endif
555re_exec (s)
556 const char *s;
557{
558 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
559}
560#endif /* _REGEX_RE_COMP */
561\f
562static re_node_set empty_set;
563
564/* Internal entry point. */
565
566/* Searches for a compiled pattern PREG in the string STRING, whose
567 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
568 mingings with regexec. START, and RANGE have the same meanings
569 with re_search.
a9388965
UD
570 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
571 otherwise return the error code.
3b0bdc72
UD
572 Note: We assume front end functions already check ranges.
573 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
574
a9388965 575static reg_errcode_t
ac3d553b 576re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
15a7d175 577 eflags)
3b0bdc72
UD
578 const regex_t *preg;
579 const char *string;
ac3d553b 580 int length, start, range, stop, eflags;
3b0bdc72
UD
581 size_t nmatch;
582 regmatch_t pmatch[];
583{
a9388965 584 reg_errcode_t err;
3b0bdc72
UD
585 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
586 re_string_t input;
612546c6 587 int left_lim, right_lim, incr;
3b0bdc72 588 int fl_longest_match, match_first, match_last = -1;
1b2c2628 589 int fast_translate, sb;
3b0bdc72 590 re_match_context_t mctx;
1b2c2628
UD
591 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
592 && range && !preg->can_be_null) ? preg->fastmap : NULL);
3b0bdc72
UD
593
594 /* Check if the DFA haven't been compiled. */
bc15410e 595 if (BE (preg->used == 0 || dfa->init_state == NULL
15a7d175
UD
596 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
597 || dfa->init_state_begbuf == NULL, 0))
a9388965 598 return REG_NOMATCH;
3b0bdc72 599
c70f81dd
UD
600#ifdef DEBUG
601 /* We assume front-end functions already check them. */
602 assert (start + range >= 0 && start + range <= length);
603#endif
604
605 /* If initial states with non-begbuf contexts have no elements,
606 the regex must be anchored. If preg->newline_anchor is set,
607 we'll never use init_state_nl, so do not check it. */
608 if (dfa->init_state->nodes.nelem == 0
609 && dfa->init_state_word->nodes.nelem == 0
610 && (dfa->init_state_nl->nodes.nelem == 0
611 || !preg->newline_anchor))
612 {
613 if (start != 0 && start + range != 0)
614 return REG_NOMATCH;
615 start = range = 0;
616 }
617
3b0bdc72 618 re_node_set_init_empty (&empty_set);
1b2c2628 619 memset (&mctx, '\0', sizeof (re_match_context_t));
3b0bdc72
UD
620
621 /* We must check the longest matching, if nmatch > 0. */
6291ee3c 622 fl_longest_match = (nmatch != 0 || dfa->nbackref);
3b0bdc72 623
612546c6 624 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
f0c7c524 625 preg->translate, preg->syntax & RE_ICASE, dfa);
612546c6 626 if (BE (err != REG_NOERROR, 0))
1b2c2628 627 goto free_return;
ac3d553b 628 input.stop = stop;
bb3f4825 629 input.raw_stop = stop;
612546c6
UD
630
631 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
632 if (BE (err != REG_NOERROR, 0))
1b2c2628 633 goto free_return;
612546c6 634
3b0bdc72
UD
635 /* We will log all the DFA states through which the dfa pass,
636 if nmatch > 1, or this dfa has "multibyte node", which is a
637 back-reference or a node which can accept multibyte character or
638 multi character collating element. */
639 if (nmatch > 1 || dfa->has_mb_node)
a9388965 640 {
97fd3a30 641 mctx.state_log = re_malloc (re_dfastate_t *, input.bufs_len + 1);
612546c6 642 if (BE (mctx.state_log == NULL, 0))
15a7d175
UD
643 {
644 err = REG_ESPACE;
645 goto free_return;
646 }
a9388965 647 }
3b0bdc72 648 else
612546c6 649 mctx.state_log = NULL;
3b0bdc72 650
612546c6
UD
651 match_first = start;
652 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
15a7d175 653 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
612546c6 654
3b0bdc72 655 /* Check incrementally whether of not the input string match. */
612546c6
UD
656 incr = (range < 0) ? -1 : 1;
657 left_lim = (range < 0) ? start + range : start;
658 right_lim = (range < 0) ? start : start + range;
3c0fb574 659 sb = dfa->mb_cur_max == 1;
1b2c2628 660 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
612546c6
UD
661
662 for (;;)
3b0bdc72 663 {
612546c6 664 /* At first get the current byte from input string. */
1b2c2628
UD
665 if (fastmap)
666 {
667 if (BE (fast_translate, 1))
668 {
669 unsigned RE_TRANSLATE_TYPE t
670 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
671 if (BE (range >= 0, 1))
672 {
673 if (BE (t != NULL, 0))
674 {
675 while (BE (match_first < right_lim, 1)
676 && !fastmap[t[(unsigned char) string[match_first]]])
677 ++match_first;
678 }
679 else
680 {
681 while (BE (match_first < right_lim, 1)
682 && !fastmap[(unsigned char) string[match_first]])
683 ++match_first;
684 }
685 if (BE (match_first == right_lim, 0))
686 {
687 int ch = match_first >= length
688 ? 0 : (unsigned char) string[match_first];
689 if (!fastmap[t ? t[ch] : ch])
690 break;
691 }
692 }
693 else
694 {
695 while (match_first >= left_lim)
696 {
697 int ch = match_first >= length
698 ? 0 : (unsigned char) string[match_first];
699 if (fastmap[t ? t[ch] : ch])
700 break;
701 --match_first;
702 }
703 if (match_first < left_lim)
704 break;
705 }
706 }
707 else
708 {
709 int ch;
710
711 do
712 {
713 /* In this case, we can't determine easily the current byte,
714 since it might be a component byte of a multibyte
715 character. Then we use the constructed buffer
716 instead. */
717 /* If MATCH_FIRST is out of the valid range, reconstruct the
718 buffers. */
bb3f4825 719 if (input.raw_mbs_idx + input.valid_raw_len <= match_first
1b2c2628
UD
720 || match_first < input.raw_mbs_idx)
721 {
722 err = re_string_reconstruct (&input, match_first, eflags,
723 preg->newline_anchor);
724 if (BE (err != REG_NOERROR, 0))
725 goto free_return;
726 }
727 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
728 Note that MATCH_FIRST must not be smaller than 0. */
729 ch = ((match_first >= length) ? 0
730 : re_string_byte_at (&input,
731 match_first - input.raw_mbs_idx));
732 if (fastmap[ch])
733 break;
734 match_first += incr;
735 }
736 while (match_first >= left_lim && match_first <= right_lim);
737 if (! fastmap[ch])
738 break;
739 }
740 }
612546c6 741
1b2c2628 742 /* Reconstruct the buffers so that the matcher can assume that
fe9434bb 743 the matching starts from the beginning of the buffer. */
1b2c2628
UD
744 err = re_string_reconstruct (&input, match_first, eflags,
745 preg->newline_anchor);
746 if (BE (err != REG_NOERROR, 0))
747 goto free_return;
3b0bdc72 748#ifdef RE_ENABLE_I18N
1b2c2628 749 /* Eliminate it when it is a component of a multibyte character
15a7d175 750 and isn't the head of a multibyte character. */
1b2c2628 751 if (sb || re_string_first_byte (&input, 0))
3b0bdc72 752#endif
1b2c2628
UD
753 {
754 /* It seems to be appropriate one, then use the matcher. */
755 /* We assume that the matching starts from 0. */
756 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
c13c99fa 757 match_last = check_matching (preg, &mctx, fl_longest_match);
1b2c2628
UD
758 if (match_last != -1)
759 {
760 if (BE (match_last == -2, 0))
761 {
762 err = REG_ESPACE;
763 goto free_return;
764 }
765 else
6291ee3c
UD
766 {
767 mctx.match_last = match_last;
768 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
769 {
770 re_dfastate_t *pstate = mctx.state_log[match_last];
771 mctx.last_node = check_halt_state_context (preg, pstate,
772 &mctx, match_last);
773 }
774 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
775 || dfa->nbackref)
776 {
777 err = prune_impossible_nodes (preg, &mctx);
778 if (err == REG_NOERROR)
779 break;
780 if (BE (err != REG_NOMATCH, 0))
781 goto free_return;
97fd3a30 782 match_last = -1;
6291ee3c
UD
783 }
784 else
fe9434bb 785 break; /* We found a match. */
6291ee3c 786 }
1b2c2628 787 }
6291ee3c 788 match_ctx_clean (&mctx);
1b2c2628 789 }
3b0bdc72 790 /* Update counter. */
612546c6
UD
791 match_first += incr;
792 if (match_first < left_lim || right_lim < match_first)
15a7d175 793 break;
3b0bdc72
UD
794 }
795
796 /* Set pmatch[] if we need. */
797 if (match_last != -1 && nmatch > 0)
798 {
799 int reg_idx;
800
801 /* Initialize registers. */
97fd3a30 802 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
15a7d175 803 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
3b0bdc72
UD
804
805 /* Set the points where matching start/end. */
612546c6 806 pmatch[0].rm_so = 0;
6291ee3c 807 pmatch[0].rm_eo = mctx.match_last;
3b0bdc72
UD
808
809 if (!preg->no_sub && nmatch > 1)
15a7d175 810 {
15a7d175
UD
811 err = set_regs (preg, &mctx, nmatch, pmatch,
812 dfa->has_plural_match && dfa->nbackref > 0);
813 if (BE (err != REG_NOERROR, 0))
814 goto free_return;
815 }
612546c6
UD
816
817 /* At last, add the offset to the each registers, since we slided
97fd3a30
UD
818 the buffers so that we could assume that the matching starts
819 from 0. */
612546c6 820 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
15a7d175
UD
821 if (pmatch[reg_idx].rm_so != -1)
822 {
c0d5034e 823#ifdef RE_ENABLE_I18N
bb3f4825
UD
824 if (BE (input.offsets_needed != 0, 0))
825 {
826 if (pmatch[reg_idx].rm_so == input.valid_len)
827 pmatch[reg_idx].rm_so += input.valid_raw_len - input.valid_len;
828 else
829 pmatch[reg_idx].rm_so = input.offsets[pmatch[reg_idx].rm_so];
830 if (pmatch[reg_idx].rm_eo == input.valid_len)
831 pmatch[reg_idx].rm_eo += input.valid_raw_len - input.valid_len;
832 else
833 pmatch[reg_idx].rm_eo = input.offsets[pmatch[reg_idx].rm_eo];
834 }
c0d5034e
UD
835#else
836 assert (input.offsets_needed == 0);
837#endif
15a7d175
UD
838 pmatch[reg_idx].rm_so += match_first;
839 pmatch[reg_idx].rm_eo += match_first;
840 }
3b0bdc72 841 }
1b2c2628
UD
842 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
843 free_return:
612546c6 844 re_free (mctx.state_log);
3b0bdc72
UD
845 if (dfa->nbackref)
846 match_ctx_free (&mctx);
847 re_string_destruct (&input);
1b2c2628 848 return err;
3b0bdc72
UD
849}
850
6291ee3c
UD
851static reg_errcode_t
852prune_impossible_nodes (preg, mctx)
853 const regex_t *preg;
854 re_match_context_t *mctx;
855{
856 int halt_node, match_last;
857 reg_errcode_t ret;
858 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
859 re_dfastate_t **sifted_states;
860 re_dfastate_t **lim_states = NULL;
861 re_sift_context_t sctx;
862#ifdef DEBUG
863 assert (mctx->state_log != NULL);
864#endif
865 match_last = mctx->match_last;
866 halt_node = mctx->last_node;
867 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
868 if (BE (sifted_states == NULL, 0))
869 {
870 ret = REG_ESPACE;
871 goto free_return;
872 }
873 if (dfa->nbackref)
874 {
875 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
876 if (BE (lim_states == NULL, 0))
877 {
878 ret = REG_ESPACE;
879 goto free_return;
880 }
881 while (1)
882 {
883 memset (lim_states, '\0',
884 sizeof (re_dfastate_t *) * (match_last + 1));
885 match_ctx_clear_flag (mctx);
886 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
887 match_last, 0);
888 ret = sift_states_backward (preg, mctx, &sctx);
889 re_node_set_free (&sctx.limits);
890 if (BE (ret != REG_NOERROR, 0))
891 goto free_return;
892 if (sifted_states[0] != NULL || lim_states[0] != NULL)
893 break;
894 do
895 {
896 --match_last;
897 if (match_last < 0)
898 {
899 ret = REG_NOMATCH;
900 goto free_return;
901 }
97fd3a30
UD
902 } while (mctx->state_log[match_last] == NULL
903 || !mctx->state_log[match_last]->halt);
6291ee3c
UD
904 halt_node = check_halt_state_context (preg,
905 mctx->state_log[match_last],
906 mctx, match_last);
907 }
908 ret = merge_state_array (dfa, sifted_states, lim_states,
909 match_last + 1);
910 re_free (lim_states);
911 lim_states = NULL;
912 if (BE (ret != REG_NOERROR, 0))
913 goto free_return;
914 }
915 else
916 {
917 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
918 match_last, 0);
919 ret = sift_states_backward (preg, mctx, &sctx);
920 re_node_set_free (&sctx.limits);
921 if (BE (ret != REG_NOERROR, 0))
922 goto free_return;
923 }
924 re_free (mctx->state_log);
925 mctx->state_log = sifted_states;
926 sifted_states = NULL;
927 mctx->last_node = halt_node;
928 mctx->match_last = match_last;
929 ret = REG_NOERROR;
930 free_return:
931 re_free (sifted_states);
932 re_free (lim_states);
933 return ret;
934}
935
a9388965 936/* Acquire an initial state and return it.
3b0bdc72
UD
937 We must select appropriate initial state depending on the context,
938 since initial states may have constraints like "\<", "^", etc.. */
939
bb3f4825 940static inline re_dfastate_t *
612546c6 941acquire_init_state_context (err, preg, mctx, idx)
a9388965
UD
942 reg_errcode_t *err;
943 const regex_t *preg;
612546c6
UD
944 const re_match_context_t *mctx;
945 int idx;
3b0bdc72
UD
946{
947 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
948
a9388965 949 *err = REG_NOERROR;
3b0bdc72
UD
950 if (dfa->init_state->has_constraint)
951 {
952 unsigned int context;
c13c99fa
UD
953 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
954 preg->newline_anchor);
3b0bdc72 955 if (IS_WORD_CONTEXT (context))
15a7d175 956 return dfa->init_state_word;
3b0bdc72 957 else if (IS_ORDINARY_CONTEXT (context))
15a7d175 958 return dfa->init_state;
3b0bdc72 959 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
15a7d175 960 return dfa->init_state_begbuf;
3b0bdc72 961 else if (IS_NEWLINE_CONTEXT (context))
15a7d175 962 return dfa->init_state_nl;
3b0bdc72 963 else if (IS_BEGBUF_CONTEXT (context))
15a7d175
UD
964 {
965 /* It is relatively rare case, then calculate on demand. */
966 return re_acquire_state_context (err, dfa,
967 dfa->init_state->entrance_nodes,
968 context);
969 }
3b0bdc72 970 else
15a7d175
UD
971 /* Must not happen? */
972 return dfa->init_state;
3b0bdc72
UD
973 }
974 else
975 return dfa->init_state;
976}
977
978/* Check whether the regular expression match input string INPUT or not,
a9388965
UD
979 and return the index where the matching end, return -1 if not match,
980 or return -2 in case of an error.
612546c6
UD
981 FL_LONGEST_MATCH means we want the POSIX longest matching.
982 Note that the matcher assume that the maching starts from the current
983 index of the buffer. */
3b0bdc72
UD
984
985static int
c13c99fa 986check_matching (preg, mctx, fl_longest_match)
3b0bdc72 987 const regex_t *preg;
3b0bdc72 988 re_match_context_t *mctx;
c13c99fa 989 int fl_longest_match;
3b0bdc72 990{
6291ee3c 991 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
a9388965 992 reg_errcode_t err;
612546c6
UD
993 int match = 0;
994 int match_last = -1;
995 int cur_str_idx = re_string_cur_idx (mctx->input);
3b0bdc72
UD
996 re_dfastate_t *cur_state;
997
612546c6 998 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
a9388965 999 /* An initial state must not be NULL(invalid state). */
bc15410e 1000 if (BE (cur_state == NULL, 0))
a9388965 1001 return -2;
612546c6
UD
1002 if (mctx->state_log != NULL)
1003 mctx->state_log[cur_str_idx] = cur_state;
0742e48e 1004
6291ee3c
UD
1005 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1006 later. E.g. Processing back references. */
bb3f4825 1007 if (BE (dfa->nbackref, 0))
6291ee3c
UD
1008 {
1009 err = check_subexp_matching_top (dfa, mctx, &cur_state->nodes, 0);
1010 if (BE (err != REG_NOERROR, 0))
1011 return err;
6291ee3c 1012
bb3f4825
UD
1013 if (cur_state->has_backref)
1014 {
1015 err = transit_state_bkref (preg, &cur_state->nodes, mctx);
1016 if (BE (err != REG_NOERROR, 0))
1017 return err;
1018 }
0742e48e
UD
1019 }
1020
3b0bdc72 1021 /* If the RE accepts NULL string. */
bb3f4825 1022 if (BE (cur_state->halt, 0))
3b0bdc72
UD
1023 {
1024 if (!cur_state->has_constraint
15a7d175
UD
1025 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
1026 {
1027 if (!fl_longest_match)
1028 return cur_str_idx;
1029 else
1030 {
1031 match_last = cur_str_idx;
1032 match = 1;
1033 }
1034 }
3b0bdc72
UD
1035 }
1036
612546c6 1037 while (!re_string_eoi (mctx->input))
3b0bdc72 1038 {
c13c99fa 1039 cur_state = transit_state (&err, preg, mctx, cur_state);
a9388965 1040 if (cur_state == NULL) /* Reached at the invalid state or an error. */
15a7d175
UD
1041 {
1042 cur_str_idx = re_string_cur_idx (mctx->input);
1043 if (BE (err != REG_NOERROR, 0))
1044 return -2;
c13c99fa 1045 if (!fl_longest_match && match)
15a7d175 1046 break;
c13c99fa 1047 else
15a7d175
UD
1048 {
1049 if (mctx->state_log == NULL)
1050 break;
1051 else
1052 {
1053 int max = mctx->state_log_top;
1054 for (; cur_str_idx <= max; ++cur_str_idx)
1055 if (mctx->state_log[cur_str_idx] != NULL)
1056 break;
1057 if (cur_str_idx > max)
1058 break;
1059 }
1060 }
1061 }
3b0bdc72
UD
1062
1063 if (cur_state != NULL && cur_state->halt)
15a7d175
UD
1064 {
1065 /* Reached at a halt state.
1066 Check the halt state can satisfy the current context. */
1067 if (!cur_state->has_constraint
1068 || check_halt_state_context (preg, cur_state, mctx,
1069 re_string_cur_idx (mctx->input)))
1070 {
1071 /* We found an appropriate halt state. */
1072 match_last = re_string_cur_idx (mctx->input);
1073 match = 1;
1074 if (!fl_longest_match)
1075 break;
1076 }
1077 }
3b0bdc72
UD
1078 }
1079 return match_last;
1080}
1081
1082/* Check NODE match the current context. */
1083
1084static int check_halt_node_context (dfa, node, context)
1085 const re_dfa_t *dfa;
1086 int node;
1087 unsigned int context;
1088{
3b0bdc72 1089 re_token_type_t type = dfa->nodes[node].type;
485d775d
UD
1090 unsigned int constraint = dfa->nodes[node].constraint;
1091 if (type != END_OF_RE)
3b0bdc72 1092 return 0;
485d775d
UD
1093 if (!constraint)
1094 return 1;
1095 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
3b0bdc72
UD
1096 return 0;
1097 return 1;
1098}
1099
1100/* Check the halt state STATE match the current context.
1101 Return 0 if not match, if the node, STATE has, is a halt node and
1102 match the context, return the node. */
1103
1104static int
612546c6 1105check_halt_state_context (preg, state, mctx, idx)
3b0bdc72
UD
1106 const regex_t *preg;
1107 const re_dfastate_t *state;
612546c6
UD
1108 const re_match_context_t *mctx;
1109 int idx;
3b0bdc72
UD
1110{
1111 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1112 int i;
1113 unsigned int context;
1114#ifdef DEBUG
1115 assert (state->halt);
1116#endif
612546c6 1117 context = re_string_context_at (mctx->input, idx, mctx->eflags,
15a7d175 1118 preg->newline_anchor);
3b0bdc72
UD
1119 for (i = 0; i < state->nodes.nelem; ++i)
1120 if (check_halt_node_context (dfa, state->nodes.elems[i], context))
1121 return state->nodes.elems[i];
1122 return 0;
1123}
1124
a9388965
UD
1125/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1126 corresponding to the DFA).
1127 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1128 of errors. */
3b0bdc72
UD
1129
1130static int
a3022b82 1131proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs)
3b0bdc72 1132 const regex_t *preg;
0742e48e 1133 regmatch_t *regs;
3b0bdc72 1134 const re_match_context_t *mctx;
a3022b82 1135 int nregs, *pidx, node;
3b0bdc72 1136 re_node_set *eps_via_nodes;
a3022b82 1137 struct re_fail_stack_t *fs;
3b0bdc72
UD
1138{
1139 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
485d775d 1140 int i, err, dest_node;
81c64d40 1141 dest_node = -1;
3b0bdc72
UD
1142 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1143 {
485d775d
UD
1144 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1145 int ndest, dest_nodes[2];
a9388965 1146 err = re_node_set_insert (eps_via_nodes, node);
bc15410e 1147 if (BE (err < 0, 0))
15a7d175 1148 return -1;
a3022b82 1149 /* Pick up valid destinations. */
485d775d 1150 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
15a7d175
UD
1151 {
1152 int candidate = dfa->edests[node].elems[i];
1153 if (!re_node_set_contains (cur_nodes, candidate))
1154 continue;
1155 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1156 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1157 ++ndest;
1158 }
a3022b82 1159 if (ndest <= 1)
15a7d175 1160 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
a3022b82
UD
1161 /* In order to avoid infinite loop like "(a*)*". */
1162 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
15a7d175 1163 return dest_nodes[1];
a3022b82 1164 if (fs != NULL)
15a7d175 1165 push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes);
a3022b82 1166 return dest_nodes[0];
3b0bdc72
UD
1167 }
1168 else
1169 {
485d775d 1170 int naccepted = 0;
3b0bdc72 1171 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72 1172
434d3784 1173#ifdef RE_ENABLE_I18N
3b0bdc72 1174 if (ACCEPT_MB_NODE (type))
15a7d175 1175 naccepted = check_node_accept_bytes (preg, node, mctx->input, *pidx);
434d3784
UD
1176 else
1177#endif /* RE_ENABLE_I18N */
1178 if (type == OP_BACK_REF)
15a7d175
UD
1179 {
1180 int subexp_idx = dfa->nodes[node].opr.idx;
1181 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1182 if (fs != NULL)
1183 {
1184 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1185 return -1;
1186 else if (naccepted)
1187 {
85c54a32 1188 char *buf = (char *) re_string_get_buffer (mctx->input);
6291ee3c
UD
1189 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1190 naccepted) != 0)
15a7d175
UD
1191 return -1;
1192 }
1193 }
1194
1195 if (naccepted == 0)
1196 {
1197 err = re_node_set_insert (eps_via_nodes, node);
1198 if (BE (err < 0, 0))
1199 return -2;
1200 dest_node = dfa->edests[node].elems[0];
1201 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1202 dest_node))
1203 return dest_node;
1204 }
1205 }
3b0bdc72
UD
1206
1207 if (naccepted != 0
15a7d175
UD
1208 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
1209 {
1210 dest_node = dfa->nexts[node];
1211 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1212 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1213 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1214 dest_node)))
1215 return -1;
1216 re_node_set_empty (eps_via_nodes);
1217 return dest_node;
1218 }
3b0bdc72 1219 }
a3022b82
UD
1220 return -1;
1221}
1222
1223static reg_errcode_t
1224push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1225 struct re_fail_stack_t *fs;
1226 int str_idx, *dests, nregs;
1227 regmatch_t *regs;
1228 re_node_set *eps_via_nodes;
1229{
1230 reg_errcode_t err;
1231 int num = fs->num++;
1232 if (fs->num == fs->alloc)
1233 {
1b2c2628 1234 struct re_fail_stack_ent_t *new_array;
a3022b82 1235 fs->alloc *= 2;
1b2c2628 1236 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
15a7d175 1237 * fs->alloc));
1b2c2628 1238 if (new_array == NULL)
15a7d175 1239 return REG_ESPACE;
1b2c2628 1240 fs->stack = new_array;
a3022b82
UD
1241 }
1242 fs->stack[num].idx = str_idx;
1243 fs->stack[num].node = dests[1];
1244 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1245 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1246 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1247 return err;
1248}
1b2c2628 1249
a3022b82
UD
1250static int
1251pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1252 struct re_fail_stack_t *fs;
1253 int *pidx, nregs;
1254 regmatch_t *regs;
1255 re_node_set *eps_via_nodes;
1256{
1257 int num = --fs->num;
1258 assert (num >= 0);
1259 *pidx = fs->stack[num].idx;
1260 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1261 re_node_set_free (eps_via_nodes);
485d775d 1262 re_free (fs->stack[num].regs);
a3022b82
UD
1263 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1264 return fs->stack[num].node;
3b0bdc72
UD
1265}
1266
1267/* Set the positions where the subexpressions are starts/ends to registers
1268 PMATCH.
1269 Note: We assume that pmatch[0] is already set, and
97fd3a30 1270 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
3b0bdc72 1271
a9388965 1272static reg_errcode_t
a3022b82
UD
1273set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1274 const regex_t *preg;
1275 const re_match_context_t *mctx;
1276 size_t nmatch;
1277 regmatch_t *pmatch;
1278 int fl_backtrack;
3b0bdc72
UD
1279{
1280 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
81c64d40 1281 int idx, cur_node, real_nmatch;
3b0bdc72 1282 re_node_set eps_via_nodes;
a3022b82
UD
1283 struct re_fail_stack_t *fs;
1284 struct re_fail_stack_t fs_body = {0, 2, NULL};
3b0bdc72
UD
1285#ifdef DEBUG
1286 assert (nmatch > 1);
612546c6 1287 assert (mctx->state_log != NULL);
3b0bdc72 1288#endif
a3022b82
UD
1289 if (fl_backtrack)
1290 {
1291 fs = &fs_body;
1292 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1293 }
1294 else
1295 fs = NULL;
3b0bdc72
UD
1296 cur_node = dfa->init_node;
1297 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1298 re_node_set_init_empty (&eps_via_nodes);
1299 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1300 {
81c64d40 1301 update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
a3022b82 1302 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
15a7d175
UD
1303 {
1304 int reg_idx;
1305 if (fs)
1306 {
1307 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1308 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1309 break;
1310 if (reg_idx == nmatch)
1311 {
1312 re_node_set_free (&eps_via_nodes);
1313 return free_fail_stack_return (fs);
1314 }
1315 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1316 &eps_via_nodes);
1317 }
1318 else
1319 {
1320 re_node_set_free (&eps_via_nodes);
1321 return REG_NOERROR;
1322 }
1323 }
3b0bdc72
UD
1324
1325 /* Proceed to next node. */
a3022b82 1326 cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node,
15a7d175 1327 &eps_via_nodes, fs);
a3022b82 1328
bc15410e 1329 if (BE (cur_node < 0, 0))
15a7d175
UD
1330 {
1331 if (cur_node == -2)
1332 return REG_ESPACE;
1333 if (fs)
1334 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1335 &eps_via_nodes);
1336 else
1337 {
1338 re_node_set_free (&eps_via_nodes);
1339 return REG_NOMATCH;
1340 }
1341 }
3b0bdc72
UD
1342 }
1343 re_node_set_free (&eps_via_nodes);
485d775d
UD
1344 return free_fail_stack_return (fs);
1345}
1346
1347static reg_errcode_t
1348free_fail_stack_return (fs)
1349 struct re_fail_stack_t *fs;
1350{
1351 if (fs)
1352 {
1353 int fs_idx;
1354 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
15a7d175
UD
1355 {
1356 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1357 re_free (fs->stack[fs_idx].regs);
1358 }
485d775d
UD
1359 re_free (fs->stack);
1360 }
a9388965 1361 return REG_NOERROR;
3b0bdc72
UD
1362}
1363
81c64d40
UD
1364static void
1365update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1366 re_dfa_t *dfa;
1367 regmatch_t *pmatch;
1368 int cur_node, cur_idx, nmatch;
1369{
1370 int type = dfa->nodes[cur_node].type;
1371 int reg_num;
1372 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1373 return;
1374 reg_num = dfa->nodes[cur_node].opr.idx + 1;
1375 if (reg_num >= nmatch)
1376 return;
1377 if (type == OP_OPEN_SUBEXP)
1378 {
1379 /* We are at the first node of this sub expression. */
1380 pmatch[reg_num].rm_so = cur_idx;
1381 pmatch[reg_num].rm_eo = -1;
1382 }
1383 else if (type == OP_CLOSE_SUBEXP)
1384 /* We are at the first node of this sub expression. */
1385 pmatch[reg_num].rm_eo = cur_idx;
0742e48e 1386}
81c64d40 1387
3b0bdc72
UD
1388#define NUMBER_OF_STATE 1
1389
0742e48e 1390/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
612546c6
UD
1391 and sift the nodes in each states according to the following rules.
1392 Updated state_log will be wrote to STATE_LOG.
3b0bdc72
UD
1393
1394 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1395 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
15a7d175
UD
1396 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1397 the LAST_NODE, we throw away the node `a'.
612546c6 1398 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
15a7d175
UD
1399 string `s' and transit to `b':
1400 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1401 away the node `a'.
1402 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
bb3f4825 1403 thrown away, we throw away the node `a'.
d0b96fc4 1404 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
15a7d175
UD
1405 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1406 node `a'.
bb3f4825 1407 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
15a7d175 1408 we throw away the node `a'. */
3b0bdc72
UD
1409
1410#define STATE_NODE_CONTAINS(state,node) \
1411 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1412
a9388965 1413static reg_errcode_t
0742e48e
UD
1414sift_states_backward (preg, mctx, sctx)
1415 const regex_t *preg;
1416 re_match_context_t *mctx;
1417 re_sift_context_t *sctx;
3b0bdc72 1418{
a9388965 1419 reg_errcode_t err;
3b0bdc72 1420 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
0742e48e
UD
1421 int null_cnt = 0;
1422 int str_idx = sctx->last_str_idx;
1423 re_node_set cur_dest;
1424 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
3b0bdc72
UD
1425
1426#ifdef DEBUG
612546c6 1427 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
3b0bdc72 1428#endif
0742e48e 1429 cur_src = &mctx->state_log[str_idx]->nodes;
3b0bdc72
UD
1430
1431 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1432 transit to the last_node and the last_node itself. */
0742e48e 1433 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
bc15410e 1434 if (BE (err != REG_NOERROR, 0))
a9388965 1435 return err;
0742e48e
UD
1436 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1437 if (BE (err != REG_NOERROR, 0))
1b2c2628 1438 goto free_return;
3b0bdc72
UD
1439
1440 /* Then check each states in the state_log. */
612546c6 1441 while (str_idx > 0)
3b0bdc72 1442 {
0742e48e 1443 int i, ret;
3b0bdc72 1444 /* Update counters. */
0742e48e
UD
1445 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1446 if (null_cnt > mctx->max_mb_elem_len)
15a7d175
UD
1447 {
1448 memset (sctx->sifted_states, '\0',
1449 sizeof (re_dfastate_t *) * str_idx);
1450 re_node_set_free (&cur_dest);
1451 return REG_NOERROR;
1452 }
0742e48e 1453 re_node_set_empty (&cur_dest);
3b0bdc72 1454 --str_idx;
0742e48e 1455 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1456 : &mctx->state_log[str_idx]->nodes);
3b0bdc72
UD
1457
1458 /* Then build the next sifted state.
15a7d175
UD
1459 We build the next sifted state on `cur_dest', and update
1460 `sifted_states[str_idx]' with `cur_dest'.
1461 Note:
1462 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1463 `cur_src' points the node_set of the old `state_log[str_idx]'. */
0742e48e 1464 for (i = 0; i < cur_src->nelem; i++)
15a7d175
UD
1465 {
1466 int prev_node = cur_src->elems[i];
1467 int naccepted = 0;
1468 re_token_type_t type = dfa->nodes[prev_node].type;
0742e48e 1469
46bf9de7 1470 if (IS_EPSILON_NODE (type))
15a7d175 1471 continue;
434d3784 1472#ifdef RE_ENABLE_I18N
15a7d175
UD
1473 /* If the node may accept `multi byte'. */
1474 if (ACCEPT_MB_NODE (type))
1475 naccepted = sift_states_iter_mb (preg, mctx, sctx, prev_node,
1476 str_idx, sctx->last_str_idx);
3b0bdc72 1477
434d3784 1478#endif /* RE_ENABLE_I18N */
15a7d175
UD
1479 /* We don't check backreferences here.
1480 See update_cur_sifted_state(). */
1481
1482 if (!naccepted
1483 && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1484 str_idx)
1485 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1486 dfa->nexts[prev_node]))
1487 naccepted = 1;
1488
1489 if (naccepted == 0)
1490 continue;
1491
1492 if (sctx->limits.nelem)
1493 {
1494 int to_idx = str_idx + naccepted;
1495 if (check_dst_limits (dfa, &sctx->limits, mctx,
1496 dfa->nexts[prev_node], to_idx,
1497 prev_node, str_idx))
1498 continue;
1499 }
1500 ret = re_node_set_insert (&cur_dest, prev_node);
1501 if (BE (ret == -1, 0))
1502 {
1503 err = REG_ESPACE;
1504 goto free_return;
1505 }
1506 }
3b0bdc72 1507
0742e48e 1508 /* Add all the nodes which satisfy the following conditions:
15a7d175
UD
1509 - It can epsilon transit to a node in CUR_DEST.
1510 - It is in CUR_SRC.
1511 And update state_log. */
0742e48e
UD
1512 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1513 if (BE (err != REG_NOERROR, 0))
15a7d175 1514 goto free_return;
3b0bdc72 1515 }
1b2c2628
UD
1516 err = REG_NOERROR;
1517 free_return:
0742e48e 1518 re_node_set_free (&cur_dest);
1b2c2628 1519 return err;
3b0bdc72
UD
1520}
1521
1522/* Helper functions. */
1523
25337753 1524static reg_errcode_t
612546c6 1525clean_state_log_if_need (mctx, next_state_log_idx)
3b0bdc72
UD
1526 re_match_context_t *mctx;
1527 int next_state_log_idx;
1528{
1529 int top = mctx->state_log_top;
612546c6
UD
1530
1531 if (next_state_log_idx >= mctx->input->bufs_len
1532 || (next_state_log_idx >= mctx->input->valid_len
15a7d175 1533 && mctx->input->valid_len < mctx->input->len))
612546c6
UD
1534 {
1535 reg_errcode_t err;
1536 err = extend_buffers (mctx);
1537 if (BE (err != REG_NOERROR, 0))
15a7d175 1538 return err;
612546c6
UD
1539 }
1540
3b0bdc72
UD
1541 if (top < next_state_log_idx)
1542 {
612546c6 1543 memset (mctx->state_log + top + 1, '\0',
15a7d175 1544 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
3b0bdc72
UD
1545 mctx->state_log_top = next_state_log_idx;
1546 }
612546c6 1547 return REG_NOERROR;
3b0bdc72
UD
1548}
1549
1b2c2628
UD
1550static reg_errcode_t
1551merge_state_array (dfa, dst, src, num)
0742e48e
UD
1552 re_dfa_t *dfa;
1553 re_dfastate_t **dst;
1554 re_dfastate_t **src;
1555 int num;
3b0bdc72 1556{
0742e48e
UD
1557 int st_idx;
1558 reg_errcode_t err;
1559 for (st_idx = 0; st_idx < num; ++st_idx)
1560 {
1561 if (dst[st_idx] == NULL)
15a7d175 1562 dst[st_idx] = src[st_idx];
0742e48e 1563 else if (src[st_idx] != NULL)
15a7d175
UD
1564 {
1565 re_node_set merged_set;
1566 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1567 &src[st_idx]->nodes);
1568 if (BE (err != REG_NOERROR, 0))
1569 return err;
1570 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1571 re_node_set_free (&merged_set);
1572 if (BE (err != REG_NOERROR, 0))
1573 return err;
1574 }
0742e48e
UD
1575 }
1576 return REG_NOERROR;
3b0bdc72
UD
1577}
1578
0742e48e
UD
1579static reg_errcode_t
1580update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes)
1581 const regex_t *preg;
1582 re_match_context_t *mctx;
1583 re_sift_context_t *sctx;
1584 int str_idx;
1585 re_node_set *dest_nodes;
3b0bdc72 1586{
0742e48e
UD
1587 reg_errcode_t err;
1588 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1589 const re_node_set *candidates;
1590 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1591 : &mctx->state_log[str_idx]->nodes);
0742e48e
UD
1592
1593 /* At first, add the nodes which can epsilon transit to a node in
1594 DEST_NODE. */
485d775d
UD
1595 if (dest_nodes->nelem)
1596 {
1597 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1598 if (BE (err != REG_NOERROR, 0))
1599 return err;
1600 }
0742e48e
UD
1601
1602 /* Then, check the limitations in the current sift_context. */
485d775d 1603 if (dest_nodes->nelem && sctx->limits.nelem)
0742e48e
UD
1604 {
1605 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
15a7d175 1606 mctx->bkref_ents, str_idx);
0742e48e 1607 if (BE (err != REG_NOERROR, 0))
15a7d175 1608 return err;
0742e48e
UD
1609 }
1610
1611 /* Update state_log. */
1612 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1613 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1614 return err;
1615
0742e48e
UD
1616 if ((mctx->state_log[str_idx] != NULL
1617 && mctx->state_log[str_idx]->has_backref))
1618 {
1619 err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes);
1620 if (BE (err != REG_NOERROR, 0))
15a7d175 1621 return err;
0742e48e
UD
1622 }
1623 return REG_NOERROR;
3b0bdc72
UD
1624}
1625
a9388965 1626static reg_errcode_t
0742e48e
UD
1627add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1628 re_dfa_t *dfa;
1629 re_node_set *dest_nodes;
1630 const re_node_set *candidates;
3b0bdc72 1631{
0742e48e
UD
1632 reg_errcode_t err;
1633 int src_idx;
1634 re_node_set src_copy;
1635
1636 err = re_node_set_init_copy (&src_copy, dest_nodes);
1637 if (BE (err != REG_NOERROR, 0))
1638 return err;
1639 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
3b0bdc72 1640 {
0742e48e 1641 err = re_node_set_add_intersect (dest_nodes, candidates,
15a7d175
UD
1642 dfa->inveclosures
1643 + src_copy.elems[src_idx]);
0742e48e 1644 if (BE (err != REG_NOERROR, 0))
15a7d175
UD
1645 {
1646 re_node_set_free (&src_copy);
1647 return err;
1648 }
0742e48e
UD
1649 }
1650 re_node_set_free (&src_copy);
1651 return REG_NOERROR;
1652}
3b0bdc72 1653
0742e48e
UD
1654static reg_errcode_t
1655sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1656 re_dfa_t *dfa;
1657 int node;
1658 re_node_set *dest_nodes;
1659 const re_node_set *candidates;
1660{
1661 int ecl_idx;
1662 reg_errcode_t err;
1663 re_node_set *inv_eclosure = dfa->inveclosures + node;
1664 re_node_set except_nodes;
1665 re_node_set_init_empty (&except_nodes);
1666 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1667 {
15a7d175
UD
1668 int cur_node = inv_eclosure->elems[ecl_idx];
1669 if (cur_node == node)
1670 continue;
1671 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1672 {
1673 int edst1 = dfa->edests[cur_node].elems[0];
1674 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1675 ? dfa->edests[cur_node].elems[1] : -1);
1676 if ((!re_node_set_contains (inv_eclosure, edst1)
1677 && re_node_set_contains (dest_nodes, edst1))
1678 || (edst2 > 0
1679 && !re_node_set_contains (inv_eclosure, edst2)
1680 && re_node_set_contains (dest_nodes, edst2)))
1681 {
1682 err = re_node_set_add_intersect (&except_nodes, candidates,
1683 dfa->inveclosures + cur_node);
1684 if (BE (err != REG_NOERROR, 0))
1685 {
1686 re_node_set_free (&except_nodes);
1687 return err;
1688 }
1689 }
1690 }
0742e48e
UD
1691 }
1692 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1693 {
15a7d175
UD
1694 int cur_node = inv_eclosure->elems[ecl_idx];
1695 if (!re_node_set_contains (&except_nodes, cur_node))
1696 {
1697 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1698 re_node_set_remove_at (dest_nodes, idx);
1699 }
0742e48e
UD
1700 }
1701 re_node_set_free (&except_nodes);
1702 return REG_NOERROR;
1703}
1704
a3022b82
UD
1705static int
1706check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx)
1707 re_dfa_t *dfa;
1708 re_node_set *limits;
1709 re_match_context_t *mctx;
1710 int dst_node, dst_idx, src_node, src_idx;
1711{
1712 int lim_idx, src_pos, dst_pos;
1713
1714 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1715 {
485d775d 1716 int subexp_idx;
a3022b82
UD
1717 struct re_backref_cache_entry *ent;
1718 ent = mctx->bkref_ents + limits->elems[lim_idx];
485d775d 1719 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
a3022b82
UD
1720
1721 dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
15a7d175
UD
1722 dfa->eclosures + dst_node,
1723 subexp_idx, dst_node, dst_idx);
a3022b82 1724 src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
15a7d175
UD
1725 dfa->eclosures + src_node,
1726 subexp_idx, src_node, src_idx);
a3022b82
UD
1727
1728 /* In case of:
15a7d175
UD
1729 <src> <dst> ( <subexp> )
1730 ( <subexp> ) <src> <dst>
1731 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
a3022b82 1732 if (src_pos == dst_pos)
15a7d175 1733 continue; /* This is unrelated limitation. */
a3022b82 1734 else
15a7d175 1735 return 1;
a3022b82
UD
1736 }
1737 return 0;
1738}
1739
1740static int
d0b96fc4 1741check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, from_node,
15a7d175 1742 str_idx)
a3022b82
UD
1743 re_dfa_t *dfa;
1744 re_match_context_t *mctx;
1745 re_node_set *eclosures;
d0b96fc4 1746 int limit, subexp_idx, from_node, str_idx;
a3022b82
UD
1747{
1748 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1cef7b3c 1749 int node_idx;
d0b96fc4
UD
1750
1751 /* If we are outside the range of the subexpression, return -1 or 1. */
1752 if (str_idx < lim->subexp_from)
1753 return -1;
1754
1755 if (lim->subexp_to < str_idx)
1756 return 1;
1757
1758 /* If we are within the subexpression, return 0. */
1759 if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
1760 return 0;
1761
1762 /* Else, we are on the boundary: examine the nodes on the epsilon
1763 closure. */
1cef7b3c
UD
1764 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1765 {
1766 int node = eclosures->elems[node_idx];
d0b96fc4
UD
1767 switch (dfa->nodes[node].type)
1768 {
1769 case OP_BACK_REF:
1cef7b3c
UD
1770 {
1771 int bi = search_cur_bkref_entry (mctx, str_idx);
1772 for (; bi < mctx->nbkref_ents; ++bi)
1773 {
1774 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
d0b96fc4 1775 int dst, cpos;
f0c7c524 1776
d0b96fc4
UD
1777 /* If this backreference goes beyond the point we're
1778 examining, don't go any further. */
1cef7b3c
UD
1779 if (ent->str_idx > str_idx)
1780 break;
d0b96fc4
UD
1781
1782 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1783 continue;
1784
1785 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1786 OP_CLOSE_SUBEXP cases below. But, if the
1787 destination node is the same node as the source
1788 node, don't recurse because it would cause an
1789 infinite loop: a regex that exhibits this behavior
1790 is ()\1*\1* */
1cef7b3c 1791 dst = dfa->edests[node].elems[0];
d0b96fc4
UD
1792 if (dst == from_node)
1793 {
1794 if (str_idx == lim->subexp_from)
1795 return -1;
1796 else /* if (str_idx == lim->subexp_to) */
1797 return 0;
1798 }
1799
1cef7b3c
UD
1800 cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
1801 dfa->eclosures + dst,
1802 subexp_idx, dst,
1803 str_idx);
d0b96fc4
UD
1804
1805 if (cpos == -1 && str_idx == lim->subexp_from)
1806 return -1;
1807
1808 if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
1809 return 0;
1cef7b3c 1810 }
15a7d175
UD
1811 break;
1812 }
f0c7c524 1813
d0b96fc4
UD
1814 case OP_OPEN_SUBEXP:
1815 if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
1816 return -1;
1817 break;
f0c7c524 1818
d0b96fc4
UD
1819 case OP_CLOSE_SUBEXP:
1820 if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
1821 return 0;
1822 break;
1823
1824 default:
15a7d175
UD
1825 break;
1826 }
a3022b82 1827 }
d0b96fc4
UD
1828
1829 if (str_idx == lim->subexp_to)
1830 return 1;
1831 else
1832 return 0;
a3022b82
UD
1833}
1834
0742e48e
UD
1835/* Check the limitations of sub expressions LIMITS, and remove the nodes
1836 which are against limitations from DEST_NODES. */
1837
1838static reg_errcode_t
1839check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1840 re_dfa_t *dfa;
1841 re_node_set *dest_nodes;
1842 const re_node_set *candidates;
1843 re_node_set *limits;
1844 struct re_backref_cache_entry *bkref_ents;
1845 int str_idx;
1846{
1847 reg_errcode_t err;
1848 int node_idx, lim_idx;
1849
1850 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1851 {
485d775d 1852 int subexp_idx;
0742e48e
UD
1853 struct re_backref_cache_entry *ent;
1854 ent = bkref_ents + limits->elems[lim_idx];
1855
1856 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
15a7d175 1857 continue; /* This is unrelated limitation. */
0742e48e 1858
485d775d 1859 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
0742e48e 1860 if (ent->subexp_to == str_idx)
15a7d175
UD
1861 {
1862 int ops_node = -1;
1863 int cls_node = -1;
1864 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1865 {
1866 int node = dest_nodes->elems[node_idx];
46bf9de7 1867 re_token_type_t type = dfa->nodes[node].type;
15a7d175
UD
1868 if (type == OP_OPEN_SUBEXP
1869 && subexp_idx == dfa->nodes[node].opr.idx)
1870 ops_node = node;
1871 else if (type == OP_CLOSE_SUBEXP
1872 && subexp_idx == dfa->nodes[node].opr.idx)
1873 cls_node = node;
1874 }
1875
1876 /* Check the limitation of the open subexpression. */
1877 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1878 if (ops_node >= 0)
1879 {
46bf9de7
UD
1880 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1881 candidates);
15a7d175
UD
1882 if (BE (err != REG_NOERROR, 0))
1883 return err;
1884 }
46bf9de7 1885
15a7d175 1886 /* Check the limitation of the close subexpression. */
46bf9de7
UD
1887 if (cls_node >= 0)
1888 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1889 {
1890 int node = dest_nodes->elems[node_idx];
1891 if (!re_node_set_contains (dfa->inveclosures + node,
1892 cls_node)
1893 && !re_node_set_contains (dfa->eclosures + node,
1894 cls_node))
1895 {
1896 /* It is against this limitation.
1897 Remove it form the current sifted state. */
1898 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1899 candidates);
1900 if (BE (err != REG_NOERROR, 0))
1901 return err;
1902 --node_idx;
1903 }
1904 }
15a7d175 1905 }
0742e48e 1906 else /* (ent->subexp_to != str_idx) */
15a7d175
UD
1907 {
1908 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1909 {
1910 int node = dest_nodes->elems[node_idx];
46bf9de7 1911 re_token_type_t type = dfa->nodes[node].type;
15a7d175
UD
1912 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1913 {
1914 if (subexp_idx != dfa->nodes[node].opr.idx)
1915 continue;
1916 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1917 || (type == OP_OPEN_SUBEXP))
1918 {
1919 /* It is against this limitation.
1920 Remove it form the current sifted state. */
46bf9de7
UD
1921 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1922 candidates);
15a7d175
UD
1923 if (BE (err != REG_NOERROR, 0))
1924 return err;
1925 }
1926 }
1927 }
1928 }
0742e48e
UD
1929 }
1930 return REG_NOERROR;
1931}
1932
0742e48e
UD
1933static reg_errcode_t
1934sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes)
1935 const regex_t *preg;
1936 re_match_context_t *mctx;
1937 re_sift_context_t *sctx;
1938 int str_idx;
1939 re_node_set *dest_nodes;
1940{
1941 reg_errcode_t err;
1942 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1943 int node_idx, node;
1944 re_sift_context_t local_sctx;
1945 const re_node_set *candidates;
1946 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1947 : &mctx->state_log[str_idx]->nodes);
0742e48e
UD
1948 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1949
1950 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
1951 {
0742e48e
UD
1952 int cur_bkref_idx = re_string_cur_idx (mctx->input);
1953 re_token_type_t type;
1954 node = candidates->elems[node_idx];
1955 type = dfa->nodes[node].type;
0742e48e 1956 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
15a7d175 1957 continue;
0742e48e
UD
1958 /* Avoid infinite loop for the REs like "()\1+". */
1959 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
15a7d175 1960 continue;
0742e48e 1961 if (type == OP_BACK_REF)
15a7d175 1962 {
6291ee3c
UD
1963 int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
1964 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
15a7d175
UD
1965 {
1966 int disabled_idx, subexp_len, to_idx, dst_node;
1967 struct re_backref_cache_entry *entry;
1968 entry = mctx->bkref_ents + enabled_idx;
6291ee3c
UD
1969 if (entry->str_idx > str_idx)
1970 break;
1971 if (entry->node != node)
1972 continue;
15a7d175
UD
1973 subexp_len = entry->subexp_to - entry->subexp_from;
1974 to_idx = str_idx + subexp_len;
1975 dst_node = (subexp_len ? dfa->nexts[node]
1976 : dfa->edests[node].elems[0]);
1977
6291ee3c
UD
1978 if (to_idx > sctx->last_str_idx
1979 || sctx->sifted_states[to_idx] == NULL
1980 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
1981 dst_node)
1982 || check_dst_limits (dfa, &sctx->limits, mctx, node,
1983 str_idx, dst_node, to_idx))
15a7d175 1984 continue;
15a7d175
UD
1985 {
1986 re_dfastate_t *cur_state;
1987 entry->flag = 0;
1988 for (disabled_idx = enabled_idx + 1;
1989 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
1990 {
1991 struct re_backref_cache_entry *entry2;
1992 entry2 = mctx->bkref_ents + disabled_idx;
6291ee3c
UD
1993 if (entry2->str_idx > str_idx)
1994 break;
1995 entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
15a7d175
UD
1996 }
1997
1998 if (local_sctx.sifted_states == NULL)
1999 {
2000 local_sctx = *sctx;
2001 err = re_node_set_init_copy (&local_sctx.limits,
2002 &sctx->limits);
2003 if (BE (err != REG_NOERROR, 0))
1b2c2628 2004 goto free_return;
15a7d175
UD
2005 }
2006 local_sctx.last_node = node;
2007 local_sctx.last_str_idx = str_idx;
2008 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2009 if (BE (err < 0, 0))
1b2c2628
UD
2010 {
2011 err = REG_ESPACE;
2012 goto free_return;
2013 }
15a7d175
UD
2014 cur_state = local_sctx.sifted_states[str_idx];
2015 err = sift_states_backward (preg, mctx, &local_sctx);
2016 if (BE (err != REG_NOERROR, 0))
1b2c2628 2017 goto free_return;
15a7d175
UD
2018 if (sctx->limited_states != NULL)
2019 {
2020 err = merge_state_array (dfa, sctx->limited_states,
2021 local_sctx.sifted_states,
2022 str_idx + 1);
2023 if (BE (err != REG_NOERROR, 0))
2024 goto free_return;
2025 }
2026 local_sctx.sifted_states[str_idx] = cur_state;
2027 re_node_set_remove (&local_sctx.limits, enabled_idx);
485d775d
UD
2028 /* We must not use the variable entry here, since
2029 mctx->bkref_ents might be realloced. */
2030 mctx->bkref_ents[enabled_idx].flag = 1;
15a7d175
UD
2031 }
2032 }
6291ee3c
UD
2033 enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2034 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
15a7d175
UD
2035 {
2036 struct re_backref_cache_entry *entry;
2037 entry = mctx->bkref_ents + enabled_idx;
6291ee3c
UD
2038 if (entry->str_idx > str_idx)
2039 break;
2040 if (entry->node == node)
15a7d175
UD
2041 entry->flag = 0;
2042 }
2043 }
0742e48e 2044 }
1b2c2628
UD
2045 err = REG_NOERROR;
2046 free_return:
0742e48e
UD
2047 if (local_sctx.sifted_states != NULL)
2048 {
0742e48e
UD
2049 re_node_set_free (&local_sctx.limits);
2050 }
2051
1b2c2628 2052 return err;
0742e48e
UD
2053}
2054
2055
2056#ifdef RE_ENABLE_I18N
2057static int
2058sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx)
2059 const regex_t *preg;
2060 const re_match_context_t *mctx;
2061 re_sift_context_t *sctx;
2062 int node_idx, str_idx, max_str_idx;
2063{
2064 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2065 int naccepted;
2066 /* Check the node can accept `multi byte'. */
2067 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
2068 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2069 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
15a7d175 2070 dfa->nexts[node_idx]))
0742e48e 2071 /* The node can't accept the `multi byte', or the
bb3f4825 2072 destination was already thrown away, then the node
0742e48e
UD
2073 could't accept the current input `multi byte'. */
2074 naccepted = 0;
2075 /* Otherwise, it is sure that the node could accept
2076 `naccepted' bytes input. */
2077 return naccepted;
2078}
2079#endif /* RE_ENABLE_I18N */
2080
3b0bdc72
UD
2081\f
2082/* Functions for state transition. */
2083
2084/* Return the next state to which the current state STATE will transit by
2085 accepting the current input byte, and update STATE_LOG if necessary.
2086 If STATE can accept a multibyte char/collating element/back reference
2087 update the destination of STATE_LOG. */
2088
2089static re_dfastate_t *
c13c99fa 2090transit_state (err, preg, mctx, state)
a9388965
UD
2091 reg_errcode_t *err;
2092 const regex_t *preg;
a9388965 2093 re_match_context_t *mctx;
612546c6 2094 re_dfastate_t *state;
3b0bdc72
UD
2095{
2096 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2097 re_dfastate_t **trtable, *next_state;
2098 unsigned char ch;
6291ee3c 2099 int cur_idx;
3b0bdc72 2100
612546c6
UD
2101 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
2102 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
15a7d175 2103 && mctx->input->valid_len < mctx->input->len))
612546c6
UD
2104 {
2105 *err = extend_buffers (mctx);
2106 if (BE (*err != REG_NOERROR, 0))
15a7d175 2107 return NULL;
612546c6
UD
2108 }
2109
a9388965 2110 *err = REG_NOERROR;
3b0bdc72
UD
2111 if (state == NULL)
2112 {
2113 next_state = state;
612546c6 2114 re_string_skip_bytes (mctx->input, 1);
3b0bdc72
UD
2115 }
2116 else
2117 {
434d3784 2118#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2119 /* If the current state can accept multibyte. */
2120 if (state->accept_mb)
15a7d175
UD
2121 {
2122 *err = transit_state_mb (preg, state, mctx);
2123 if (BE (*err != REG_NOERROR, 0))
2124 return NULL;
2125 }
434d3784 2126#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2127
2128 /* Then decide the next state with the single byte. */
2129 if (1)
15a7d175
UD
2130 {
2131 /* Use transition table */
2132 ch = re_string_fetch_byte (mctx->input);
c13c99fa 2133 trtable = state->trtable;
15a7d175
UD
2134 if (trtable == NULL)
2135 {
c13c99fa
UD
2136 trtable = build_trtable (preg, state);
2137 if (trtable == NULL)
2138 {
2139 *err = REG_ESPACE;
2140 return NULL;
2141 }
2142 }
2143 if (BE (state->word_trtable, 0))
2144 {
2145 unsigned int context;
2146 context
2147 = re_string_context_at (mctx->input,
2148 re_string_cur_idx (mctx->input) - 1,
2149 mctx->eflags, preg->newline_anchor);
2150 if (IS_WORD_CONTEXT (context))
2151 next_state = trtable[ch + SBC_MAX];
15a7d175 2152 else
c13c99fa 2153 next_state = trtable[ch];
15a7d175 2154 }
c13c99fa
UD
2155 else
2156 next_state = trtable[ch];
15a7d175 2157 }
c13c99fa 2158#if 0
3b0bdc72 2159 else
15a7d175
UD
2160 {
2161 /* don't use transition table */
c13c99fa 2162 next_state = transit_state_sb (err, preg, state, mctx);
15a7d175
UD
2163 if (BE (next_state == NULL && err != REG_NOERROR, 0))
2164 return NULL;
2165 }
c13c99fa 2166#endif
3b0bdc72
UD
2167 }
2168
6291ee3c 2169 cur_idx = re_string_cur_idx (mctx->input);
3b0bdc72 2170 /* Update the state_log if we need. */
612546c6 2171 if (mctx->state_log != NULL)
3b0bdc72 2172 {
3b0bdc72 2173 if (cur_idx > mctx->state_log_top)
15a7d175
UD
2174 {
2175 mctx->state_log[cur_idx] = next_state;
2176 mctx->state_log_top = cur_idx;
2177 }
612546c6 2178 else if (mctx->state_log[cur_idx] == 0)
15a7d175
UD
2179 {
2180 mctx->state_log[cur_idx] = next_state;
2181 }
3b0bdc72 2182 else
15a7d175
UD
2183 {
2184 re_dfastate_t *pstate;
2185 unsigned int context;
2186 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2187 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2188 the destination of a multibyte char/collating element/
2189 back reference. Then the next state is the union set of
2190 these destinations and the results of the transition table. */
2191 pstate = mctx->state_log[cur_idx];
2192 log_nodes = pstate->entrance_nodes;
2193 if (next_state != NULL)
2194 {
2195 table_nodes = next_state->entrance_nodes;
2196 *err = re_node_set_init_union (&next_nodes, table_nodes,
2197 log_nodes);
2198 if (BE (*err != REG_NOERROR, 0))
2199 return NULL;
2200 }
2201 else
2202 next_nodes = *log_nodes;
2203 /* Note: We already add the nodes of the initial state,
2204 then we don't need to add them here. */
2205
2206 context = re_string_context_at (mctx->input,
2207 re_string_cur_idx (mctx->input) - 1,
2208 mctx->eflags, preg->newline_anchor);
2209 next_state = mctx->state_log[cur_idx]
2210 = re_acquire_state_context (err, dfa, &next_nodes, context);
2211 /* We don't need to check errors here, since the return value of
2212 this function is next_state and ERR is already set. */
2213
2214 if (table_nodes != NULL)
2215 re_node_set_free (&next_nodes);
2216 }
6291ee3c
UD
2217 }
2218
bb3f4825 2219 if (BE (dfa->nbackref, 0) && next_state != NULL)
6291ee3c 2220 {
bb3f4825
UD
2221 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2222 later. We must check them here, since the back references in the
2223 next state might use them. */
6291ee3c
UD
2224 *err = check_subexp_matching_top (dfa, mctx, &next_state->nodes,
2225 cur_idx);
2226 if (BE (*err != REG_NOERROR, 0))
2227 return NULL;
6291ee3c 2228
bb3f4825
UD
2229 /* If the next state has back references. */
2230 if (next_state->has_backref)
2231 {
2232 *err = transit_state_bkref (preg, &next_state->nodes, mctx);
2233 if (BE (*err != REG_NOERROR, 0))
2234 return NULL;
2235 next_state = mctx->state_log[cur_idx];
2236 }
3b0bdc72
UD
2237 }
2238 return next_state;
2239}
2240
2241/* Helper functions for transit_state. */
2242
6291ee3c
UD
2243/* From the node set CUR_NODES, pick up the nodes whose types are
2244 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2245 expression. And register them to use them later for evaluating the
2246 correspoding back references. */
2247
2248static reg_errcode_t
2249check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx)
2250 re_dfa_t *dfa;
2251 re_match_context_t *mctx;
2252 re_node_set *cur_nodes;
2253 int str_idx;
2254{
2255 int node_idx;
2256 reg_errcode_t err;
2257
2258 /* TODO: This isn't efficient.
2259 Because there might be more than one nodes whose types are
2260 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2261 nodes.
2262 E.g. RE: (a){2} */
2263 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2264 {
2265 int node = cur_nodes->elems[node_idx];
2266 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
ce859332 2267 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
6291ee3c
UD
2268 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2269 {
2270 err = match_ctx_add_subtop (mctx, node, str_idx);
2271 if (BE (err != REG_NOERROR, 0))
2272 return err;
2273 }
2274 }
2275 return REG_NOERROR;
2276}
2277
c13c99fa 2278#if 0
3b0bdc72
UD
2279/* Return the next state to which the current state STATE will transit by
2280 accepting the current input byte. */
2281
2282static re_dfastate_t *
c13c99fa 2283transit_state_sb (err, preg, state, mctx)
a9388965
UD
2284 reg_errcode_t *err;
2285 const regex_t *preg;
2286 re_dfastate_t *state;
a9388965 2287 re_match_context_t *mctx;
3b0bdc72
UD
2288{
2289 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2290 re_node_set next_nodes;
2291 re_dfastate_t *next_state;
612546c6 2292 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
3b0bdc72
UD
2293 unsigned int context;
2294
a9388965 2295 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
bc15410e 2296 if (BE (*err != REG_NOERROR, 0))
a9388965 2297 return NULL;
3b0bdc72
UD
2298 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2299 {
2300 int cur_node = state->nodes.elems[node_cnt];
612546c6 2301 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
15a7d175
UD
2302 {
2303 *err = re_node_set_merge (&next_nodes,
2304 dfa->eclosures + dfa->nexts[cur_node]);
2305 if (BE (*err != REG_NOERROR, 0))
2306 {
2307 re_node_set_free (&next_nodes);
2308 return NULL;
2309 }
2310 }
3b0bdc72 2311 }
612546c6 2312 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
15a7d175 2313 preg->newline_anchor);
a9388965
UD
2314 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2315 /* We don't need to check errors here, since the return value of
2316 this function is next_state and ERR is already set. */
2317
3b0bdc72 2318 re_node_set_free (&next_nodes);
612546c6 2319 re_string_skip_bytes (mctx->input, 1);
3b0bdc72
UD
2320 return next_state;
2321}
c13c99fa 2322#endif
3b0bdc72 2323
434d3784 2324#ifdef RE_ENABLE_I18N
a9388965 2325static reg_errcode_t
612546c6 2326transit_state_mb (preg, pstate, mctx)
3b0bdc72 2327 const regex_t *preg;
612546c6 2328 re_dfastate_t *pstate;
3b0bdc72
UD
2329 re_match_context_t *mctx;
2330{
a9388965 2331 reg_errcode_t err;
3b0bdc72
UD
2332 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2333 int i;
2334
2335 for (i = 0; i < pstate->nodes.nelem; ++i)
2336 {
2337 re_node_set dest_nodes, *new_nodes;
2338 int cur_node_idx = pstate->nodes.elems[i];
2339 int naccepted = 0, dest_idx;
2340 unsigned int context;
2341 re_dfastate_t *dest_state;
2342
485d775d 2343 if (dfa->nodes[cur_node_idx].constraint)
15a7d175
UD
2344 {
2345 context = re_string_context_at (mctx->input,
2346 re_string_cur_idx (mctx->input),
2347 mctx->eflags, preg->newline_anchor);
2348 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2349 context))
2350 continue;
2351 }
3b0bdc72 2352
ad7f28c2 2353 /* How many bytes the node can accept? */
3b0bdc72 2354 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
15a7d175
UD
2355 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
2356 re_string_cur_idx (mctx->input));
3b0bdc72 2357 if (naccepted == 0)
15a7d175 2358 continue;
3b0bdc72
UD
2359
2360 /* The node can accepts `naccepted' bytes. */
612546c6 2361 dest_idx = re_string_cur_idx (mctx->input) + naccepted;
0742e48e 2362 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
15a7d175 2363 : mctx->max_mb_elem_len);
612546c6
UD
2364 err = clean_state_log_if_need (mctx, dest_idx);
2365 if (BE (err != REG_NOERROR, 0))
15a7d175 2366 return err;
3b0bdc72
UD
2367#ifdef DEBUG
2368 assert (dfa->nexts[cur_node_idx] != -1);
2369#endif
2370 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
15a7d175 2371 then we use pstate->nodes.elems[i] instead. */
3b0bdc72
UD
2372 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2373
612546c6 2374 dest_state = mctx->state_log[dest_idx];
3b0bdc72 2375 if (dest_state == NULL)
15a7d175 2376 dest_nodes = *new_nodes;
3b0bdc72 2377 else
15a7d175
UD
2378 {
2379 err = re_node_set_init_union (&dest_nodes,
2380 dest_state->entrance_nodes, new_nodes);
2381 if (BE (err != REG_NOERROR, 0))
2382 return err;
2383 }
612546c6 2384 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
15a7d175 2385 preg->newline_anchor);
612546c6 2386 mctx->state_log[dest_idx]
15a7d175 2387 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
3b0bdc72 2388 if (dest_state != NULL)
15a7d175 2389 re_node_set_free (&dest_nodes);
1b2c2628 2390 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
15a7d175 2391 return err;
3b0bdc72 2392 }
a9388965 2393 return REG_NOERROR;
3b0bdc72 2394}
434d3784 2395#endif /* RE_ENABLE_I18N */
3b0bdc72 2396
a9388965 2397static reg_errcode_t
6291ee3c 2398transit_state_bkref (preg, nodes, mctx)
3b0bdc72 2399 const regex_t *preg;
fe9434bb 2400 const re_node_set *nodes;
3b0bdc72
UD
2401 re_match_context_t *mctx;
2402{
a9388965 2403 reg_errcode_t err;
3b0bdc72 2404 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
0742e48e 2405 int i;
612546c6 2406 int cur_str_idx = re_string_cur_idx (mctx->input);
3b0bdc72
UD
2407
2408 for (i = 0; i < nodes->nelem; ++i)
2409 {
6291ee3c 2410 int dest_str_idx, prev_nelem, bkc_idx;
3b0bdc72
UD
2411 int node_idx = nodes->elems[i];
2412 unsigned int context;
fe9434bb 2413 const re_token_t *node = dfa->nodes + node_idx;
3b0bdc72
UD
2414 re_node_set *new_dest_nodes;
2415
2416 /* Check whether `node' is a backreference or not. */
6291ee3c 2417 if (node->type != OP_BACK_REF)
15a7d175 2418 continue;
485d775d
UD
2419
2420 if (node->constraint)
15a7d175
UD
2421 {
2422 context = re_string_context_at (mctx->input, cur_str_idx,
2423 mctx->eflags, preg->newline_anchor);
2424 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2425 continue;
2426 }
3b0bdc72
UD
2427
2428 /* `node' is a backreference.
15a7d175 2429 Check the substring which the substring matched. */
6291ee3c
UD
2430 bkc_idx = mctx->nbkref_ents;
2431 err = get_subexp (preg, mctx, node_idx, cur_str_idx);
612546c6 2432 if (BE (err != REG_NOERROR, 0))
15a7d175 2433 goto free_return;
3b0bdc72
UD
2434
2435 /* And add the epsilon closures (which is `new_dest_nodes') of
15a7d175 2436 the backreference to appropriate state_log. */
3b0bdc72
UD
2437#ifdef DEBUG
2438 assert (dfa->nexts[node_idx] != -1);
2439#endif
6291ee3c 2440 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
15a7d175
UD
2441 {
2442 int subexp_len;
2443 re_dfastate_t *dest_state;
2444 struct re_backref_cache_entry *bkref_ent;
2445 bkref_ent = mctx->bkref_ents + bkc_idx;
2446 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2447 continue;
2448 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2449 new_dest_nodes = (subexp_len == 0
2450 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2451 : dfa->eclosures + dfa->nexts[node_idx]);
2452 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2453 - bkref_ent->subexp_from);
6291ee3c
UD
2454 context = re_string_context_at (mctx->input, dest_str_idx - 1,
2455 mctx->eflags, preg->newline_anchor);
15a7d175
UD
2456 dest_state = mctx->state_log[dest_str_idx];
2457 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2458 : mctx->state_log[cur_str_idx]->nodes.nelem);
2459 /* Add `new_dest_node' to state_log. */
2460 if (dest_state == NULL)
2461 {
2462 mctx->state_log[dest_str_idx]
2463 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2464 context);
2465 if (BE (mctx->state_log[dest_str_idx] == NULL
2466 && err != REG_NOERROR, 0))
2467 goto free_return;
2468 }
2469 else
2470 {
2471 re_node_set dest_nodes;
2472 err = re_node_set_init_union (&dest_nodes,
2473 dest_state->entrance_nodes,
2474 new_dest_nodes);
2475 if (BE (err != REG_NOERROR, 0))
2476 {
2477 re_node_set_free (&dest_nodes);
2478 goto free_return;
2479 }
2480 mctx->state_log[dest_str_idx]
2481 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2482 re_node_set_free (&dest_nodes);
2483 if (BE (mctx->state_log[dest_str_idx] == NULL
2484 && err != REG_NOERROR, 0))
2485 goto free_return;
2486 }
2487 /* We need to check recursively if the backreference can epsilon
2488 transit. */
2489 if (subexp_len == 0
2490 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2491 {
6291ee3c
UD
2492 err = check_subexp_matching_top (dfa, mctx, new_dest_nodes,
2493 cur_str_idx);
2494 if (BE (err != REG_NOERROR, 0))
2495 goto free_return;
2496 err = transit_state_bkref (preg, new_dest_nodes, mctx);
15a7d175
UD
2497 if (BE (err != REG_NOERROR, 0))
2498 goto free_return;
2499 }
2500 }
3b0bdc72 2501 }
1b2c2628
UD
2502 err = REG_NOERROR;
2503 free_return:
1b2c2628 2504 return err;
3b0bdc72
UD
2505}
2506
6291ee3c
UD
2507/* Enumerate all the candidates which the backreference BKREF_NODE can match
2508 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2509 Note that we might collect inappropriate candidates here.
2510 However, the cost of checking them strictly here is too high, then we
2511 delay these checking for prune_impossible_nodes(). */
3b0bdc72 2512
6291ee3c
UD
2513static reg_errcode_t
2514get_subexp (preg, mctx, bkref_node, bkref_str_idx)
2515 const regex_t *preg;
2516 re_match_context_t *mctx;
2517 int bkref_node, bkref_str_idx;
3b0bdc72 2518{
6291ee3c 2519 int subexp_num, sub_top_idx;
3b0bdc72 2520 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
fe9434bb 2521 const char *buf = (const char *) re_string_get_buffer (mctx->input);
6291ee3c
UD
2522 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2523 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2524 for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2525 {
fe9434bb
UD
2526 const struct re_backref_cache_entry *entry
2527 = &mctx->bkref_ents[cache_idx];
6291ee3c
UD
2528 if (entry->str_idx > bkref_str_idx)
2529 break;
2530 if (entry->node == bkref_node)
2531 return REG_NOERROR; /* We already checked it. */
2532 }
2533 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2534
2535 /* For each sub expression */
2536 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2537 {
2538 reg_errcode_t err;
2539 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2540 re_sub_match_last_t *sub_last;
2541 int sub_last_idx, sl_str;
fe9434bb 2542 const char *bkref_str;
6291ee3c
UD
2543
2544 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2545 continue; /* It isn't related. */
2546
2547 sl_str = sub_top->str_idx;
2548 bkref_str = buf + bkref_str_idx;
2549 /* At first, check the last node of sub expressions we already
2550 evaluated. */
2551 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2552 {
2553 int sl_str_diff;
2554 sub_last = sub_top->lasts[sub_last_idx];
2555 sl_str_diff = sub_last->str_idx - sl_str;
2556 /* The matched string by the sub expression match with the substring
2557 at the back reference? */
2558 if (sl_str_diff > 0
2559 && memcmp (bkref_str, buf + sl_str, sl_str_diff) != 0)
2560 break; /* We don't need to search this sub expression any more. */
2561 bkref_str += sl_str_diff;
2562 sl_str += sl_str_diff;
2563 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2564 bkref_str_idx);
3ee363e2
UD
2565
2566 /* Reload buf and bkref_str, since the preceding call might
2567 have reallocated the buffer. */
fe9434bb 2568 buf = (const char *) re_string_get_buffer (mctx->input);
3ee363e2
UD
2569 bkref_str = buf + bkref_str_idx;
2570
6291ee3c
UD
2571 if (err == REG_NOMATCH)
2572 continue;
2573 if (BE (err != REG_NOERROR, 0))
2574 return err;
2575 }
2576 if (sub_last_idx < sub_top->nlasts)
2577 continue;
2578 if (sub_last_idx > 0)
2579 ++sl_str;
2580 /* Then, search for the other last nodes of the sub expression. */
2581 for (; sl_str <= bkref_str_idx; ++sl_str)
2582 {
2583 int cls_node, sl_str_off;
fe9434bb 2584 const re_node_set *nodes;
6291ee3c
UD
2585 sl_str_off = sl_str - sub_top->str_idx;
2586 /* The matched string by the sub expression match with the substring
2587 at the back reference? */
3ee363e2 2588 if (sl_str_off > 0 && *bkref_str++ != buf[sl_str - 1])
6291ee3c
UD
2589 break; /* We don't need to search this sub expression any more. */
2590 if (mctx->state_log[sl_str] == NULL)
2591 continue;
2592 /* Does this state have a ')' of the sub expression? */
2593 nodes = &mctx->state_log[sl_str]->nodes;
0ce7f49c 2594 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
6291ee3c
UD
2595 if (cls_node == -1)
2596 continue; /* No. */
2597 if (sub_top->path == NULL)
2598 {
2599 sub_top->path = calloc (sizeof (state_array_t),
2600 sl_str - sub_top->str_idx + 1);
2601 if (sub_top->path == NULL)
2602 return REG_ESPACE;
2603 }
2604 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2605 in the current context? */
2606 err = check_arrival (preg, mctx, sub_top->path, sub_top->node,
0ce7f49c 2607 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
6291ee3c
UD
2608 if (err == REG_NOMATCH)
2609 continue;
2610 if (BE (err != REG_NOERROR, 0))
2611 return err;
2612 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2613 if (BE (sub_last == NULL, 0))
2614 return REG_ESPACE;
2615 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2616 bkref_str_idx);
2617 if (err == REG_NOMATCH)
2618 continue;
2619 }
2620 }
2621 return REG_NOERROR;
2622}
2623
2624/* Helper functions for get_subexp(). */
2625
2626/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2627 If it can arrive, register the sub expression expressed with SUB_TOP
2628 and SUB_LAST. */
2629
2630static reg_errcode_t
2631get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, bkref_str)
2632 const regex_t *preg;
2633 re_match_context_t *mctx;
fe9434bb 2634 const re_sub_match_top_t *sub_top;
6291ee3c
UD
2635 re_sub_match_last_t *sub_last;
2636 int bkref_node, bkref_str;
2637{
2638 reg_errcode_t err;
2639 int to_idx;
2640 /* Can the subexpression arrive the back reference? */
2641 err = check_arrival (preg, mctx, &sub_last->path, sub_last->node,
0ce7f49c 2642 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
6291ee3c
UD
2643 if (err != REG_NOERROR)
2644 return err;
2645 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2646 sub_last->str_idx);
2647 if (BE (err != REG_NOERROR, 0))
2648 return err;
2649 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2650 clean_state_log_if_need (mctx, to_idx);
2651 return REG_NOERROR;
2652}
2653
2654/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2655 Search '(' if FL_OPEN, or search ')' otherwise.
2656 TODO: This function isn't efficient...
2657 Because there might be more than one nodes whose types are
2658 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2659 nodes.
2660 E.g. RE: (a){2} */
2661
2662static int
0ce7f49c 2663find_subexp_node (dfa, nodes, subexp_idx, type)
fe9434bb
UD
2664 const re_dfa_t *dfa;
2665 const re_node_set *nodes;
0ce7f49c 2666 int subexp_idx, type;
6291ee3c
UD
2667{
2668 int cls_idx;
2669 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2670 {
2671 int cls_node = nodes->elems[cls_idx];
fe9434bb 2672 const re_token_t *node = dfa->nodes + cls_node;
0ce7f49c 2673 if (node->type == type
6291ee3c
UD
2674 && node->opr.idx == subexp_idx)
2675 return cls_node;
2676 }
2677 return -1;
2678}
2679
2680/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2681 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2682 heavily reused.
2683 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2684
2685static reg_errcode_t
2686check_arrival (preg, mctx, path, top_node, top_str, last_node, last_str,
0ce7f49c 2687 type)
6291ee3c
UD
2688 const regex_t *preg;
2689 re_match_context_t *mctx;
2690 state_array_t *path;
0ce7f49c 2691 int top_node, top_str, last_node, last_str, type;
6291ee3c
UD
2692{
2693 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2694 reg_errcode_t err;
2695 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2696 re_dfastate_t *cur_state = NULL;
2697 re_node_set *cur_nodes, next_nodes;
2698 re_dfastate_t **backup_state_log;
2699 unsigned int context;
2700
2701 subexp_num = dfa->nodes[top_node].opr.idx;
2702 /* Extend the buffer if we need. */
951d6408 2703 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
6291ee3c
UD
2704 {
2705 re_dfastate_t **new_array;
2706 int old_alloc = path->alloc;
2707 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2708 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2709 if (new_array == NULL)
951d6408
UD
2710 {
2711 path->alloc = old_alloc;
2712 return REG_ESPACE;
2713 }
6291ee3c
UD
2714 path->array = new_array;
2715 memset (new_array + old_alloc, '\0',
2716 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2717 }
2718
2719 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2720
2721 /* Temporary modify MCTX. */
2722 backup_state_log = mctx->state_log;
2723 backup_cur_idx = mctx->input->cur_idx;
2724 mctx->state_log = path->array;
2725 mctx->input->cur_idx = str_idx;
2726
2727 /* Setup initial node set. */
2728 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2729 preg->newline_anchor);
2730 if (str_idx == top_str)
2731 {
2732 err = re_node_set_init_1 (&next_nodes, top_node);
2733 if (BE (err != REG_NOERROR, 0))
2734 return err;
0ce7f49c 2735 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
6291ee3c
UD
2736 if (BE (err != REG_NOERROR, 0))
2737 {
2738 re_node_set_free (&next_nodes);
2739 return err;
2740 }
2741 }
2742 else
2743 {
2744 cur_state = mctx->state_log[str_idx];
2745 if (cur_state && cur_state->has_backref)
2746 {
2747 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2748 if (BE ( err != REG_NOERROR, 0))
2749 return err;
2750 }
2751 else
2752 re_node_set_init_empty (&next_nodes);
2753 }
2754 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2755 {
2756 if (next_nodes.nelem)
2757 {
2758 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
0ce7f49c 2759 subexp_num, type);
6291ee3c
UD
2760 if (BE ( err != REG_NOERROR, 0))
2761 {
2762 re_node_set_free (&next_nodes);
2763 return err;
2764 }
2765 }
2766 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2767 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2768 {
2769 re_node_set_free (&next_nodes);
2770 return err;
2771 }
2772 mctx->state_log[str_idx] = cur_state;
2773 }
2774
2775 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2776 {
2777 re_node_set_empty (&next_nodes);
2778 if (mctx->state_log[str_idx + 1])
2779 {
2780 err = re_node_set_merge (&next_nodes,
2781 &mctx->state_log[str_idx + 1]->nodes);
2782 if (BE (err != REG_NOERROR, 0))
2783 {
2784 re_node_set_free (&next_nodes);
2785 return err;
2786 }
2787 }
2788 if (cur_state)
2789 {
46bf9de7
UD
2790 err = check_arrival_add_next_nodes (preg, dfa, mctx, str_idx,
2791 &cur_state->nodes, &next_nodes);
6291ee3c
UD
2792 if (BE (err != REG_NOERROR, 0))
2793 {
2794 re_node_set_free (&next_nodes);
2795 return err;
2796 }
2797 }
2798 ++str_idx;
2799 if (next_nodes.nelem)
2800 {
0ce7f49c 2801 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
6291ee3c
UD
2802 if (BE (err != REG_NOERROR, 0))
2803 {
2804 re_node_set_free (&next_nodes);
2805 return err;
2806 }
2807 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
0ce7f49c 2808 subexp_num, type);
6291ee3c
UD
2809 if (BE ( err != REG_NOERROR, 0))
2810 {
2811 re_node_set_free (&next_nodes);
2812 return err;
2813 }
2814 }
2815 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2816 preg->newline_anchor);
2817 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2818 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2819 {
2820 re_node_set_free (&next_nodes);
2821 return err;
2822 }
2823 mctx->state_log[str_idx] = cur_state;
2824 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2825 }
2826 re_node_set_free (&next_nodes);
2827 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2828 : &mctx->state_log[last_str]->nodes);
2829 path->next_idx = str_idx;
2830
2831 /* Fix MCTX. */
2832 mctx->state_log = backup_state_log;
2833 mctx->input->cur_idx = backup_cur_idx;
2834
6291ee3c 2835 /* Then check the current node set has the node LAST_NODE. */
c0d5034e
UD
2836 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2837 return REG_NOERROR;
2838
2839 return REG_NOMATCH;
6291ee3c
UD
2840}
2841
2842/* Helper functions for check_arrival. */
2843
2844/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2845 to NEXT_NODES.
2846 TODO: This function is similar to the functions transit_state*(),
2847 however this function has many additional works.
2848 Can't we unify them? */
2849
2850static reg_errcode_t
2851check_arrival_add_next_nodes (preg, dfa, mctx, str_idx, cur_nodes, next_nodes)
2852 const regex_t *preg;
2853 re_dfa_t *dfa;
2854 re_match_context_t *mctx;
2855 int str_idx;
2856 re_node_set *cur_nodes, *next_nodes;
2857{
2858 int cur_idx;
2859 reg_errcode_t err;
2860 re_node_set union_set;
2861 re_node_set_init_empty (&union_set);
2862 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2863 {
2864 int naccepted = 0;
2865 int cur_node = cur_nodes->elems[cur_idx];
2866 re_token_type_t type = dfa->nodes[cur_node].type;
46bf9de7 2867 if (IS_EPSILON_NODE (type))
6291ee3c
UD
2868 continue;
2869#ifdef RE_ENABLE_I18N
2870 /* If the node may accept `multi byte'. */
2871 if (ACCEPT_MB_NODE (type))
2872 {
2873 naccepted = check_node_accept_bytes (preg, cur_node, mctx->input,
2874 str_idx);
2875 if (naccepted > 1)
2876 {
2877 re_dfastate_t *dest_state;
2878 int next_node = dfa->nexts[cur_node];
2879 int next_idx = str_idx + naccepted;
2880 dest_state = mctx->state_log[next_idx];
2881 re_node_set_empty (&union_set);
2882 if (dest_state)
2883 {
2884 err = re_node_set_merge (&union_set, &dest_state->nodes);
2885 if (BE (err != REG_NOERROR, 0))
2886 {
2887 re_node_set_free (&union_set);
2888 return err;
2889 }
2890 err = re_node_set_insert (&union_set, next_node);
2891 if (BE (err < 0, 0))
2892 {
2893 re_node_set_free (&union_set);
2894 return REG_ESPACE;
2895 }
2896 }
2897 else
2898 {
2899 err = re_node_set_insert (&union_set, next_node);
2900 if (BE (err < 0, 0))
2901 {
2902 re_node_set_free (&union_set);
2903 return REG_ESPACE;
2904 }
2905 }
2906 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2907 &union_set);
2908 if (BE (mctx->state_log[next_idx] == NULL
2909 && err != REG_NOERROR, 0))
2910 {
2911 re_node_set_free (&union_set);
2912 return err;
2913 }
2914 }
2915 }
2916#endif /* RE_ENABLE_I18N */
2917 if (naccepted
2918 || check_node_accept (preg, dfa->nodes + cur_node, mctx,
2919 str_idx))
2920 {
2921 err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
2922 if (BE (err < 0, 0))
2923 {
2924 re_node_set_free (&union_set);
2925 return REG_ESPACE;
2926 }
2927 }
2928 }
2929 re_node_set_free (&union_set);
2930 return REG_NOERROR;
2931}
2932
2933/* For all the nodes in CUR_NODES, add the epsilon closures of them to
2934 CUR_NODES, however exclude the nodes which are:
2935 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
2936 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
2937*/
2938
2939static reg_errcode_t
0ce7f49c 2940check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
6291ee3c
UD
2941 re_dfa_t *dfa;
2942 re_node_set *cur_nodes;
0ce7f49c 2943 int ex_subexp, type;
6291ee3c
UD
2944{
2945 reg_errcode_t err;
2946 int idx, outside_node;
2947 re_node_set new_nodes;
2948#ifdef DEBUG
2949 assert (cur_nodes->nelem);
2950#endif
2951 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
2952 if (BE (err != REG_NOERROR, 0))
2953 return err;
2954 /* Create a new node set NEW_NODES with the nodes which are epsilon
2955 closures of the node in CUR_NODES. */
2956
2957 for (idx = 0; idx < cur_nodes->nelem; ++idx)
2958 {
2959 int cur_node = cur_nodes->elems[idx];
2960 re_node_set *eclosure = dfa->eclosures + cur_node;
0ce7f49c 2961 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
6291ee3c
UD
2962 if (outside_node == -1)
2963 {
2964 /* There are no problematic nodes, just merge them. */
2965 err = re_node_set_merge (&new_nodes, eclosure);
2966 if (BE (err != REG_NOERROR, 0))
2967 {
2968 re_node_set_free (&new_nodes);
2969 return err;
2970 }
2971 }
2972 else
2973 {
2974 /* There are problematic nodes, re-calculate incrementally. */
2975 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
0ce7f49c 2976 ex_subexp, type);
6291ee3c
UD
2977 if (BE (err != REG_NOERROR, 0))
2978 {
2979 re_node_set_free (&new_nodes);
2980 return err;
2981 }
2982 }
2983 }
2984 re_node_set_free (cur_nodes);
2985 *cur_nodes = new_nodes;
2986 return REG_NOERROR;
2987}
2988
2989/* Helper function for check_arrival_expand_ecl.
2990 Check incrementally the epsilon closure of TARGET, and if it isn't
2991 problematic append it to DST_NODES. */
2992
2993static reg_errcode_t
0ce7f49c 2994check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
6291ee3c 2995 re_dfa_t *dfa;
0ce7f49c 2996 int target, ex_subexp, type;
6291ee3c
UD
2997 re_node_set *dst_nodes;
2998{
0ce7f49c 2999 int cur_node;
6291ee3c
UD
3000 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3001 {
3002 int err;
6291ee3c 3003
0ce7f49c 3004 if (dfa->nodes[cur_node].type == type
6291ee3c
UD
3005 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3006 {
0ce7f49c 3007 if (type == OP_CLOSE_SUBEXP)
6291ee3c
UD
3008 {
3009 err = re_node_set_insert (dst_nodes, cur_node);
3010 if (BE (err == -1, 0))
3011 return REG_ESPACE;
3012 }
3013 break;
3014 }
3015 err = re_node_set_insert (dst_nodes, cur_node);
3016 if (BE (err == -1, 0))
3017 return REG_ESPACE;
3018 if (dfa->edests[cur_node].nelem == 0)
3019 break;
3020 if (dfa->edests[cur_node].nelem == 2)
3021 {
3022 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3023 dfa->edests[cur_node].elems[1],
0ce7f49c 3024 ex_subexp, type);
6291ee3c
UD
3025 if (BE (err != REG_NOERROR, 0))
3026 return err;
3027 }
3028 cur_node = dfa->edests[cur_node].elems[0];
3029 }
3030 return REG_NOERROR;
3031}
3032
3033
3034/* For all the back references in the current state, calculate the
3035 destination of the back references by the appropriate entry
3036 in MCTX->BKREF_ENTS. */
3037
3038static reg_errcode_t
3039expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num,
0ce7f49c 3040 type)
6291ee3c
UD
3041 const regex_t *preg;
3042 re_match_context_t *mctx;
0ce7f49c 3043 int cur_str, last_str, subexp_num, type;
6291ee3c
UD
3044 re_node_set *cur_nodes;
3045{
3046 reg_errcode_t err;
3047 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3048 int cache_idx, cache_idx_start;
3049 /* The current state. */
3050
3051 cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3052 for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3053 {
3054 int to_idx, next_node;
3055 struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3056 if (ent->str_idx > cur_str)
3057 break;
3058 /* Is this entry ENT is appropriate? */
3059 if (!re_node_set_contains (cur_nodes, ent->node))
3060 continue; /* No. */
3061
3062 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3063 /* Calculate the destination of the back reference, and append it
3064 to MCTX->STATE_LOG. */
3065 if (to_idx == cur_str)
3066 {
3067 /* The backreference did epsilon transit, we must re-check all the
3068 node in the current state. */
3069 re_node_set new_dests;
3070 reg_errcode_t err2, err3;
3071 next_node = dfa->edests[ent->node].elems[0];
3072 if (re_node_set_contains (cur_nodes, next_node))
3073 continue;
3074 err = re_node_set_init_1 (&new_dests, next_node);
0ce7f49c 3075 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
6291ee3c
UD
3076 err3 = re_node_set_merge (cur_nodes, &new_dests);
3077 re_node_set_free (&new_dests);
3078 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3079 || err3 != REG_NOERROR, 0))
3080 {
3081 err = (err != REG_NOERROR ? err
3082 : (err2 != REG_NOERROR ? err2 : err3));
3083 return err;
3084 }
3085 /* TODO: It is still inefficient... */
3086 cache_idx = cache_idx_start - 1;
3087 continue;
3088 }
3089 else
3090 {
3091 re_node_set union_set;
3092 next_node = dfa->nexts[ent->node];
3093 if (mctx->state_log[to_idx])
3094 {
3095 int ret;
3096 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3097 next_node))
3098 continue;
3099 err = re_node_set_init_copy (&union_set,
3100 &mctx->state_log[to_idx]->nodes);
3101 ret = re_node_set_insert (&union_set, next_node);
3102 if (BE (err != REG_NOERROR || ret < 0, 0))
3103 {
3104 re_node_set_free (&union_set);
3105 err = err != REG_NOERROR ? err : REG_ESPACE;
3106 return err;
3107 }
3108 }
3109 else
3110 {
3111 err = re_node_set_init_1 (&union_set, next_node);
3112 if (BE (err != REG_NOERROR, 0))
3113 return err;
3114 }
3115 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3116 re_node_set_free (&union_set);
3117 if (BE (mctx->state_log[to_idx] == NULL
3118 && err != REG_NOERROR, 0))
3119 return err;
3120 }
3121 }
3122 return REG_NOERROR;
3123}
3124
3125/* Build transition table for the state.
3126 Return the new table if succeeded, otherwise return NULL. */
3127
3128static re_dfastate_t **
c13c99fa 3129build_trtable (preg, state)
6291ee3c 3130 const regex_t *preg;
c13c99fa 3131 re_dfastate_t *state;
6291ee3c
UD
3132{
3133 reg_errcode_t err;
3134 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3ce12656
UD
3135 int i, j, ch;
3136 unsigned int elem, mask;
6291ee3c
UD
3137 int dests_node_malloced = 0, dest_states_malloced = 0;
3138 int ndests; /* Number of the destination states from `state'. */
3139 re_dfastate_t **trtable;
3140 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3141 re_node_set follows, *dests_node;
3142 bitset *dests_ch;
3143 bitset acceptable;
3b0bdc72
UD
3144
3145 /* We build DFA states which corresponds to the destination nodes
3146 from `state'. `dests_node[i]' represents the nodes which i-th
3147 destination state contains, and `dests_ch[i]' represents the
3148 characters which i-th destination state accepts. */
05dab910
RM
3149#ifdef _LIBC
3150 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3151 dests_node = (re_node_set *)
3152 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3153 else
3154#endif
3155 {
3156 dests_node = (re_node_set *)
3157 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3158 if (BE (dests_node == NULL, 0))
3159 return NULL;
3160 dests_node_malloced = 1;
3161 }
3162 dests_ch = (bitset *) (dests_node + SBC_MAX);
3b0bdc72
UD
3163
3164 /* Initialize transiton table. */
c13c99fa 3165 state->word_trtable = 0;
3b0bdc72
UD
3166
3167 /* At first, group all nodes belonging to `state' into several
3168 destinations. */
3169 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
bc15410e 3170 if (BE (ndests <= 0, 0))
3b0bdc72 3171 {
05dab910
RM
3172 if (dests_node_malloced)
3173 free (dests_node);
a9388965 3174 /* Return NULL in case of an error, trtable otherwise. */
05dab910 3175 if (ndests == 0)
c13c99fa 3176 {
3ce12656
UD
3177 state->trtable = (re_dfastate_t **)
3178 calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3179 return state->trtable;
c13c99fa 3180 }
05dab910 3181 return NULL;
3b0bdc72
UD
3182 }
3183
a9388965 3184 err = re_node_set_alloc (&follows, ndests + 1);
05dab910
RM
3185 if (BE (err != REG_NOERROR, 0))
3186 goto out_free;
3187
3188#ifdef _LIBC
3189 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3190 + ndests * 3 * sizeof (re_dfastate_t *)))
3191 dest_states = (re_dfastate_t **)
3192 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3193 else
3194#endif
3195 {
3196 dest_states = (re_dfastate_t **)
3197 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3198 if (BE (dest_states == NULL, 0))
3199 {
3200out_free:
3201 if (dest_states_malloced)
3202 free (dest_states);
3203 re_node_set_free (&follows);
3204 for (i = 0; i < ndests; ++i)
3205 re_node_set_free (dests_node + i);
05dab910
RM
3206 if (dests_node_malloced)
3207 free (dests_node);
3208 return NULL;
3209 }
3210 dest_states_malloced = 1;
3211 }
3212 dest_states_word = dest_states + ndests;
3213 dest_states_nl = dest_states_word + ndests;
3214 bitset_empty (acceptable);
a9388965 3215
3b0bdc72
UD
3216 /* Then build the states for all destinations. */
3217 for (i = 0; i < ndests; ++i)
3218 {
3219 int next_node;
3220 re_node_set_empty (&follows);
3221 /* Merge the follows of this destination states. */
3222 for (j = 0; j < dests_node[i].nelem; ++j)
15a7d175
UD
3223 {
3224 next_node = dfa->nexts[dests_node[i].elems[j]];
3225 if (next_node != -1)
3226 {
3227 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3228 if (BE (err != REG_NOERROR, 0))
05dab910 3229 goto out_free;
15a7d175
UD
3230 }
3231 }
a9388965 3232 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
bc15410e 3233 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
15a7d175 3234 goto out_free;
3b0bdc72 3235 /* If the new state has context constraint,
15a7d175 3236 build appropriate states for these contexts. */
3b0bdc72 3237 if (dest_states[i]->has_constraint)
15a7d175
UD
3238 {
3239 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3240 CONTEXT_WORD);
3241 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3242 goto out_free;
3ce12656
UD
3243
3244 if (dest_states[i] != dest_states_word[i]
3245 && dfa->mb_cur_max > 1)
3246 state->word_trtable = 1;
3247
15a7d175
UD
3248 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3249 CONTEXT_NEWLINE);
3250 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3251 goto out_free;
3ce12656 3252 }
3b0bdc72 3253 else
15a7d175
UD
3254 {
3255 dest_states_word[i] = dest_states[i];
3256 dest_states_nl[i] = dest_states[i];
3257 }
3b0bdc72
UD
3258 bitset_merge (acceptable, dests_ch[i]);
3259 }
3260
3ce12656
UD
3261 if (!BE (state->word_trtable, 0))
3262 {
3263 /* We don't care about whether the following character is a word
3264 character, or we are in a single-byte character set so we can
3265 discern by looking at the character code: allocate a
3266 256-entry transition table. */
3267 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3268 if (BE (trtable == NULL, 0))
3269 goto out_free;
3270
3271 /* For all characters ch...: */
3272 for (i = 0; i < BITSET_UINTS; ++i)
3273 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3274 elem;
3275 mask <<= 1, elem >>= 1, ++ch)
3276 if (BE (elem & 1, 0))
3277 {
3278 /* There must be exactly one destination which accepts
3279 character ch. See group_nodes_into_DFAstates. */
3280 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3281 ;
3282
3283 /* j-th destination accepts the word character ch. */
3284 if (IS_WORD_CHAR (ch))
3285 trtable[ch] = dest_states_word[j];
3286 else
3287 trtable[ch] = dest_states[j];
3288 }
3289 }
3290 else
3291 {
3292 /* We care about whether the following character is a word
3293 character, and we are in a multi-byte character set: discern
3294 by looking at the character code: build two 256-entry
3295 transition tables, one starting at trtable[0] and one
3296 starting at trtable[SBC_MAX]. */
3297 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3298 2 * SBC_MAX);
3299 if (BE (trtable == NULL, 0))
3300 goto out_free;
3301
3302 /* For all characters ch...: */
3303 for (i = 0; i < BITSET_UINTS; ++i)
3304 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3305 elem;
3306 mask <<= 1, elem >>= 1, ++ch)
3307 if (BE (elem & 1, 0))
3308 {
3309 /* There must be exactly one destination which accepts
3310 character ch. See group_nodes_into_DFAstates. */
3311 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3312 ;
3313
3314 /* j-th destination accepts the word character ch. */
3315 trtable[ch] = dest_states[j];
3316 trtable[ch + SBC_MAX] = dest_states_word[j];
3317 }
3318 }
3319
3b0bdc72 3320 /* new line */
c202c2c5
UD
3321 if (bitset_contain (acceptable, NEWLINE_CHAR))
3322 {
3323 /* The current state accepts newline character. */
3ce12656
UD
3324 for (j = 0; j < ndests; ++j)
3325 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
15a7d175
UD
3326 {
3327 /* k-th destination accepts newline character. */
3ce12656 3328 trtable[NEWLINE_CHAR] = dest_states_nl[j];
c13c99fa 3329 if (state->word_trtable)
3ce12656 3330 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
15a7d175
UD
3331 /* There must be only one destination which accepts
3332 newline. See group_nodes_into_DFAstates. */
3333 break;
3334 }
c202c2c5 3335 }
3b0bdc72 3336
05dab910
RM
3337 if (dest_states_malloced)
3338 free (dest_states);
3b0bdc72
UD
3339
3340 re_node_set_free (&follows);
3341 for (i = 0; i < ndests; ++i)
3342 re_node_set_free (dests_node + i);
3343
05dab910
RM
3344 if (dests_node_malloced)
3345 free (dests_node);
3b0bdc72 3346
c13c99fa 3347 state->trtable = trtable;
3b0bdc72
UD
3348 return trtable;
3349}
3350
3351/* Group all nodes belonging to STATE into several destinations.
3352 Then for all destinations, set the nodes belonging to the destination
3353 to DESTS_NODE[i] and set the characters accepted by the destination
3354 to DEST_CH[i]. This function return the number of destinations. */
3355
3356static int
3357group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
3358 const regex_t *preg;
3359 const re_dfastate_t *state;
3360 re_node_set *dests_node;
3361 bitset *dests_ch;
3362{
a9388965 3363 reg_errcode_t err;
3b0bdc72
UD
3364 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3365 int i, j, k;
3366 int ndests; /* Number of the destinations from `state'. */
3367 bitset accepts; /* Characters a node can accept. */
3368 const re_node_set *cur_nodes = &state->nodes;
3369 bitset_empty (accepts);
3370 ndests = 0;
3371
3372 /* For all the nodes belonging to `state', */
3373 for (i = 0; i < cur_nodes->nelem; ++i)
3374 {
3b0bdc72
UD
3375 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3376 re_token_type_t type = node->type;
485d775d 3377 unsigned int constraint = node->constraint;
3b0bdc72
UD
3378
3379 /* Enumerate all single byte character this node can accept. */
3380 if (type == CHARACTER)
15a7d175 3381 bitset_set (accepts, node->opr.c);
3b0bdc72 3382 else if (type == SIMPLE_BRACKET)
15a7d175
UD
3383 {
3384 bitset_merge (accepts, node->opr.sbcset);
3385 }
3b0bdc72 3386 else if (type == OP_PERIOD)
15a7d175 3387 {
65e6becf
UD
3388#ifdef RE_ENABLE_I18N
3389 if (dfa->mb_cur_max > 1)
3390 bitset_merge (accepts, dfa->sb_char);
3391 else
0ce7f49c 3392#endif
65e6becf 3393 bitset_set_all (accepts);
15a7d175
UD
3394 if (!(preg->syntax & RE_DOT_NEWLINE))
3395 bitset_clear (accepts, '\n');
3396 if (preg->syntax & RE_DOT_NOT_NULL)
3397 bitset_clear (accepts, '\0');
3398 }
c0d5034e 3399#ifdef RE_ENABLE_I18N
ad7f28c2
UD
3400 else if (type == OP_UTF8_PERIOD)
3401 {
3402 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3403 if (!(preg->syntax & RE_DOT_NEWLINE))
3404 bitset_clear (accepts, '\n');
3405 if (preg->syntax & RE_DOT_NOT_NULL)
3406 bitset_clear (accepts, '\0');
3407 }
c0d5034e 3408#endif
3b0bdc72 3409 else
15a7d175 3410 continue;
3b0bdc72
UD
3411
3412 /* Check the `accepts' and sift the characters which are not
15a7d175 3413 match it the context. */
3b0bdc72 3414 if (constraint)
15a7d175 3415 {
15a7d175
UD
3416 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3417 {
3418 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3419 bitset_empty (accepts);
3420 if (accepts_newline)
3421 bitset_set (accepts, NEWLINE_CHAR);
3422 else
3423 continue;
3424 }
66b110e8
UD
3425 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3426 {
3427 bitset_empty (accepts);
3428 continue;
3429 }
c13c99fa 3430
66b110e8 3431 if (constraint & NEXT_WORD_CONSTRAINT)
65e6becf 3432 {
457beec8 3433 unsigned int any_set = 0;
1cef7b3c
UD
3434 if (type == CHARACTER && !node->word_char)
3435 {
3436 bitset_empty (accepts);
3437 continue;
3438 }
65e6becf
UD
3439#ifdef RE_ENABLE_I18N
3440 if (dfa->mb_cur_max > 1)
3441 for (j = 0; j < BITSET_UINTS; ++j)
457beec8 3442 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
65e6becf
UD
3443 else
3444#endif
3445 for (j = 0; j < BITSET_UINTS; ++j)
457beec8
UD
3446 any_set |= (accepts[j] &= dfa->word_char[j]);
3447 if (!any_set)
3448 continue;
65e6becf 3449 }
66b110e8 3450 if (constraint & NEXT_NOTWORD_CONSTRAINT)
65e6becf 3451 {
457beec8 3452 unsigned int any_set = 0;
1cef7b3c
UD
3453 if (type == CHARACTER && node->word_char)
3454 {
3455 bitset_empty (accepts);
3456 continue;
3457 }
65e6becf
UD
3458#ifdef RE_ENABLE_I18N
3459 if (dfa->mb_cur_max > 1)
3460 for (j = 0; j < BITSET_UINTS; ++j)
457beec8 3461 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
65e6becf
UD
3462 else
3463#endif
3464 for (j = 0; j < BITSET_UINTS; ++j)
457beec8
UD
3465 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3466 if (!any_set)
3467 continue;
65e6becf 3468 }
15a7d175 3469 }
3b0bdc72
UD
3470
3471 /* Then divide `accepts' into DFA states, or create a new
457beec8 3472 state. Above, we make sure that accepts is not empty. */
3b0bdc72 3473 for (j = 0; j < ndests; ++j)
15a7d175
UD
3474 {
3475 bitset intersec; /* Intersection sets, see below. */
3476 bitset remains;
3477 /* Flags, see below. */
3478 int has_intersec, not_subset, not_consumed;
3479
3480 /* Optimization, skip if this state doesn't accept the character. */
3481 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3482 continue;
3483
3484 /* Enumerate the intersection set of this state and `accepts'. */
3485 has_intersec = 0;
3486 for (k = 0; k < BITSET_UINTS; ++k)
3487 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3488 /* And skip if the intersection set is empty. */
3489 if (!has_intersec)
3490 continue;
3491
3492 /* Then check if this state is a subset of `accepts'. */
3493 not_subset = not_consumed = 0;
3494 for (k = 0; k < BITSET_UINTS; ++k)
3495 {
3496 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3497 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3498 }
3499
3500 /* If this state isn't a subset of `accepts', create a
3501 new group state, which has the `remains'. */
3502 if (not_subset)
3503 {
3504 bitset_copy (dests_ch[ndests], remains);
3505 bitset_copy (dests_ch[j], intersec);
3506 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3507 if (BE (err != REG_NOERROR, 0))
3508 goto error_return;
3509 ++ndests;
3510 }
3511
3512 /* Put the position in the current group. */
3513 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3514 if (BE (err < 0, 0))
3515 goto error_return;
3516
3517 /* If all characters are consumed, go to next node. */
3518 if (!not_consumed)
3519 break;
3520 }
3b0bdc72
UD
3521 /* Some characters remain, create a new group. */
3522 if (j == ndests)
15a7d175
UD
3523 {
3524 bitset_copy (dests_ch[ndests], accepts);
3525 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3526 if (BE (err != REG_NOERROR, 0))
3527 goto error_return;
3528 ++ndests;
3529 bitset_empty (accepts);
3530 }
3b0bdc72
UD
3531 }
3532 return ndests;
1b2c2628
UD
3533 error_return:
3534 for (j = 0; j < ndests; ++j)
3535 re_node_set_free (dests_node + j);
3536 return -1;
3b0bdc72
UD
3537}
3538
434d3784
UD
3539#ifdef RE_ENABLE_I18N
3540/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3541 Return the number of the bytes the node accepts.
3542 STR_IDX is the current index of the input string.
3543
3544 This function handles the nodes which can accept one character, or
3545 one collating element like '.', '[a-z]', opposite to the other nodes
3546 can only accept one byte. */
3b0bdc72
UD
3547
3548static int
3549check_node_accept_bytes (preg, node_idx, input, str_idx)
3550 const regex_t *preg;
3551 int node_idx, str_idx;
3552 const re_string_t *input;
3553{
3554 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3555 const re_token_t *node = dfa->nodes + node_idx;
ad7f28c2 3556 int char_len, elem_len;
434d3784 3557 int i;
ad7f28c2
UD
3558
3559 if (BE (node->type == OP_UTF8_PERIOD, 0))
3560 {
3561 unsigned char c = re_string_byte_at (input, str_idx), d;
3562 if (BE (c < 0xc2, 1))
3563 return 0;
3564
3565 if (str_idx + 2 > input->len)
3566 return 0;
3567
3568 d = re_string_byte_at (input, str_idx + 1);
3569 if (c < 0xe0)
3570 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3571 else if (c < 0xf0)
3572 {
3573 char_len = 3;
3574 if (c == 0xe0 && d < 0xa0)
3575 return 0;
3576 }
3577 else if (c < 0xf8)
3578 {
3579 char_len = 4;
3580 if (c == 0xf0 && d < 0x90)
3581 return 0;
3582 }
3583 else if (c < 0xfc)
3584 {
3585 char_len = 5;
3586 if (c == 0xf8 && d < 0x88)
3587 return 0;
3588 }
3589 else if (c < 0xfe)
3590 {
3591 char_len = 6;
3592 if (c == 0xfc && d < 0x84)
3593 return 0;
3594 }
3595 else
3596 return 0;
3597
3598 if (str_idx + char_len > input->len)
3599 return 0;
3600
3601 for (i = 1; i < char_len; ++i)
3602 {
3603 d = re_string_byte_at (input, str_idx + i);
3604 if (d < 0x80 || d > 0xbf)
3605 return 0;
3606 }
3607 return char_len;
3608 }
3609
3610 char_len = re_string_char_size_at (input, str_idx);
3b0bdc72
UD
3611 if (node->type == OP_PERIOD)
3612 {
ad7f28c2
UD
3613 if (char_len <= 1)
3614 return 0;
3615 /* FIXME: I don't think this if is needed, as both '\n'
3616 and '\0' are char_len == 1. */
434d3784 3617 /* '.' accepts any one character except the following two cases. */
3b0bdc72 3618 if ((!(preg->syntax & RE_DOT_NEWLINE) &&
15a7d175
UD
3619 re_string_byte_at (input, str_idx) == '\n') ||
3620 ((preg->syntax & RE_DOT_NOT_NULL) &&
3621 re_string_byte_at (input, str_idx) == '\0'))
3622 return 0;
3b0bdc72
UD
3623 return char_len;
3624 }
ad7f28c2
UD
3625
3626 elem_len = re_string_elem_size_at (input, str_idx);
3627 if (elem_len <= 1 && char_len <= 1)
3628 return 0;
3629
3630 if (node->type == COMPLEX_BRACKET)
3b0bdc72
UD
3631 {
3632 const re_charset_t *cset = node->opr.mbcset;
434d3784 3633# ifdef _LIBC
85c54a32
UD
3634 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3635 + str_idx);
5f93cd52
UD
3636 int j;
3637 uint32_t nrules;
434d3784
UD
3638# endif /* _LIBC */
3639 int match_len = 0;
3640 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
15a7d175 3641 ? re_string_wchar_at (input, str_idx) : 0);
434d3784
UD
3642
3643 /* match with multibyte character? */
3644 for (i = 0; i < cset->nmbchars; ++i)
15a7d175
UD
3645 if (wc == cset->mbchars[i])
3646 {
3647 match_len = char_len;
3648 goto check_node_accept_bytes_match;
3649 }
434d3784
UD
3650 /* match with character_class? */
3651 for (i = 0; i < cset->nchar_classes; ++i)
15a7d175
UD
3652 {
3653 wctype_t wt = cset->char_classes[i];
3654 if (__iswctype (wc, wt))
3655 {
3656 match_len = char_len;
3657 goto check_node_accept_bytes_match;
3658 }
3659 }
434d3784
UD
3660
3661# ifdef _LIBC
5f93cd52 3662 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3b0bdc72 3663 if (nrules != 0)
15a7d175
UD
3664 {
3665 unsigned int in_collseq = 0;
3666 const int32_t *table, *indirect;
3667 const unsigned char *weights, *extra;
3668 const char *collseqwc;
3669 int32_t idx;
3670 /* This #include defines a local function! */
434d3784 3671# include <locale/weight.h>
3b0bdc72 3672
15a7d175
UD
3673 /* match with collating_symbol? */
3674 if (cset->ncoll_syms)
3675 extra = (const unsigned char *)
3676 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3677 for (i = 0; i < cset->ncoll_syms; ++i)
3678 {
3679 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3680 /* Compare the length of input collating element and
3681 the length of current collating element. */
3682 if (*coll_sym != elem_len)
3683 continue;
3684 /* Compare each bytes. */
3685 for (j = 0; j < *coll_sym; j++)
3686 if (pin[j] != coll_sym[1 + j])
3687 break;
3688 if (j == *coll_sym)
3689 {
3690 /* Match if every bytes is equal. */
3691 match_len = j;
3692 goto check_node_accept_bytes_match;
3693 }
3694 }
3695
3696 if (cset->nranges)
3697 {
3698 if (elem_len <= char_len)
3699 {
3700 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
25337753 3701 in_collseq = __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
3702 }
3703 else
3704 in_collseq = find_collation_sequence_value (pin, elem_len);
3705 }
3706 /* match with range expression? */
3707 for (i = 0; i < cset->nranges; ++i)
3708 if (cset->range_starts[i] <= in_collseq
3709 && in_collseq <= cset->range_ends[i])
3710 {
3711 match_len = elem_len;
3712 goto check_node_accept_bytes_match;
3713 }
3714
3715 /* match with equivalence_class? */
3716 if (cset->nequiv_classes)
3717 {
3718 const unsigned char *cp = pin;
3719 table = (const int32_t *)
3720 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3721 weights = (const unsigned char *)
3722 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3723 extra = (const unsigned char *)
3724 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3725 indirect = (const int32_t *)
3726 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3727 idx = findidx (&cp);
3728 if (idx > 0)
3729 for (i = 0; i < cset->nequiv_classes; ++i)
3730 {
3731 int32_t equiv_class_idx = cset->equiv_classes[i];
3732 size_t weight_len = weights[idx];
3733 if (weight_len == weights[equiv_class_idx])
3734 {
3735 int cnt = 0;
3736 while (cnt <= weight_len
3737 && (weights[equiv_class_idx + 1 + cnt]
3738 == weights[idx + 1 + cnt]))
3739 ++cnt;
3740 if (cnt > weight_len)
3741 {
3742 match_len = elem_len;
3743 goto check_node_accept_bytes_match;
3744 }
3745 }
3746 }
3747 }
3748 }
434d3784
UD
3749 else
3750# endif /* _LIBC */
15a7d175
UD
3751 {
3752 /* match with range expression? */
92b27c74 3753#if __GNUC__ >= 2
15a7d175 3754 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
92b27c74 3755#else
15a7d175
UD
3756 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3757 cmp_buf[2] = wc;
92b27c74 3758#endif
15a7d175
UD
3759 for (i = 0; i < cset->nranges; ++i)
3760 {
3761 cmp_buf[0] = cset->range_starts[i];
3762 cmp_buf[4] = cset->range_ends[i];
3763 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3764 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3765 {
3766 match_len = char_len;
3767 goto check_node_accept_bytes_match;
3768 }
3769 }
3770 }
434d3784
UD
3771 check_node_accept_bytes_match:
3772 if (!cset->non_match)
15a7d175 3773 return match_len;
434d3784 3774 else
15a7d175
UD
3775 {
3776 if (match_len > 0)
3777 return 0;
3778 else
3779 return (elem_len > char_len) ? elem_len : char_len;
3780 }
3b0bdc72
UD
3781 }
3782 return 0;
3783}
3784
434d3784 3785# ifdef _LIBC
3b0bdc72
UD
3786static unsigned int
3787find_collation_sequence_value (mbs, mbs_len)
c202c2c5 3788 const unsigned char *mbs;
3b0bdc72
UD
3789 size_t mbs_len;
3790{
3791 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3792 if (nrules == 0)
3793 {
3794 if (mbs_len == 1)
15a7d175
UD
3795 {
3796 /* No valid character. Match it as a single byte character. */
3797 const unsigned char *collseq = (const unsigned char *)
3798 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3799 return collseq[mbs[0]];
3800 }
3b0bdc72
UD
3801 return UINT_MAX;
3802 }
3803 else
3804 {
3805 int32_t idx;
3806 const unsigned char *extra = (const unsigned char *)
15a7d175 3807 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
3808
3809 for (idx = 0; ;)
15a7d175
UD
3810 {
3811 int mbs_cnt, found = 0;
3812 int32_t elem_mbs_len;
3813 /* Skip the name of collating element name. */
3814 idx = idx + extra[idx] + 1;
3815 elem_mbs_len = extra[idx++];
3816 if (mbs_len == elem_mbs_len)
3817 {
3818 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3819 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3820 break;
3821 if (mbs_cnt == elem_mbs_len)
3822 /* Found the entry. */
3823 found = 1;
3824 }
3825 /* Skip the byte sequence of the collating element. */
3826 idx += elem_mbs_len;
3827 /* Adjust for the alignment. */
3828 idx = (idx + 3) & ~3;
3829 /* Skip the collation sequence value. */
3830 idx += sizeof (uint32_t);
3831 /* Skip the wide char sequence of the collating element. */
3832 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3833 /* If we found the entry, return the sequence value. */
3834 if (found)
3835 return *(uint32_t *) (extra + idx);
3836 /* Skip the collation sequence value. */
3837 idx += sizeof (uint32_t);
3838 }
3b0bdc72
UD
3839 }
3840}
434d3784
UD
3841# endif /* _LIBC */
3842#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3843
3844/* Check whether the node accepts the byte which is IDX-th
3845 byte of the INPUT. */
3846
3847static int
612546c6 3848check_node_accept (preg, node, mctx, idx)
3b0bdc72
UD
3849 const regex_t *preg;
3850 const re_token_t *node;
612546c6
UD
3851 const re_match_context_t *mctx;
3852 int idx;
3b0bdc72 3853{
3b0bdc72 3854 unsigned char ch;
485d775d 3855 if (node->constraint)
3b0bdc72
UD
3856 {
3857 /* The node has constraints. Check whether the current context
15a7d175 3858 satisfies the constraints. */
612546c6 3859 unsigned int context = re_string_context_at (mctx->input, idx,
15a7d175
UD
3860 mctx->eflags,
3861 preg->newline_anchor);
3b0bdc72 3862 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
15a7d175 3863 return 0;
3b0bdc72 3864 }
612546c6 3865 ch = re_string_byte_at (mctx->input, idx);
ad7f28c2
UD
3866 switch (node->type)
3867 {
3868 case CHARACTER:
3869 return node->opr.c == ch;
3870 case SIMPLE_BRACKET:
3871 return bitset_contain (node->opr.sbcset, ch);
c0d5034e 3872#ifdef RE_ENABLE_I18N
ad7f28c2
UD
3873 case OP_UTF8_PERIOD:
3874 if (ch >= 0x80)
3875 return 0;
3876 /* FALLTHROUGH */
c0d5034e 3877#endif
ad7f28c2
UD
3878 case OP_PERIOD:
3879 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
3880 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
3881 default:
3882 return 0;
3883 }
3b0bdc72 3884}
612546c6
UD
3885
3886/* Extend the buffers, if the buffers have run out. */
3887
3888static reg_errcode_t
3889extend_buffers (mctx)
3890 re_match_context_t *mctx;
3891{
3892 reg_errcode_t ret;
3893 re_string_t *pstr = mctx->input;
3894
3895 /* Double the lengthes of the buffers. */
3896 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3897 if (BE (ret != REG_NOERROR, 0))
3898 return ret;
3899
3900 if (mctx->state_log != NULL)
3901 {
3902 /* And double the length of state_log. */
951d6408
UD
3903 /* XXX We have no indication of the size of this buffer. If this
3904 allocation fail we have no indication that the state_log array
3905 does not have the right size. */
3906 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3907 pstr->bufs_len + 1);
1b2c2628 3908 if (BE (new_array == NULL, 0))
15a7d175 3909 return REG_ESPACE;
1b2c2628 3910 mctx->state_log = new_array;
612546c6
UD
3911 }
3912
3913 /* Then reconstruct the buffers. */
3914 if (pstr->icase)
3915 {
3916#ifdef RE_ENABLE_I18N
3c0fb574 3917 if (pstr->mb_cur_max > 1)
bb3f4825
UD
3918 {
3919 ret = build_wcs_upper_buffer (pstr);
3920 if (BE (ret != REG_NOERROR, 0))
3921 return ret;
3922 }
612546c6
UD
3923 else
3924#endif /* RE_ENABLE_I18N */
15a7d175 3925 build_upper_buffer (pstr);
612546c6
UD
3926 }
3927 else
3928 {
3929#ifdef RE_ENABLE_I18N
3c0fb574 3930 if (pstr->mb_cur_max > 1)
15a7d175 3931 build_wcs_buffer (pstr);
612546c6
UD
3932 else
3933#endif /* RE_ENABLE_I18N */
15a7d175
UD
3934 {
3935 if (pstr->trans != NULL)
3936 re_string_translate_buffer (pstr);
15a7d175 3937 }
612546c6
UD
3938 }
3939 return REG_NOERROR;
3940}
3941
3b0bdc72
UD
3942\f
3943/* Functions for matching context. */
3944
6291ee3c
UD
3945/* Initialize MCTX. */
3946
a9388965 3947static reg_errcode_t
612546c6 3948match_ctx_init (mctx, eflags, input, n)
3b0bdc72 3949 re_match_context_t *mctx;
612546c6
UD
3950 int eflags, n;
3951 re_string_t *input;
3b0bdc72
UD
3952{
3953 mctx->eflags = eflags;
612546c6
UD
3954 mctx->input = input;
3955 mctx->match_last = -1;
3b0bdc72 3956 if (n > 0)
a9388965
UD
3957 {
3958 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
6291ee3c
UD
3959 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
3960 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
15a7d175 3961 return REG_ESPACE;
a9388965 3962 }
3b0bdc72
UD
3963 else
3964 mctx->bkref_ents = NULL;
3965 mctx->nbkref_ents = 0;
3966 mctx->abkref_ents = n;
6291ee3c
UD
3967 mctx->max_mb_elem_len = 1;
3968 mctx->nsub_tops = 0;
3969 mctx->asub_tops = n;
a9388965 3970 return REG_NOERROR;
3b0bdc72
UD
3971}
3972
6291ee3c
UD
3973/* Clean the entries which depend on the current input in MCTX.
3974 This function must be invoked when the matcher changes the start index
3975 of the input, or changes the input string. */
3976
3977static void
3978match_ctx_clean (mctx)
3979 re_match_context_t *mctx;
3980{
3981 match_ctx_free_subtops (mctx);
3982 mctx->nsub_tops = 0;
3983 mctx->nbkref_ents = 0;
3984}
3985
3986/* Free all the memory associated with MCTX. */
3987
3b0bdc72
UD
3988static void
3989match_ctx_free (mctx)
3990 re_match_context_t *mctx;
3991{
6291ee3c
UD
3992 match_ctx_free_subtops (mctx);
3993 re_free (mctx->sub_tops);
3b0bdc72
UD
3994 re_free (mctx->bkref_ents);
3995}
3996
6291ee3c
UD
3997/* Free all the memory associated with MCTX->SUB_TOPS. */
3998
3999static void
4000match_ctx_free_subtops (mctx)
4001 re_match_context_t *mctx;
4002{
4003 int st_idx;
4004 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4005 {
4006 int sl_idx;
4007 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4008 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4009 {
4010 re_sub_match_last_t *last = top->lasts[sl_idx];
4011 re_free (last->path.array);
4012 re_free (last);
4013 }
4014 re_free (top->lasts);
4015 if (top->path)
4016 {
4017 re_free (top->path->array);
4018 re_free (top->path);
4019 }
4020 free (top);
4021 }
4022}
4023
4024/* Add a new backreference entry to MCTX.
4025 Note that we assume that caller never call this function with duplicate
4026 entry, and call with STR_IDX which isn't smaller than any existing entry.
4027*/
3b0bdc72 4028
a9388965 4029static reg_errcode_t
0742e48e 4030match_ctx_add_entry (mctx, node, str_idx, from, to)
6291ee3c
UD
4031 re_match_context_t *mctx;
4032 int node, str_idx, from, to;
3b0bdc72
UD
4033{
4034 if (mctx->nbkref_ents >= mctx->abkref_ents)
4035 {
1b2c2628
UD
4036 struct re_backref_cache_entry* new_entry;
4037 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
15a7d175 4038 mctx->abkref_ents * 2);
1b2c2628 4039 if (BE (new_entry == NULL, 0))
15a7d175
UD
4040 {
4041 re_free (mctx->bkref_ents);
4042 return REG_ESPACE;
4043 }
1b2c2628 4044 mctx->bkref_ents = new_entry;
3b0bdc72 4045 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
15a7d175 4046 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3b0bdc72
UD
4047 mctx->abkref_ents *= 2;
4048 }
4049 mctx->bkref_ents[mctx->nbkref_ents].node = node;
0742e48e
UD
4050 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4051 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4052 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4053 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
4054 if (mctx->max_mb_elem_len < to - from)
4055 mctx->max_mb_elem_len = to - from;
a9388965 4056 return REG_NOERROR;
3b0bdc72 4057}
0742e48e 4058
6291ee3c
UD
4059/* Search for the first entry which has the same str_idx.
4060 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4061
4062static int
4063search_cur_bkref_entry (mctx, str_idx)
4064 re_match_context_t *mctx;
4065 int str_idx;
4066{
4067 int left, right, mid;
4068 right = mctx->nbkref_ents;
4069 for (left = 0; left < right;)
4070 {
4071 mid = (left + right) / 2;
4072 if (mctx->bkref_ents[mid].str_idx < str_idx)
4073 left = mid + 1;
4074 else
4075 right = mid;
4076 }
4077 return left;
4078}
4079
0742e48e
UD
4080static void
4081match_ctx_clear_flag (mctx)
4082 re_match_context_t *mctx;
4083{
4084 int i;
4085 for (i = 0; i < mctx->nbkref_ents; ++i)
951d6408 4086 mctx->bkref_ents[i].flag = 0;
0742e48e
UD
4087}
4088
6291ee3c
UD
4089/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4090 at STR_IDX. */
4091
4092static reg_errcode_t
4093match_ctx_add_subtop (mctx, node, str_idx)
4094 re_match_context_t *mctx;
4095 int node, str_idx;
4096{
4097#ifdef DEBUG
4098 assert (mctx->sub_tops != NULL);
4099 assert (mctx->asub_tops > 0);
4100#endif
951d6408 4101 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
6291ee3c 4102 {
951d6408
UD
4103 int new_asub_tops = mctx->asub_tops * 2;
4104 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4105 re_sub_match_top_t *,
4106 new_asub_tops);
6291ee3c
UD
4107 if (BE (new_array == NULL, 0))
4108 return REG_ESPACE;
4109 mctx->sub_tops = new_array;
951d6408 4110 mctx->asub_tops = new_asub_tops;
6291ee3c
UD
4111 }
4112 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
951d6408 4113 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
6291ee3c
UD
4114 return REG_ESPACE;
4115 mctx->sub_tops[mctx->nsub_tops]->node = node;
4116 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4117 return REG_NOERROR;
4118}
4119
4120/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4121 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4122
4123static re_sub_match_last_t *
4124match_ctx_add_sublast (subtop, node, str_idx)
4125 re_sub_match_top_t *subtop;
4126 int node, str_idx;
4127{
4128 re_sub_match_last_t *new_entry;
951d6408 4129 if (BE (subtop->nlasts == subtop->alasts, 0))
6291ee3c 4130 {
951d6408
UD
4131 int new_alasts = 2 * subtop->alasts + 1;
4132 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4133 re_sub_match_last_t *,
4134 new_alasts);
6291ee3c
UD
4135 if (BE (new_array == NULL, 0))
4136 return NULL;
4137 subtop->lasts = new_array;
951d6408 4138 subtop->alasts = new_alasts;
6291ee3c
UD
4139 }
4140 new_entry = calloc (1, sizeof (re_sub_match_last_t));
951d6408
UD
4141 if (BE (new_entry != NULL, 1))
4142 {
4143 subtop->lasts[subtop->nlasts] = new_entry;
4144 new_entry->node = node;
4145 new_entry->str_idx = str_idx;
4146 ++subtop->nlasts;
4147 }
6291ee3c
UD
4148 return new_entry;
4149}
4150
0742e48e
UD
4151static void
4152sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
15a7d175 4153 check_subexp)
0742e48e
UD
4154 re_sift_context_t *sctx;
4155 re_dfastate_t **sifted_sts, **limited_sts;
4156 int last_node, last_str_idx, check_subexp;
4157{
4158 sctx->sifted_states = sifted_sts;
4159 sctx->limited_states = limited_sts;
4160 sctx->last_node = last_node;
4161 sctx->last_str_idx = last_str_idx;
4162 sctx->check_subexp = check_subexp;
485d775d
UD
4163 sctx->cur_bkref = -1;
4164 sctx->cls_subexp_idx = -1;
0742e48e
UD
4165 re_node_set_init_empty (&sctx->limits);
4166}