]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regexec.c
Update.
[thirdparty/glibc.git] / posix / regexec.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
56b168be 2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
a9388965 21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
56b168be 22 int n) internal_function;
3ce12656
UD
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
e3a87852
UD
25static void match_ctx_free_subtops (re_match_context_t *mctx)
26 internal_function;
a9388965 27static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
e3a87852
UD
28 int str_idx, int from, int to)
29 internal_function;
30static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
31 internal_function;
3ce12656 32static void match_ctx_clear_flag (re_match_context_t *mctx) internal_function;
6291ee3c 33static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
3ce12656 34 int str_idx) internal_function;
6291ee3c 35static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
e3a87852
UD
36 int node, int str_idx)
37 internal_function;
0742e48e 38static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
15a7d175 39 re_dfastate_t **limited_sts, int last_node,
e3a87852
UD
40 int last_str_idx, int check_subexp)
41 internal_function;
a9388965 42static reg_errcode_t re_search_internal (const regex_t *preg,
15a7d175
UD
43 const char *string, int length,
44 int start, int range, int stop,
45 size_t nmatch, regmatch_t pmatch[],
3ce12656 46 int eflags) internal_function;
ac3d553b 47static int re_search_2_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
48 const char *string1, int length1,
49 const char *string2, int length2,
50 int start, int range, struct re_registers *regs,
3ce12656 51 int stop, int ret_len) internal_function;
ac3d553b 52static int re_search_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
53 const char *string, int length, int start,
54 int range, int stop, struct re_registers *regs,
3ce12656 55 int ret_len) internal_function;
ac3d553b 56static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
3ce12656 57 int nregs, int regs_allocated) internal_function;
bb3f4825 58static inline re_dfastate_t *acquire_init_state_context
e3a87852
UD
59 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
60 __attribute ((always_inline)) internal_function;
61static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
62 internal_function;
15bad1a5
UD
63static int check_matching (re_match_context_t *mctx, int fl_longest_match,
64 int *p_match_first)
e3a87852 65 internal_function;
3b0bdc72 66static int check_halt_node_context (const re_dfa_t *dfa, int node,
3ce12656 67 unsigned int context) internal_function;
e3a87852
UD
68static int check_halt_state_context (const re_match_context_t *mctx,
69 const re_dfastate_t *state, int idx)
70 internal_function;
6b6557e8
UD
71static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
72 regmatch_t *prev_idx_match, int cur_node,
3ce12656 73 int cur_idx, int nmatch) internal_function;
e3a87852
UD
74static int proceed_next_node (const re_match_context_t *mctx,
75 int nregs, regmatch_t *regs,
15a7d175 76 int *pidx, int node, re_node_set *eps_via_nodes,
3ce12656 77 struct re_fail_stack_t *fs) internal_function;
1b2c2628 78static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
15a7d175
UD
79 int str_idx, int *dests, int nregs,
80 regmatch_t *regs,
3ce12656 81 re_node_set *eps_via_nodes) internal_function;
a3022b82 82static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
3ce12656 83 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
612546c6 84static reg_errcode_t set_regs (const regex_t *preg,
15a7d175
UD
85 const re_match_context_t *mctx,
86 size_t nmatch, regmatch_t *pmatch,
3ce12656
UD
87 int fl_backtrack) internal_function;
88static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
485d775d 89
434d3784 90#ifdef RE_ENABLE_I18N
e3a87852 91static int sift_states_iter_mb (const re_match_context_t *mctx,
15a7d175 92 re_sift_context_t *sctx,
3ce12656 93 int node_idx, int str_idx, int max_str_idx) internal_function;
434d3784 94#endif /* RE_ENABLE_I18N */
e3a87852 95static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
3ce12656 96 re_sift_context_t *sctx) internal_function;
e3a87852 97static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
15a7d175
UD
98 re_sift_context_t *sctx,
99 int str_idx,
3ce12656 100 re_node_set *dest_nodes) internal_function;
0742e48e 101static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
15a7d175 102 re_node_set *dest_nodes,
3ce12656 103 const re_node_set *candidates) internal_function;
0742e48e 104static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
15a7d175 105 re_node_set *dest_nodes,
3ce12656 106 const re_node_set *and_nodes) internal_function;
e3a87852
UD
107static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110static int check_dst_limits_calc_pos (re_match_context_t *mctx,
15a7d175 111 int limit, re_node_set *eclosures,
3ce12656 112 int subexp_idx, int node, int str_idx) internal_function;
0742e48e 113static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
15a7d175
UD
114 re_node_set *dest_nodes,
115 const re_node_set *candidates,
116 re_node_set *limits,
117 struct re_backref_cache_entry *bkref_ents,
3ce12656 118 int str_idx) internal_function;
e3a87852 119static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
15a7d175 120 re_sift_context_t *sctx,
3ce12656 121 int str_idx, re_node_set *dest_nodes) internal_function;
7c1be3ec
UD
122static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
123 int next_state_log_idx) internal_function;
0742e48e 124static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
3ce12656 125 re_dfastate_t **src, int num) internal_function;
4c595adb
UD
126static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
e3a87852 128static re_dfastate_t *transit_state (reg_errcode_t *err,
15a7d175 129 re_match_context_t *mctx,
3ce12656 130 re_dfastate_t *state) internal_function;
4c595adb
UD
131static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state) internal_function;
e3a87852 134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
6291ee3c 135 re_node_set *cur_nodes,
3ce12656 136 int str_idx) internal_function;
c13c99fa 137#if 0
e3a87852
UD
138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate) internal_function;
c13c99fa 141#endif
434d3784 142#ifdef RE_ENABLE_I18N
e3a87852
UD
143static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
434d3784 145#endif /* RE_ENABLE_I18N */
e3a87852
UD
146static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
147 const re_node_set *nodes) internal_function;
148static reg_errcode_t get_subexp (re_match_context_t *mctx,
3ce12656 149 int bkref_node, int bkref_str_idx) internal_function;
e3a87852 150static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
fe9434bb 151 const re_sub_match_top_t *sub_top,
6291ee3c 152 re_sub_match_last_t *sub_last,
3ce12656 153 int bkref_node, int bkref_str) internal_function;
fe9434bb 154static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
3ce12656 155 int subexp_idx, int type) internal_function;
e3a87852 156static reg_errcode_t check_arrival (re_match_context_t *mctx,
6291ee3c
UD
157 state_array_t *path, int top_node,
158 int top_str, int last_node, int last_str,
3ce12656 159 int type) internal_function;
e3a87852 160static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
6291ee3c
UD
161 int str_idx,
162 re_node_set *cur_nodes,
3ce12656 163 re_node_set *next_nodes) internal_function;
6291ee3c
UD
164static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
165 re_node_set *cur_nodes,
3ce12656 166 int ex_subexp, int type) internal_function;
6291ee3c
UD
167static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
168 re_node_set *dst_nodes,
169 int target, int ex_subexp,
3ce12656 170 int type) internal_function;
e3a87852 171static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
6291ee3c
UD
172 re_node_set *cur_nodes, int cur_str,
173 int last_str, int subexp_num,
3ce12656 174 int type) internal_function;
56b168be 175static re_dfastate_t **build_trtable (re_dfa_t *dfa,
3ce12656 176 re_dfastate_t *state) internal_function;
434d3784 177#ifdef RE_ENABLE_I18N
56b168be 178static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3ce12656 179 const re_string_t *input, int idx) internal_function;
434d3784 180# ifdef _LIBC
c202c2c5 181static unsigned int find_collation_sequence_value (const unsigned char *mbs,
3ce12656 182 size_t name_len) internal_function;
434d3784
UD
183# endif /* _LIBC */
184#endif /* RE_ENABLE_I18N */
56b168be 185static int group_nodes_into_DFAstates (re_dfa_t *dfa,
15a7d175
UD
186 const re_dfastate_t *state,
187 re_node_set *states_node,
3ce12656 188 bitset *states_ch) internal_function;
e3a87852
UD
189static int check_node_accept (const re_match_context_t *mctx,
190 const re_token_t *node, int idx) internal_function;
3ce12656 191static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
3b0bdc72
UD
192\f
193/* Entry point for POSIX code. */
194
195/* regexec searches for a given pattern, specified by PREG, in the
196 string STRING.
197
198 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
199 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
200 least NMATCH elements, and we set them to the offsets of the
201 corresponding matched substrings.
202
203 EFLAGS specifies `execution flags' which affect matching: if
204 REG_NOTBOL is set, then ^ does not match at the beginning of the
205 string; if REG_NOTEOL is set, then $ does not match at the end.
206
207 We return 0 if we find a match and REG_NOMATCH if not. */
208
209int
210regexec (preg, string, nmatch, pmatch, eflags)
92b27c74
UD
211 const regex_t *__restrict preg;
212 const char *__restrict string;
3b0bdc72
UD
213 size_t nmatch;
214 regmatch_t pmatch[];
215 int eflags;
216{
a9388965 217 reg_errcode_t err;
6fefb4e0
UD
218 int start, length;
219 if (eflags & REG_STARTEND)
220 {
221 start = pmatch[0].rm_so;
222 length = pmatch[0].rm_eo;
223 }
224 else
225 {
226 start = 0;
227 length = strlen (string);
228 }
3b0bdc72 229 if (preg->no_sub)
6fefb4e0
UD
230 err = re_search_internal (preg, string, length, start, length - start,
231 length, 0, NULL, eflags);
3b0bdc72 232 else
6fefb4e0
UD
233 err = re_search_internal (preg, string, length, start, length - start,
234 length, nmatch, pmatch, eflags);
a9388965 235 return err != REG_NOERROR;
3b0bdc72
UD
236}
237#ifdef _LIBC
238weak_alias (__regexec, regexec)
239#endif
240
241/* Entry points for GNU code. */
242
ac3d553b
UD
243/* re_match, re_search, re_match_2, re_search_2
244
245 The former two functions operate on STRING with length LENGTH,
246 while the later two operate on concatenation of STRING1 and STRING2
247 with lengths LENGTH1 and LENGTH2, respectively.
248
249 re_match() matches the compiled pattern in BUFP against the string,
250 starting at index START.
251
252 re_search() first tries matching at index START, then it tries to match
253 starting from index START + 1, and so on. The last start position tried
254 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
255 way as re_match().)
256
257 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
258 the first STOP characters of the concatenation of the strings should be
259 concerned.
260
261 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
262 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
263 computed relative to the concatenation, not relative to the individual
264 strings.)
265
266 On success, re_match* functions return the length of the match, re_search*
267 return the position of the start of the match. Return value -1 means no
268 match was found and -2 indicates an internal error. */
3b0bdc72
UD
269
270int
ac3d553b
UD
271re_match (bufp, string, length, start, regs)
272 struct re_pattern_buffer *bufp;
3b0bdc72
UD
273 const char *string;
274 int length, start;
275 struct re_registers *regs;
276{
ac3d553b 277 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
3b0bdc72
UD
278}
279#ifdef _LIBC
280weak_alias (__re_match, re_match)
281#endif
282
ac3d553b
UD
283int
284re_search (bufp, string, length, start, range, regs)
285 struct re_pattern_buffer *bufp;
286 const char *string;
287 int length, start, range;
288 struct re_registers *regs;
289{
290 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
291}
292#ifdef _LIBC
293weak_alias (__re_search, re_search)
294#endif
3b0bdc72
UD
295
296int
ac3d553b
UD
297re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
298 struct re_pattern_buffer *bufp;
299 const char *string1, *string2;
300 int length1, length2, start, stop;
301 struct re_registers *regs;
3b0bdc72 302{
ac3d553b 303 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 304 start, 0, regs, stop, 1);
3b0bdc72
UD
305}
306#ifdef _LIBC
307weak_alias (__re_match_2, re_match_2)
308#endif
309
3b0bdc72 310int
ac3d553b
UD
311re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
312 struct re_pattern_buffer *bufp;
313 const char *string1, *string2;
314 int length1, length2, start, range, stop;
315 struct re_registers *regs;
316{
317 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 318 start, range, regs, stop, 0);
ac3d553b
UD
319}
320#ifdef _LIBC
321weak_alias (__re_search_2, re_search_2)
322#endif
323
324static int
325re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
15a7d175 326 stop, ret_len)
ac3d553b
UD
327 struct re_pattern_buffer *bufp;
328 const char *string1, *string2;
329 int length1, length2, start, range, stop, ret_len;
330 struct re_registers *regs;
331{
332 const char *str;
333 int rval;
334 int len = length1 + length2;
335 int free_str = 0;
336
337 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
338 return -2;
339
340 /* Concatenate the strings. */
341 if (length2 > 0)
342 if (length1 > 0)
343 {
15a7d175
UD
344 char *s = re_malloc (char, len);
345
346 if (BE (s == NULL, 0))
347 return -2;
348 memcpy (s, string1, length1);
349 memcpy (s + length1, string2, length2);
350 str = s;
351 free_str = 1;
ac3d553b
UD
352 }
353 else
354 str = string2;
355 else
356 str = string1;
357
358 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
15a7d175 359 ret_len);
ac3d553b 360 if (free_str)
1b2c2628 361 re_free ((char *) str);
ac3d553b
UD
362 return rval;
363}
364
365/* The parameters have the same meaning as those of re_search.
366 Additional parameters:
367 If RET_LEN is nonzero the length of the match is returned (re_match style);
368 otherwise the position of the match is returned. */
369
370static int
371re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
372 struct re_pattern_buffer *bufp;
373 const char *string;
374 int length, start, range, stop, ret_len;
375 struct re_registers *regs;
3b0bdc72 376{
a9388965 377 reg_errcode_t result;
3b0bdc72 378 regmatch_t *pmatch;
ac3d553b
UD
379 int nregs, rval;
380 int eflags = 0;
381
382 /* Check for out-of-range. */
a877402c 383 if (BE (start < 0 || start > length, 0))
ac3d553b
UD
384 return -1;
385 if (BE (start + range > length, 0))
386 range = length - start;
a877402c
UD
387 else if (BE (start + range < 0, 0))
388 range = -start;
3b0bdc72
UD
389
390 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
391 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
392
ac3d553b
UD
393 /* Compile fastmap if we haven't yet. */
394 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
395 re_compile_fastmap (bufp);
396
397 if (BE (bufp->no_sub, 0))
398 regs = NULL;
3b0bdc72
UD
399
400 /* We need at least 1 register. */
ac3d553b
UD
401 if (regs == NULL)
402 nregs = 1;
403 else if (BE (bufp->regs_allocated == REGS_FIXED &&
15a7d175 404 regs->num_regs < bufp->re_nsub + 1, 0))
ac3d553b
UD
405 {
406 nregs = regs->num_regs;
407 if (BE (nregs < 1, 0))
15a7d175
UD
408 {
409 /* Nothing can be copied to regs. */
410 regs = NULL;
411 nregs = 1;
412 }
ac3d553b
UD
413 }
414 else
415 nregs = bufp->re_nsub + 1;
3b0bdc72 416 pmatch = re_malloc (regmatch_t, nregs);
bc15410e
UD
417 if (BE (pmatch == NULL, 0))
418 return -2;
3b0bdc72 419
ac3d553b 420 result = re_search_internal (bufp, string, length, start, range, stop,
15a7d175 421 nregs, pmatch, eflags);
3b0bdc72 422
ac3d553b
UD
423 rval = 0;
424
425 /* I hope we needn't fill ther regs with -1's when no match was found. */
426 if (result != REG_NOERROR)
427 rval = -1;
428 else if (regs != NULL)
3b0bdc72 429 {
ac3d553b
UD
430 /* If caller wants register contents data back, copy them. */
431 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
15a7d175 432 bufp->regs_allocated);
ac3d553b 433 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
15a7d175 434 rval = -2;
3b0bdc72
UD
435 }
436
ac3d553b 437 if (BE (rval == 0, 1))
3b0bdc72 438 {
ac3d553b 439 if (ret_len)
15a7d175
UD
440 {
441 assert (pmatch[0].rm_so == start);
442 rval = pmatch[0].rm_eo - start;
443 }
ac3d553b 444 else
15a7d175 445 rval = pmatch[0].rm_so;
3b0bdc72 446 }
3b0bdc72
UD
447 re_free (pmatch);
448 return rval;
449}
3b0bdc72 450
ac3d553b
UD
451static unsigned
452re_copy_regs (regs, pmatch, nregs, regs_allocated)
3b0bdc72 453 struct re_registers *regs;
ac3d553b
UD
454 regmatch_t *pmatch;
455 int nregs, regs_allocated;
3b0bdc72 456{
ac3d553b
UD
457 int rval = REGS_REALLOCATE;
458 int i;
459 int need_regs = nregs + 1;
460 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
461 uses. */
462
463 /* Have the register data arrays been allocated? */
464 if (regs_allocated == REGS_UNALLOCATED)
44777771 465 { /* No. So allocate them with malloc. */
a96c63ed
UD
466 regs->start = re_malloc (regoff_t, need_regs);
467 regs->end = re_malloc (regoff_t, need_regs);
468 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
15a7d175 469 return REGS_UNALLOCATED;
ac3d553b
UD
470 regs->num_regs = need_regs;
471 }
472 else if (regs_allocated == REGS_REALLOCATE)
473 { /* Yes. If we need more elements than were already
15a7d175
UD
474 allocated, reallocate them. If we need fewer, just
475 leave it alone. */
951d6408 476 if (BE (need_regs > regs->num_regs, 0))
15a7d175 477 {
44777771
UD
478 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
479 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
480 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
a96c63ed 481 return REGS_UNALLOCATED;
44777771
UD
482 regs->start = new_start;
483 regs->end = new_end;
15a7d175
UD
484 regs->num_regs = need_regs;
485 }
ac3d553b
UD
486 }
487 else
488 {
489 assert (regs_allocated == REGS_FIXED);
490 /* This function may not be called with REGS_FIXED and nregs too big. */
491 assert (regs->num_regs >= nregs);
492 rval = REGS_FIXED;
493 }
494
495 /* Copy the regs. */
496 for (i = 0; i < nregs; ++i)
497 {
498 regs->start[i] = pmatch[i].rm_so;
499 regs->end[i] = pmatch[i].rm_eo;
500 }
501 for ( ; i < regs->num_regs; ++i)
502 regs->start[i] = regs->end[i] = -1;
503
504 return rval;
3b0bdc72 505}
3b0bdc72
UD
506
507/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
508 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
509 this memory for recording register information. STARTS and ENDS
510 must be allocated using the malloc library routine, and must each
511 be at least NUM_REGS * sizeof (regoff_t) bytes long.
512
513 If NUM_REGS == 0, then subsequent matches should allocate their own
514 register data.
515
516 Unless this function is called, the first search or match using
517 PATTERN_BUFFER will allocate its own register data, without
518 freeing the old data. */
519
520void
521re_set_registers (bufp, regs, num_regs, starts, ends)
522 struct re_pattern_buffer *bufp;
523 struct re_registers *regs;
524 unsigned num_regs;
525 regoff_t *starts, *ends;
526{
527 if (num_regs)
528 {
529 bufp->regs_allocated = REGS_REALLOCATE;
530 regs->num_regs = num_regs;
531 regs->start = starts;
532 regs->end = ends;
533 }
534 else
535 {
536 bufp->regs_allocated = REGS_UNALLOCATED;
537 regs->num_regs = 0;
538 regs->start = regs->end = (regoff_t *) 0;
539 }
540}
541#ifdef _LIBC
542weak_alias (__re_set_registers, re_set_registers)
543#endif
544\f
545/* Entry points compatible with 4.2 BSD regex library. We don't define
546 them unless specifically requested. */
547
548#if defined _REGEX_RE_COMP || defined _LIBC
549int
550# ifdef _LIBC
551weak_function
552# endif
553re_exec (s)
554 const char *s;
555{
556 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
557}
558#endif /* _REGEX_RE_COMP */
559\f
560static re_node_set empty_set;
561
562/* Internal entry point. */
563
564/* Searches for a compiled pattern PREG in the string STRING, whose
565 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
566 mingings with regexec. START, and RANGE have the same meanings
567 with re_search.
a9388965
UD
568 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
569 otherwise return the error code.
3b0bdc72
UD
570 Note: We assume front end functions already check ranges.
571 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
572
a9388965 573static reg_errcode_t
ac3d553b 574re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
15a7d175 575 eflags)
3b0bdc72
UD
576 const regex_t *preg;
577 const char *string;
ac3d553b 578 int length, start, range, stop, eflags;
3b0bdc72
UD
579 size_t nmatch;
580 regmatch_t pmatch[];
581{
a9388965 582 reg_errcode_t err;
3b0bdc72 583 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
612546c6 584 int left_lim, right_lim, incr;
3b0bdc72 585 int fl_longest_match, match_first, match_last = -1;
1b2c2628 586 int fast_translate, sb;
e3a87852
UD
587#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
588 re_match_context_t mctx = { .dfa = dfa };
589#else
3b0bdc72 590 re_match_context_t mctx;
e3a87852 591#endif
1b2c2628
UD
592 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
593 && range && !preg->can_be_null) ? preg->fastmap : NULL);
3b0bdc72 594
e3a87852
UD
595#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
596 memset (&mctx, '\0', sizeof (re_match_context_t));
597 mctx.dfa = dfa;
598#endif
599
3b0bdc72 600 /* Check if the DFA haven't been compiled. */
bc15410e 601 if (BE (preg->used == 0 || dfa->init_state == NULL
15a7d175
UD
602 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
603 || dfa->init_state_begbuf == NULL, 0))
a9388965 604 return REG_NOMATCH;
3b0bdc72 605
c70f81dd
UD
606#ifdef DEBUG
607 /* We assume front-end functions already check them. */
608 assert (start + range >= 0 && start + range <= length);
609#endif
610
611 /* If initial states with non-begbuf contexts have no elements,
612 the regex must be anchored. If preg->newline_anchor is set,
613 we'll never use init_state_nl, so do not check it. */
614 if (dfa->init_state->nodes.nelem == 0
615 && dfa->init_state_word->nodes.nelem == 0
616 && (dfa->init_state_nl->nodes.nelem == 0
617 || !preg->newline_anchor))
618 {
619 if (start != 0 && start + range != 0)
620 return REG_NOMATCH;
621 start = range = 0;
622 }
623
3b0bdc72
UD
624 re_node_set_init_empty (&empty_set);
625
626 /* We must check the longest matching, if nmatch > 0. */
6291ee3c 627 fl_longest_match = (nmatch != 0 || dfa->nbackref);
3b0bdc72 628
56b168be 629 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
f0c7c524 630 preg->translate, preg->syntax & RE_ICASE, dfa);
612546c6 631 if (BE (err != REG_NOERROR, 0))
1b2c2628 632 goto free_return;
56b168be
UD
633 mctx.input.stop = stop;
634 mctx.input.raw_stop = stop;
635 mctx.input.newline_anchor = preg->newline_anchor;
612546c6 636
56b168be 637 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
612546c6 638 if (BE (err != REG_NOERROR, 0))
1b2c2628 639 goto free_return;
612546c6 640
3b0bdc72
UD
641 /* We will log all the DFA states through which the dfa pass,
642 if nmatch > 1, or this dfa has "multibyte node", which is a
643 back-reference or a node which can accept multibyte character or
644 multi character collating element. */
645 if (nmatch > 1 || dfa->has_mb_node)
a9388965 646 {
56b168be 647 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
612546c6 648 if (BE (mctx.state_log == NULL, 0))
15a7d175
UD
649 {
650 err = REG_ESPACE;
651 goto free_return;
652 }
a9388965 653 }
3b0bdc72 654 else
612546c6 655 mctx.state_log = NULL;
3b0bdc72 656
612546c6 657 match_first = start;
56b168be
UD
658 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
659 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
612546c6 660
3b0bdc72 661 /* Check incrementally whether of not the input string match. */
612546c6
UD
662 incr = (range < 0) ? -1 : 1;
663 left_lim = (range < 0) ? start + range : start;
664 right_lim = (range < 0) ? start : start + range;
3c0fb574 665 sb = dfa->mb_cur_max == 1;
1b2c2628 666 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
612546c6
UD
667
668 for (;;)
3b0bdc72 669 {
612546c6 670 /* At first get the current byte from input string. */
1b2c2628
UD
671 if (fastmap)
672 {
673 if (BE (fast_translate, 1))
674 {
675 unsigned RE_TRANSLATE_TYPE t
676 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
677 if (BE (range >= 0, 1))
678 {
679 if (BE (t != NULL, 0))
680 {
681 while (BE (match_first < right_lim, 1)
682 && !fastmap[t[(unsigned char) string[match_first]]])
683 ++match_first;
684 }
685 else
686 {
687 while (BE (match_first < right_lim, 1)
688 && !fastmap[(unsigned char) string[match_first]])
689 ++match_first;
690 }
691 if (BE (match_first == right_lim, 0))
692 {
693 int ch = match_first >= length
694 ? 0 : (unsigned char) string[match_first];
695 if (!fastmap[t ? t[ch] : ch])
696 break;
697 }
698 }
699 else
700 {
701 while (match_first >= left_lim)
702 {
703 int ch = match_first >= length
704 ? 0 : (unsigned char) string[match_first];
705 if (fastmap[t ? t[ch] : ch])
706 break;
707 --match_first;
708 }
709 if (match_first < left_lim)
710 break;
711 }
712 }
713 else
714 {
715 int ch;
716
717 do
718 {
719 /* In this case, we can't determine easily the current byte,
720 since it might be a component byte of a multibyte
721 character. Then we use the constructed buffer
722 instead. */
723 /* If MATCH_FIRST is out of the valid range, reconstruct the
724 buffers. */
56b168be
UD
725 if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
726 <= match_first
727 || match_first < mctx.input.raw_mbs_idx)
1b2c2628 728 {
56b168be
UD
729 err = re_string_reconstruct (&mctx.input, match_first,
730 eflags);
1b2c2628
UD
731 if (BE (err != REG_NOERROR, 0))
732 goto free_return;
733 }
734 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
735 Note that MATCH_FIRST must not be smaller than 0. */
736 ch = ((match_first >= length) ? 0
56b168be
UD
737 : re_string_byte_at (&mctx.input,
738 match_first
739 - mctx.input.raw_mbs_idx));
1b2c2628
UD
740 if (fastmap[ch])
741 break;
742 match_first += incr;
743 }
744 while (match_first >= left_lim && match_first <= right_lim);
745 if (! fastmap[ch])
746 break;
747 }
748 }
612546c6 749
1b2c2628 750 /* Reconstruct the buffers so that the matcher can assume that
fe9434bb 751 the matching starts from the beginning of the buffer. */
56b168be 752 err = re_string_reconstruct (&mctx.input, match_first, eflags);
1b2c2628
UD
753 if (BE (err != REG_NOERROR, 0))
754 goto free_return;
3b0bdc72 755#ifdef RE_ENABLE_I18N
1b2c2628 756 /* Eliminate it when it is a component of a multibyte character
15a7d175 757 and isn't the head of a multibyte character. */
56b168be 758 if (sb || re_string_first_byte (&mctx.input, 0))
3b0bdc72 759#endif
1b2c2628
UD
760 {
761 /* It seems to be appropriate one, then use the matcher. */
762 /* We assume that the matching starts from 0. */
763 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
15bad1a5
UD
764 match_last = check_matching (&mctx, fl_longest_match,
765 range >= 0 ? &match_first : NULL);
1b2c2628
UD
766 if (match_last != -1)
767 {
768 if (BE (match_last == -2, 0))
769 {
770 err = REG_ESPACE;
771 goto free_return;
772 }
773 else
6291ee3c
UD
774 {
775 mctx.match_last = match_last;
776 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
777 {
778 re_dfastate_t *pstate = mctx.state_log[match_last];
e3a87852
UD
779 mctx.last_node = check_halt_state_context (&mctx, pstate,
780 match_last);
6291ee3c
UD
781 }
782 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
783 || dfa->nbackref)
784 {
e3a87852 785 err = prune_impossible_nodes (&mctx);
6291ee3c
UD
786 if (err == REG_NOERROR)
787 break;
788 if (BE (err != REG_NOMATCH, 0))
789 goto free_return;
97fd3a30 790 match_last = -1;
6291ee3c
UD
791 }
792 else
fe9434bb 793 break; /* We found a match. */
6291ee3c 794 }
1b2c2628 795 }
6291ee3c 796 match_ctx_clean (&mctx);
1b2c2628 797 }
3b0bdc72 798 /* Update counter. */
612546c6
UD
799 match_first += incr;
800 if (match_first < left_lim || right_lim < match_first)
15a7d175 801 break;
3b0bdc72
UD
802 }
803
804 /* Set pmatch[] if we need. */
805 if (match_last != -1 && nmatch > 0)
806 {
807 int reg_idx;
808
809 /* Initialize registers. */
97fd3a30 810 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
15a7d175 811 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
3b0bdc72
UD
812
813 /* Set the points where matching start/end. */
612546c6 814 pmatch[0].rm_so = 0;
6291ee3c 815 pmatch[0].rm_eo = mctx.match_last;
3b0bdc72
UD
816
817 if (!preg->no_sub && nmatch > 1)
15a7d175 818 {
15a7d175
UD
819 err = set_regs (preg, &mctx, nmatch, pmatch,
820 dfa->has_plural_match && dfa->nbackref > 0);
821 if (BE (err != REG_NOERROR, 0))
822 goto free_return;
823 }
612546c6
UD
824
825 /* At last, add the offset to the each registers, since we slided
97fd3a30
UD
826 the buffers so that we could assume that the matching starts
827 from 0. */
612546c6 828 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
15a7d175
UD
829 if (pmatch[reg_idx].rm_so != -1)
830 {
c0d5034e 831#ifdef RE_ENABLE_I18N
56b168be 832 if (BE (mctx.input.offsets_needed != 0, 0))
bb3f4825 833 {
56b168be
UD
834 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
835 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
bb3f4825 836 else
56b168be
UD
837 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
838 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
839 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
bb3f4825 840 else
56b168be 841 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
bb3f4825 842 }
c0d5034e 843#else
56b168be 844 assert (mctx.input.offsets_needed == 0);
c0d5034e 845#endif
15a7d175
UD
846 pmatch[reg_idx].rm_so += match_first;
847 pmatch[reg_idx].rm_eo += match_first;
848 }
3b0bdc72 849 }
1b2c2628
UD
850 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
851 free_return:
612546c6 852 re_free (mctx.state_log);
3b0bdc72
UD
853 if (dfa->nbackref)
854 match_ctx_free (&mctx);
56b168be 855 re_string_destruct (&mctx.input);
1b2c2628 856 return err;
3b0bdc72
UD
857}
858
6291ee3c 859static reg_errcode_t
e3a87852 860prune_impossible_nodes (mctx)
6291ee3c
UD
861 re_match_context_t *mctx;
862{
e3a87852 863 re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
864 int halt_node, match_last;
865 reg_errcode_t ret;
6291ee3c
UD
866 re_dfastate_t **sifted_states;
867 re_dfastate_t **lim_states = NULL;
868 re_sift_context_t sctx;
869#ifdef DEBUG
870 assert (mctx->state_log != NULL);
871#endif
872 match_last = mctx->match_last;
873 halt_node = mctx->last_node;
874 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
875 if (BE (sifted_states == NULL, 0))
876 {
877 ret = REG_ESPACE;
878 goto free_return;
879 }
880 if (dfa->nbackref)
881 {
882 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
883 if (BE (lim_states == NULL, 0))
884 {
885 ret = REG_ESPACE;
886 goto free_return;
887 }
888 while (1)
889 {
890 memset (lim_states, '\0',
891 sizeof (re_dfastate_t *) * (match_last + 1));
892 match_ctx_clear_flag (mctx);
893 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
894 match_last, 0);
e3a87852 895 ret = sift_states_backward (mctx, &sctx);
6291ee3c
UD
896 re_node_set_free (&sctx.limits);
897 if (BE (ret != REG_NOERROR, 0))
898 goto free_return;
899 if (sifted_states[0] != NULL || lim_states[0] != NULL)
900 break;
901 do
902 {
903 --match_last;
904 if (match_last < 0)
905 {
906 ret = REG_NOMATCH;
907 goto free_return;
908 }
97fd3a30
UD
909 } while (mctx->state_log[match_last] == NULL
910 || !mctx->state_log[match_last]->halt);
e3a87852 911 halt_node = check_halt_state_context (mctx,
6291ee3c 912 mctx->state_log[match_last],
e3a87852 913 match_last);
6291ee3c
UD
914 }
915 ret = merge_state_array (dfa, sifted_states, lim_states,
916 match_last + 1);
917 re_free (lim_states);
918 lim_states = NULL;
919 if (BE (ret != REG_NOERROR, 0))
920 goto free_return;
921 }
922 else
923 {
924 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
925 match_last, 0);
e3a87852 926 ret = sift_states_backward (mctx, &sctx);
6291ee3c
UD
927 re_node_set_free (&sctx.limits);
928 if (BE (ret != REG_NOERROR, 0))
929 goto free_return;
930 }
931 re_free (mctx->state_log);
932 mctx->state_log = sifted_states;
933 sifted_states = NULL;
934 mctx->last_node = halt_node;
935 mctx->match_last = match_last;
936 ret = REG_NOERROR;
937 free_return:
938 re_free (sifted_states);
939 re_free (lim_states);
940 return ret;
941}
942
a9388965 943/* Acquire an initial state and return it.
3b0bdc72
UD
944 We must select appropriate initial state depending on the context,
945 since initial states may have constraints like "\<", "^", etc.. */
946
bb3f4825 947static inline re_dfastate_t *
e3a87852 948acquire_init_state_context (err, mctx, idx)
a9388965 949 reg_errcode_t *err;
612546c6
UD
950 const re_match_context_t *mctx;
951 int idx;
3b0bdc72 952{
e3a87852 953 re_dfa_t *const dfa = mctx->dfa;
3b0bdc72
UD
954 if (dfa->init_state->has_constraint)
955 {
956 unsigned int context;
56b168be 957 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
3b0bdc72 958 if (IS_WORD_CONTEXT (context))
15a7d175 959 return dfa->init_state_word;
3b0bdc72 960 else if (IS_ORDINARY_CONTEXT (context))
15a7d175 961 return dfa->init_state;
3b0bdc72 962 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
15a7d175 963 return dfa->init_state_begbuf;
3b0bdc72 964 else if (IS_NEWLINE_CONTEXT (context))
15a7d175 965 return dfa->init_state_nl;
3b0bdc72 966 else if (IS_BEGBUF_CONTEXT (context))
15a7d175
UD
967 {
968 /* It is relatively rare case, then calculate on demand. */
4c595adb
UD
969 return re_acquire_state_context (err, dfa,
970 dfa->init_state->entrance_nodes,
971 context);
15a7d175 972 }
3b0bdc72 973 else
15a7d175
UD
974 /* Must not happen? */
975 return dfa->init_state;
3b0bdc72
UD
976 }
977 else
978 return dfa->init_state;
979}
980
981/* Check whether the regular expression match input string INPUT or not,
a9388965
UD
982 and return the index where the matching end, return -1 if not match,
983 or return -2 in case of an error.
612546c6 984 FL_LONGEST_MATCH means we want the POSIX longest matching.
15bad1a5
UD
985 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
986 next place where we may want to try matching.
612546c6
UD
987 Note that the matcher assume that the maching starts from the current
988 index of the buffer. */
3b0bdc72
UD
989
990static int
15bad1a5 991check_matching (mctx, fl_longest_match, p_match_first)
3b0bdc72 992 re_match_context_t *mctx;
c13c99fa 993 int fl_longest_match;
15bad1a5 994 int *p_match_first;
3b0bdc72 995{
e3a87852 996 re_dfa_t *const dfa = mctx->dfa;
a9388965 997 reg_errcode_t err;
612546c6
UD
998 int match = 0;
999 int match_last = -1;
56b168be 1000 int cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72 1001 re_dfastate_t *cur_state;
4c595adb
UD
1002 int at_init_state = p_match_first != NULL;
1003 int next_start_idx = cur_str_idx;
3b0bdc72 1004
4c595adb 1005 err = REG_NOERROR;
e3a87852 1006 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
4c595adb 1007 /* An initial state must not be NULL (invalid). */
bc15410e 1008 if (BE (cur_state == NULL, 0))
4c595adb
UD
1009 {
1010 assert (err == REG_ESPACE);
1011 return -2;
1012 }
0742e48e 1013
4c595adb 1014 if (mctx->state_log != NULL)
6291ee3c 1015 {
4c595adb 1016 mctx->state_log[cur_str_idx] = cur_state;
6291ee3c 1017
4c595adb
UD
1018 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1019 later. E.g. Processing back references. */
1020 if (BE (dfa->nbackref, 0))
bb3f4825 1021 {
4c595adb
UD
1022 at_init_state = 0;
1023 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
bb3f4825
UD
1024 if (BE (err != REG_NOERROR, 0))
1025 return err;
4c595adb
UD
1026
1027 if (cur_state->has_backref)
1028 {
1029 err = transit_state_bkref (mctx, &cur_state->nodes);
1030 if (BE (err != REG_NOERROR, 0))
1031 return err;
1032 }
bb3f4825 1033 }
0742e48e
UD
1034 }
1035
3b0bdc72 1036 /* If the RE accepts NULL string. */
bb3f4825 1037 if (BE (cur_state->halt, 0))
3b0bdc72
UD
1038 {
1039 if (!cur_state->has_constraint
e3a87852 1040 || check_halt_state_context (mctx, cur_state, cur_str_idx))
15a7d175
UD
1041 {
1042 if (!fl_longest_match)
1043 return cur_str_idx;
1044 else
1045 {
1046 match_last = cur_str_idx;
1047 match = 1;
1048 }
1049 }
3b0bdc72
UD
1050 }
1051
56b168be 1052 while (!re_string_eoi (&mctx->input))
3b0bdc72 1053 {
15bad1a5 1054 re_dfastate_t *old_state = cur_state;
e3a87852 1055 cur_state = transit_state (&err, mctx, cur_state);
4c595adb
UD
1056 if (mctx->state_log != NULL)
1057 cur_state = merge_state_with_log (&err, mctx, cur_state);
15bad1a5 1058
4c595adb 1059 if (cur_state == NULL)
15a7d175 1060 {
4c595adb
UD
1061 /* Reached the invalid state or an error. Try to recover a valid
1062 state using the state log, if available and if we have not
1063 already found a valid (even if not the longest) match. */
15a7d175
UD
1064 if (BE (err != REG_NOERROR, 0))
1065 return -2;
4c595adb
UD
1066
1067 if (mctx->state_log == NULL
1068 || (match && !fl_longest_match)
1069 || (cur_state = find_recover_state (&err, mctx)) == NULL)
15a7d175 1070 break;
4c595adb
UD
1071 }
1072
1073 if (at_init_state)
1074 {
1075 if (old_state == cur_state)
1076 next_start_idx = re_string_cur_idx (&mctx->input);
c13c99fa 1077 else
4c595adb 1078 at_init_state = 0;
15a7d175 1079 }
3b0bdc72 1080
4c595adb 1081 if (cur_state->halt)
15a7d175 1082 {
4c595adb 1083 /* Reached a halt state.
15a7d175
UD
1084 Check the halt state can satisfy the current context. */
1085 if (!cur_state->has_constraint
e3a87852 1086 || check_halt_state_context (mctx, cur_state,
56b168be 1087 re_string_cur_idx (&mctx->input)))
15a7d175
UD
1088 {
1089 /* We found an appropriate halt state. */
56b168be 1090 match_last = re_string_cur_idx (&mctx->input);
15a7d175
UD
1091 match = 1;
1092 if (!fl_longest_match)
1093 break;
1094 }
1095 }
3b0bdc72 1096 }
15bad1a5 1097
4c595adb
UD
1098 if (match_last == -1 && p_match_first)
1099 *p_match_first += next_start_idx;
15bad1a5 1100
3b0bdc72
UD
1101 return match_last;
1102}
1103
1104/* Check NODE match the current context. */
1105
1106static int check_halt_node_context (dfa, node, context)
1107 const re_dfa_t *dfa;
1108 int node;
1109 unsigned int context;
1110{
3b0bdc72 1111 re_token_type_t type = dfa->nodes[node].type;
485d775d
UD
1112 unsigned int constraint = dfa->nodes[node].constraint;
1113 if (type != END_OF_RE)
3b0bdc72 1114 return 0;
485d775d
UD
1115 if (!constraint)
1116 return 1;
1117 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
3b0bdc72
UD
1118 return 0;
1119 return 1;
1120}
1121
1122/* Check the halt state STATE match the current context.
1123 Return 0 if not match, if the node, STATE has, is a halt node and
1124 match the context, return the node. */
1125
1126static int
e3a87852 1127check_halt_state_context (mctx, state, idx)
612546c6 1128 const re_match_context_t *mctx;
e3a87852 1129 const re_dfastate_t *state;
612546c6 1130 int idx;
3b0bdc72 1131{
3b0bdc72
UD
1132 int i;
1133 unsigned int context;
1134#ifdef DEBUG
1135 assert (state->halt);
1136#endif
56b168be 1137 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
3b0bdc72 1138 for (i = 0; i < state->nodes.nelem; ++i)
e3a87852 1139 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
3b0bdc72
UD
1140 return state->nodes.elems[i];
1141 return 0;
1142}
1143
a9388965
UD
1144/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1145 corresponding to the DFA).
1146 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1147 of errors. */
3b0bdc72
UD
1148
1149static int
e3a87852 1150proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
3b0bdc72 1151 const re_match_context_t *mctx;
e3a87852 1152 regmatch_t *regs;
a3022b82 1153 int nregs, *pidx, node;
3b0bdc72 1154 re_node_set *eps_via_nodes;
a3022b82 1155 struct re_fail_stack_t *fs;
3b0bdc72 1156{
e3a87852 1157 re_dfa_t *const dfa = mctx->dfa;
485d775d 1158 int i, err, dest_node;
81c64d40 1159 dest_node = -1;
3b0bdc72
UD
1160 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1161 {
485d775d
UD
1162 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1163 int ndest, dest_nodes[2];
a9388965 1164 err = re_node_set_insert (eps_via_nodes, node);
bc15410e 1165 if (BE (err < 0, 0))
44777771 1166 return -2;
a3022b82 1167 /* Pick up valid destinations. */
485d775d 1168 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
15a7d175
UD
1169 {
1170 int candidate = dfa->edests[node].elems[i];
1171 if (!re_node_set_contains (cur_nodes, candidate))
1172 continue;
1173 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1174 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1175 ++ndest;
1176 }
a3022b82 1177 if (ndest <= 1)
15a7d175 1178 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
a3022b82
UD
1179 /* In order to avoid infinite loop like "(a*)*". */
1180 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
15a7d175 1181 return dest_nodes[1];
44777771
UD
1182 if (fs != NULL
1183 && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1184 eps_via_nodes))
1185 return -2;
a3022b82 1186 return dest_nodes[0];
3b0bdc72
UD
1187 }
1188 else
1189 {
485d775d 1190 int naccepted = 0;
3b0bdc72 1191 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72 1192
434d3784 1193#ifdef RE_ENABLE_I18N
3b0bdc72 1194 if (ACCEPT_MB_NODE (type))
56b168be 1195 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
434d3784
UD
1196 else
1197#endif /* RE_ENABLE_I18N */
1198 if (type == OP_BACK_REF)
15a7d175
UD
1199 {
1200 int subexp_idx = dfa->nodes[node].opr.idx;
1201 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1202 if (fs != NULL)
1203 {
1204 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1205 return -1;
1206 else if (naccepted)
1207 {
56b168be 1208 char *buf = (char *) re_string_get_buffer (&mctx->input);
6291ee3c
UD
1209 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1210 naccepted) != 0)
15a7d175
UD
1211 return -1;
1212 }
1213 }
1214
1215 if (naccepted == 0)
1216 {
1217 err = re_node_set_insert (eps_via_nodes, node);
1218 if (BE (err < 0, 0))
1219 return -2;
1220 dest_node = dfa->edests[node].elems[0];
1221 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1222 dest_node))
1223 return dest_node;
1224 }
1225 }
3b0bdc72
UD
1226
1227 if (naccepted != 0
e3a87852 1228 || check_node_accept (mctx, dfa->nodes + node, *pidx))
15a7d175
UD
1229 {
1230 dest_node = dfa->nexts[node];
1231 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1232 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1233 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1234 dest_node)))
1235 return -1;
1236 re_node_set_empty (eps_via_nodes);
1237 return dest_node;
1238 }
3b0bdc72 1239 }
a3022b82
UD
1240 return -1;
1241}
1242
1243static reg_errcode_t
1244push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1245 struct re_fail_stack_t *fs;
1246 int str_idx, *dests, nregs;
1247 regmatch_t *regs;
1248 re_node_set *eps_via_nodes;
1249{
1250 reg_errcode_t err;
1251 int num = fs->num++;
1252 if (fs->num == fs->alloc)
1253 {
1b2c2628 1254 struct re_fail_stack_ent_t *new_array;
1b2c2628 1255 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
44777771 1256 * fs->alloc * 2));
1b2c2628 1257 if (new_array == NULL)
15a7d175 1258 return REG_ESPACE;
44777771 1259 fs->alloc *= 2;
1b2c2628 1260 fs->stack = new_array;
a3022b82
UD
1261 }
1262 fs->stack[num].idx = str_idx;
1263 fs->stack[num].node = dests[1];
1264 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
5a299c96
UD
1265 if (fs->stack[num].regs == NULL)
1266 return REG_ESPACE;
a3022b82
UD
1267 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1268 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1269 return err;
1270}
1b2c2628 1271
a3022b82
UD
1272static int
1273pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1274 struct re_fail_stack_t *fs;
1275 int *pidx, nregs;
1276 regmatch_t *regs;
1277 re_node_set *eps_via_nodes;
1278{
1279 int num = --fs->num;
1280 assert (num >= 0);
44777771 1281 *pidx = fs->stack[num].idx;
a3022b82
UD
1282 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1283 re_node_set_free (eps_via_nodes);
485d775d 1284 re_free (fs->stack[num].regs);
a3022b82
UD
1285 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1286 return fs->stack[num].node;
3b0bdc72
UD
1287}
1288
1289/* Set the positions where the subexpressions are starts/ends to registers
1290 PMATCH.
1291 Note: We assume that pmatch[0] is already set, and
97fd3a30 1292 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
3b0bdc72 1293
a9388965 1294static reg_errcode_t
a3022b82
UD
1295set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1296 const regex_t *preg;
1297 const re_match_context_t *mctx;
1298 size_t nmatch;
1299 regmatch_t *pmatch;
1300 int fl_backtrack;
3b0bdc72 1301{
6b6557e8 1302 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
81c64d40 1303 int idx, cur_node, real_nmatch;
3b0bdc72 1304 re_node_set eps_via_nodes;
a3022b82 1305 struct re_fail_stack_t *fs;
6b6557e8
UD
1306 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1307 regmatch_t *prev_idx_match;
1308
3b0bdc72
UD
1309#ifdef DEBUG
1310 assert (nmatch > 1);
612546c6 1311 assert (mctx->state_log != NULL);
3b0bdc72 1312#endif
a3022b82
UD
1313 if (fl_backtrack)
1314 {
1315 fs = &fs_body;
1316 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
6b6557e8
UD
1317 if (fs->stack == NULL)
1318 return REG_ESPACE;
a3022b82
UD
1319 }
1320 else
1321 fs = NULL;
6b6557e8 1322
3b0bdc72
UD
1323 cur_node = dfa->init_node;
1324 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1325 re_node_set_init_empty (&eps_via_nodes);
6b6557e8
UD
1326
1327 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1328 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1329
3b0bdc72
UD
1330 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1331 {
6b6557e8
UD
1332 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1333
a3022b82 1334 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
15a7d175
UD
1335 {
1336 int reg_idx;
1337 if (fs)
1338 {
1339 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1340 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1341 break;
1342 if (reg_idx == nmatch)
1343 {
1344 re_node_set_free (&eps_via_nodes);
1345 return free_fail_stack_return (fs);
1346 }
1347 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1348 &eps_via_nodes);
1349 }
1350 else
1351 {
1352 re_node_set_free (&eps_via_nodes);
1353 return REG_NOERROR;
1354 }
1355 }
3b0bdc72
UD
1356
1357 /* Proceed to next node. */
e3a87852 1358 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
15a7d175 1359 &eps_via_nodes, fs);
a3022b82 1360
bc15410e 1361 if (BE (cur_node < 0, 0))
15a7d175 1362 {
44777771
UD
1363 if (BE (cur_node == -2, 0))
1364 {
1365 re_node_set_free (&eps_via_nodes);
1366 free_fail_stack_return (fs);
1367 return REG_ESPACE;
1368 }
15a7d175
UD
1369 if (fs)
1370 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1371 &eps_via_nodes);
1372 else
1373 {
1374 re_node_set_free (&eps_via_nodes);
1375 return REG_NOMATCH;
1376 }
1377 }
3b0bdc72
UD
1378 }
1379 re_node_set_free (&eps_via_nodes);
485d775d
UD
1380 return free_fail_stack_return (fs);
1381}
1382
1383static reg_errcode_t
1384free_fail_stack_return (fs)
1385 struct re_fail_stack_t *fs;
1386{
1387 if (fs)
1388 {
1389 int fs_idx;
1390 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
15a7d175
UD
1391 {
1392 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1393 re_free (fs->stack[fs_idx].regs);
1394 }
485d775d
UD
1395 re_free (fs->stack);
1396 }
a9388965 1397 return REG_NOERROR;
3b0bdc72
UD
1398}
1399
81c64d40 1400static void
6b6557e8 1401update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
81c64d40 1402 re_dfa_t *dfa;
6b6557e8 1403 regmatch_t *pmatch, *prev_idx_match;
81c64d40
UD
1404 int cur_node, cur_idx, nmatch;
1405{
1406 int type = dfa->nodes[cur_node].type;
81c64d40
UD
1407 if (type == OP_OPEN_SUBEXP)
1408 {
6b6557e8
UD
1409 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1410
81c64d40 1411 /* We are at the first node of this sub expression. */
6b6557e8
UD
1412 if (reg_num < nmatch)
1413 {
1414 pmatch[reg_num].rm_so = cur_idx;
1415 pmatch[reg_num].rm_eo = -1;
1416 }
81c64d40
UD
1417 }
1418 else if (type == OP_CLOSE_SUBEXP)
6b6557e8
UD
1419 {
1420 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1421 if (reg_num < nmatch)
1422 {
1423 /* We are at the last node of this sub expression. */
1424 if (pmatch[reg_num].rm_so < cur_idx)
1425 {
1426 pmatch[reg_num].rm_eo = cur_idx;
1427 /* This is a non-empty match or we are not inside an optional
1428 subexpression. Accept this right away. */
1429 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1430 }
1431 else
1432 {
1433 if (dfa->nodes[cur_node].opt_subexp
1434 && prev_idx_match[reg_num].rm_so != -1)
1435 /* We transited through an empty match for an optional
1436 subexpression, like (a?)*, and this is not the subexp's
1437 first match. Copy back the old content of the registers
1438 so that matches of an inner subexpression are undone as
1439 well, like in ((a?))*. */
1440 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1441 else
1442 /* We completed a subexpression, but it may be part of
1443 an optional one, so do not update PREV_IDX_MATCH. */
1444 pmatch[reg_num].rm_eo = cur_idx;
1445 }
1446 }
1447 }
0742e48e 1448}
81c64d40 1449
0742e48e 1450/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
612546c6
UD
1451 and sift the nodes in each states according to the following rules.
1452 Updated state_log will be wrote to STATE_LOG.
3b0bdc72
UD
1453
1454 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1455 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
15a7d175
UD
1456 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1457 the LAST_NODE, we throw away the node `a'.
612546c6 1458 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
15a7d175
UD
1459 string `s' and transit to `b':
1460 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1461 away the node `a'.
1462 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
bb3f4825 1463 thrown away, we throw away the node `a'.
d0b96fc4 1464 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
15a7d175
UD
1465 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1466 node `a'.
bb3f4825 1467 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
15a7d175 1468 we throw away the node `a'. */
3b0bdc72
UD
1469
1470#define STATE_NODE_CONTAINS(state,node) \
1471 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1472
a9388965 1473static reg_errcode_t
e3a87852 1474sift_states_backward (mctx, sctx)
0742e48e
UD
1475 re_match_context_t *mctx;
1476 re_sift_context_t *sctx;
3b0bdc72 1477{
e3a87852 1478 re_dfa_t *const dfa = mctx->dfa;
a9388965 1479 reg_errcode_t err;
0742e48e
UD
1480 int null_cnt = 0;
1481 int str_idx = sctx->last_str_idx;
1482 re_node_set cur_dest;
1483 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
3b0bdc72
UD
1484
1485#ifdef DEBUG
612546c6 1486 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
3b0bdc72 1487#endif
0742e48e 1488 cur_src = &mctx->state_log[str_idx]->nodes;
3b0bdc72
UD
1489
1490 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1491 transit to the last_node and the last_node itself. */
0742e48e 1492 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
bc15410e 1493 if (BE (err != REG_NOERROR, 0))
a9388965 1494 return err;
e3a87852 1495 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
0742e48e 1496 if (BE (err != REG_NOERROR, 0))
1b2c2628 1497 goto free_return;
3b0bdc72
UD
1498
1499 /* Then check each states in the state_log. */
612546c6 1500 while (str_idx > 0)
3b0bdc72 1501 {
0742e48e 1502 int i, ret;
3b0bdc72 1503 /* Update counters. */
0742e48e
UD
1504 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1505 if (null_cnt > mctx->max_mb_elem_len)
15a7d175
UD
1506 {
1507 memset (sctx->sifted_states, '\0',
1508 sizeof (re_dfastate_t *) * str_idx);
1509 re_node_set_free (&cur_dest);
1510 return REG_NOERROR;
1511 }
0742e48e 1512 re_node_set_empty (&cur_dest);
3b0bdc72 1513 --str_idx;
0742e48e 1514 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1515 : &mctx->state_log[str_idx]->nodes);
3b0bdc72
UD
1516
1517 /* Then build the next sifted state.
15a7d175
UD
1518 We build the next sifted state on `cur_dest', and update
1519 `sifted_states[str_idx]' with `cur_dest'.
1520 Note:
1521 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1522 `cur_src' points the node_set of the old `state_log[str_idx]'. */
0742e48e 1523 for (i = 0; i < cur_src->nelem; i++)
15a7d175
UD
1524 {
1525 int prev_node = cur_src->elems[i];
1526 int naccepted = 0;
1527 re_token_type_t type = dfa->nodes[prev_node].type;
0742e48e 1528
46bf9de7 1529 if (IS_EPSILON_NODE (type))
15a7d175 1530 continue;
434d3784 1531#ifdef RE_ENABLE_I18N
15a7d175
UD
1532 /* If the node may accept `multi byte'. */
1533 if (ACCEPT_MB_NODE (type))
e3a87852 1534 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
15a7d175 1535 str_idx, sctx->last_str_idx);
3b0bdc72 1536
434d3784 1537#endif /* RE_ENABLE_I18N */
15a7d175
UD
1538 /* We don't check backreferences here.
1539 See update_cur_sifted_state(). */
1540
1541 if (!naccepted
e3a87852 1542 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
15a7d175
UD
1543 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1544 dfa->nexts[prev_node]))
1545 naccepted = 1;
1546
1547 if (naccepted == 0)
1548 continue;
1549
1550 if (sctx->limits.nelem)
1551 {
1552 int to_idx = str_idx + naccepted;
e3a87852 1553 if (check_dst_limits (mctx, &sctx->limits,
15a7d175
UD
1554 dfa->nexts[prev_node], to_idx,
1555 prev_node, str_idx))
1556 continue;
1557 }
1558 ret = re_node_set_insert (&cur_dest, prev_node);
1559 if (BE (ret == -1, 0))
1560 {
1561 err = REG_ESPACE;
1562 goto free_return;
1563 }
1564 }
3b0bdc72 1565
0742e48e 1566 /* Add all the nodes which satisfy the following conditions:
15a7d175
UD
1567 - It can epsilon transit to a node in CUR_DEST.
1568 - It is in CUR_SRC.
1569 And update state_log. */
e3a87852 1570 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
0742e48e 1571 if (BE (err != REG_NOERROR, 0))
15a7d175 1572 goto free_return;
3b0bdc72 1573 }
1b2c2628
UD
1574 err = REG_NOERROR;
1575 free_return:
0742e48e 1576 re_node_set_free (&cur_dest);
1b2c2628 1577 return err;
3b0bdc72
UD
1578}
1579
1580/* Helper functions. */
1581
25337753 1582static reg_errcode_t
7c1be3ec 1583clean_state_log_if_needed (mctx, next_state_log_idx)
3b0bdc72
UD
1584 re_match_context_t *mctx;
1585 int next_state_log_idx;
1586{
1587 int top = mctx->state_log_top;
612546c6 1588
56b168be
UD
1589 if (next_state_log_idx >= mctx->input.bufs_len
1590 || (next_state_log_idx >= mctx->input.valid_len
1591 && mctx->input.valid_len < mctx->input.len))
612546c6
UD
1592 {
1593 reg_errcode_t err;
1594 err = extend_buffers (mctx);
1595 if (BE (err != REG_NOERROR, 0))
15a7d175 1596 return err;
612546c6
UD
1597 }
1598
3b0bdc72
UD
1599 if (top < next_state_log_idx)
1600 {
612546c6 1601 memset (mctx->state_log + top + 1, '\0',
15a7d175 1602 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
3b0bdc72
UD
1603 mctx->state_log_top = next_state_log_idx;
1604 }
612546c6 1605 return REG_NOERROR;
3b0bdc72
UD
1606}
1607
1b2c2628
UD
1608static reg_errcode_t
1609merge_state_array (dfa, dst, src, num)
0742e48e
UD
1610 re_dfa_t *dfa;
1611 re_dfastate_t **dst;
1612 re_dfastate_t **src;
1613 int num;
3b0bdc72 1614{
0742e48e
UD
1615 int st_idx;
1616 reg_errcode_t err;
1617 for (st_idx = 0; st_idx < num; ++st_idx)
1618 {
1619 if (dst[st_idx] == NULL)
15a7d175 1620 dst[st_idx] = src[st_idx];
0742e48e 1621 else if (src[st_idx] != NULL)
15a7d175
UD
1622 {
1623 re_node_set merged_set;
1624 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1625 &src[st_idx]->nodes);
1626 if (BE (err != REG_NOERROR, 0))
1627 return err;
1628 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1629 re_node_set_free (&merged_set);
1630 if (BE (err != REG_NOERROR, 0))
1631 return err;
1632 }
0742e48e
UD
1633 }
1634 return REG_NOERROR;
3b0bdc72
UD
1635}
1636
0742e48e 1637static reg_errcode_t
e3a87852 1638update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
0742e48e
UD
1639 re_match_context_t *mctx;
1640 re_sift_context_t *sctx;
1641 int str_idx;
1642 re_node_set *dest_nodes;
3b0bdc72 1643{
e3a87852 1644 re_dfa_t *const dfa = mctx->dfa;
0742e48e 1645 reg_errcode_t err;
0742e48e
UD
1646 const re_node_set *candidates;
1647 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1648 : &mctx->state_log[str_idx]->nodes);
0742e48e
UD
1649
1650 /* At first, add the nodes which can epsilon transit to a node in
1651 DEST_NODE. */
485d775d
UD
1652 if (dest_nodes->nelem)
1653 {
1654 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1655 if (BE (err != REG_NOERROR, 0))
1656 return err;
1657 }
0742e48e
UD
1658
1659 /* Then, check the limitations in the current sift_context. */
485d775d 1660 if (dest_nodes->nelem && sctx->limits.nelem)
0742e48e
UD
1661 {
1662 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
15a7d175 1663 mctx->bkref_ents, str_idx);
0742e48e 1664 if (BE (err != REG_NOERROR, 0))
15a7d175 1665 return err;
0742e48e
UD
1666 }
1667
1668 /* Update state_log. */
1669 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1670 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1671 return err;
1672
0742e48e
UD
1673 if ((mctx->state_log[str_idx] != NULL
1674 && mctx->state_log[str_idx]->has_backref))
1675 {
e3a87852 1676 err = sift_states_bkref (mctx, sctx, str_idx, dest_nodes);
0742e48e 1677 if (BE (err != REG_NOERROR, 0))
15a7d175 1678 return err;
0742e48e
UD
1679 }
1680 return REG_NOERROR;
3b0bdc72
UD
1681}
1682
a9388965 1683static reg_errcode_t
0742e48e
UD
1684add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1685 re_dfa_t *dfa;
1686 re_node_set *dest_nodes;
1687 const re_node_set *candidates;
3b0bdc72 1688{
0742e48e
UD
1689 reg_errcode_t err;
1690 int src_idx;
1691 re_node_set src_copy;
1692
1693 err = re_node_set_init_copy (&src_copy, dest_nodes);
1694 if (BE (err != REG_NOERROR, 0))
1695 return err;
1696 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
3b0bdc72 1697 {
0742e48e 1698 err = re_node_set_add_intersect (dest_nodes, candidates,
15a7d175
UD
1699 dfa->inveclosures
1700 + src_copy.elems[src_idx]);
0742e48e 1701 if (BE (err != REG_NOERROR, 0))
15a7d175
UD
1702 {
1703 re_node_set_free (&src_copy);
1704 return err;
1705 }
0742e48e
UD
1706 }
1707 re_node_set_free (&src_copy);
1708 return REG_NOERROR;
1709}
3b0bdc72 1710
0742e48e
UD
1711static reg_errcode_t
1712sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1713 re_dfa_t *dfa;
1714 int node;
1715 re_node_set *dest_nodes;
1716 const re_node_set *candidates;
1717{
1718 int ecl_idx;
1719 reg_errcode_t err;
1720 re_node_set *inv_eclosure = dfa->inveclosures + node;
1721 re_node_set except_nodes;
1722 re_node_set_init_empty (&except_nodes);
1723 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1724 {
15a7d175
UD
1725 int cur_node = inv_eclosure->elems[ecl_idx];
1726 if (cur_node == node)
1727 continue;
1728 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1729 {
1730 int edst1 = dfa->edests[cur_node].elems[0];
1731 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1732 ? dfa->edests[cur_node].elems[1] : -1);
1733 if ((!re_node_set_contains (inv_eclosure, edst1)
1734 && re_node_set_contains (dest_nodes, edst1))
1735 || (edst2 > 0
1736 && !re_node_set_contains (inv_eclosure, edst2)
1737 && re_node_set_contains (dest_nodes, edst2)))
1738 {
1739 err = re_node_set_add_intersect (&except_nodes, candidates,
1740 dfa->inveclosures + cur_node);
1741 if (BE (err != REG_NOERROR, 0))
1742 {
1743 re_node_set_free (&except_nodes);
1744 return err;
1745 }
1746 }
1747 }
0742e48e
UD
1748 }
1749 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1750 {
15a7d175
UD
1751 int cur_node = inv_eclosure->elems[ecl_idx];
1752 if (!re_node_set_contains (&except_nodes, cur_node))
1753 {
1754 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1755 re_node_set_remove_at (dest_nodes, idx);
1756 }
0742e48e
UD
1757 }
1758 re_node_set_free (&except_nodes);
1759 return REG_NOERROR;
1760}
1761
a3022b82 1762static int
e3a87852 1763check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
a3022b82 1764 re_match_context_t *mctx;
e3a87852 1765 re_node_set *limits;
a3022b82
UD
1766 int dst_node, dst_idx, src_node, src_idx;
1767{
e3a87852 1768 re_dfa_t *const dfa = mctx->dfa;
a3022b82
UD
1769 int lim_idx, src_pos, dst_pos;
1770
1771 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1772 {
485d775d 1773 int subexp_idx;
a3022b82
UD
1774 struct re_backref_cache_entry *ent;
1775 ent = mctx->bkref_ents + limits->elems[lim_idx];
485d775d 1776 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
a3022b82 1777
e3a87852 1778 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
15a7d175
UD
1779 dfa->eclosures + dst_node,
1780 subexp_idx, dst_node, dst_idx);
e3a87852 1781 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
15a7d175
UD
1782 dfa->eclosures + src_node,
1783 subexp_idx, src_node, src_idx);
a3022b82
UD
1784
1785 /* In case of:
15a7d175
UD
1786 <src> <dst> ( <subexp> )
1787 ( <subexp> ) <src> <dst>
1788 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
a3022b82 1789 if (src_pos == dst_pos)
15a7d175 1790 continue; /* This is unrelated limitation. */
a3022b82 1791 else
15a7d175 1792 return 1;
a3022b82
UD
1793 }
1794 return 0;
1795}
1796
1797static int
e3a87852 1798check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node,
15a7d175 1799 str_idx)
a3022b82
UD
1800 re_match_context_t *mctx;
1801 re_node_set *eclosures;
d0b96fc4 1802 int limit, subexp_idx, from_node, str_idx;
a3022b82 1803{
e3a87852 1804 re_dfa_t *const dfa = mctx->dfa;
a3022b82 1805 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1cef7b3c 1806 int node_idx;
d0b96fc4
UD
1807
1808 /* If we are outside the range of the subexpression, return -1 or 1. */
1809 if (str_idx < lim->subexp_from)
1810 return -1;
1811
1812 if (lim->subexp_to < str_idx)
1813 return 1;
1814
1815 /* If we are within the subexpression, return 0. */
1816 if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
1817 return 0;
1818
1819 /* Else, we are on the boundary: examine the nodes on the epsilon
1820 closure. */
1cef7b3c
UD
1821 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1822 {
1823 int node = eclosures->elems[node_idx];
d0b96fc4
UD
1824 switch (dfa->nodes[node].type)
1825 {
1826 case OP_BACK_REF:
1cef7b3c
UD
1827 {
1828 int bi = search_cur_bkref_entry (mctx, str_idx);
1829 for (; bi < mctx->nbkref_ents; ++bi)
1830 {
1831 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
d0b96fc4 1832 int dst, cpos;
f0c7c524 1833
d0b96fc4
UD
1834 /* If this backreference goes beyond the point we're
1835 examining, don't go any further. */
1cef7b3c
UD
1836 if (ent->str_idx > str_idx)
1837 break;
d0b96fc4
UD
1838
1839 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1840 continue;
1841
1842 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1843 OP_CLOSE_SUBEXP cases below. But, if the
1844 destination node is the same node as the source
1845 node, don't recurse because it would cause an
1846 infinite loop: a regex that exhibits this behavior
1847 is ()\1*\1* */
1cef7b3c 1848 dst = dfa->edests[node].elems[0];
d0b96fc4
UD
1849 if (dst == from_node)
1850 {
1851 if (str_idx == lim->subexp_from)
1852 return -1;
1853 else /* if (str_idx == lim->subexp_to) */
1854 return 0;
1855 }
1856
e3a87852 1857 cpos = check_dst_limits_calc_pos (mctx, limit,
1cef7b3c
UD
1858 dfa->eclosures + dst,
1859 subexp_idx, dst,
1860 str_idx);
d0b96fc4
UD
1861
1862 if (cpos == -1 && str_idx == lim->subexp_from)
1863 return -1;
1864
1865 if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
1866 return 0;
1cef7b3c 1867 }
15a7d175
UD
1868 break;
1869 }
f0c7c524 1870
d0b96fc4
UD
1871 case OP_OPEN_SUBEXP:
1872 if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
1873 return -1;
1874 break;
f0c7c524 1875
d0b96fc4
UD
1876 case OP_CLOSE_SUBEXP:
1877 if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
1878 return 0;
1879 break;
1880
1881 default:
15a7d175
UD
1882 break;
1883 }
a3022b82 1884 }
d0b96fc4
UD
1885
1886 if (str_idx == lim->subexp_to)
1887 return 1;
1888 else
1889 return 0;
a3022b82
UD
1890}
1891
0742e48e
UD
1892/* Check the limitations of sub expressions LIMITS, and remove the nodes
1893 which are against limitations from DEST_NODES. */
1894
1895static reg_errcode_t
1896check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1897 re_dfa_t *dfa;
1898 re_node_set *dest_nodes;
1899 const re_node_set *candidates;
1900 re_node_set *limits;
1901 struct re_backref_cache_entry *bkref_ents;
1902 int str_idx;
1903{
1904 reg_errcode_t err;
1905 int node_idx, lim_idx;
1906
1907 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1908 {
485d775d 1909 int subexp_idx;
0742e48e
UD
1910 struct re_backref_cache_entry *ent;
1911 ent = bkref_ents + limits->elems[lim_idx];
1912
1913 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
15a7d175 1914 continue; /* This is unrelated limitation. */
0742e48e 1915
485d775d 1916 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
0742e48e 1917 if (ent->subexp_to == str_idx)
15a7d175
UD
1918 {
1919 int ops_node = -1;
1920 int cls_node = -1;
1921 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1922 {
1923 int node = dest_nodes->elems[node_idx];
46bf9de7 1924 re_token_type_t type = dfa->nodes[node].type;
15a7d175
UD
1925 if (type == OP_OPEN_SUBEXP
1926 && subexp_idx == dfa->nodes[node].opr.idx)
1927 ops_node = node;
1928 else if (type == OP_CLOSE_SUBEXP
1929 && subexp_idx == dfa->nodes[node].opr.idx)
1930 cls_node = node;
1931 }
1932
1933 /* Check the limitation of the open subexpression. */
1934 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1935 if (ops_node >= 0)
1936 {
46bf9de7
UD
1937 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1938 candidates);
15a7d175
UD
1939 if (BE (err != REG_NOERROR, 0))
1940 return err;
1941 }
46bf9de7 1942
15a7d175 1943 /* Check the limitation of the close subexpression. */
46bf9de7
UD
1944 if (cls_node >= 0)
1945 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1946 {
1947 int node = dest_nodes->elems[node_idx];
1948 if (!re_node_set_contains (dfa->inveclosures + node,
1949 cls_node)
1950 && !re_node_set_contains (dfa->eclosures + node,
1951 cls_node))
1952 {
1953 /* It is against this limitation.
1954 Remove it form the current sifted state. */
1955 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1956 candidates);
1957 if (BE (err != REG_NOERROR, 0))
1958 return err;
1959 --node_idx;
1960 }
1961 }
15a7d175 1962 }
0742e48e 1963 else /* (ent->subexp_to != str_idx) */
15a7d175
UD
1964 {
1965 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1966 {
1967 int node = dest_nodes->elems[node_idx];
46bf9de7 1968 re_token_type_t type = dfa->nodes[node].type;
15a7d175
UD
1969 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1970 {
1971 if (subexp_idx != dfa->nodes[node].opr.idx)
1972 continue;
1973 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1974 || (type == OP_OPEN_SUBEXP))
1975 {
1976 /* It is against this limitation.
1977 Remove it form the current sifted state. */
46bf9de7
UD
1978 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1979 candidates);
15a7d175
UD
1980 if (BE (err != REG_NOERROR, 0))
1981 return err;
1982 }
1983 }
1984 }
1985 }
0742e48e
UD
1986 }
1987 return REG_NOERROR;
1988}
1989
0742e48e 1990static reg_errcode_t
e3a87852 1991sift_states_bkref (mctx, sctx, str_idx, dest_nodes)
0742e48e
UD
1992 re_match_context_t *mctx;
1993 re_sift_context_t *sctx;
1994 int str_idx;
1995 re_node_set *dest_nodes;
1996{
e3a87852 1997 re_dfa_t *const dfa = mctx->dfa;
0742e48e 1998 reg_errcode_t err;
0742e48e
UD
1999 int node_idx, node;
2000 re_sift_context_t local_sctx;
2001 const re_node_set *candidates;
2002 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 2003 : &mctx->state_log[str_idx]->nodes);
0742e48e
UD
2004 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2005
2006 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2007 {
56b168be 2008 int cur_bkref_idx = re_string_cur_idx (&mctx->input);
0742e48e
UD
2009 re_token_type_t type;
2010 node = candidates->elems[node_idx];
2011 type = dfa->nodes[node].type;
0742e48e 2012 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
15a7d175 2013 continue;
0742e48e
UD
2014 /* Avoid infinite loop for the REs like "()\1+". */
2015 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
15a7d175 2016 continue;
0742e48e 2017 if (type == OP_BACK_REF)
15a7d175 2018 {
6291ee3c
UD
2019 int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2020 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
15a7d175
UD
2021 {
2022 int disabled_idx, subexp_len, to_idx, dst_node;
2023 struct re_backref_cache_entry *entry;
2024 entry = mctx->bkref_ents + enabled_idx;
6291ee3c
UD
2025 if (entry->str_idx > str_idx)
2026 break;
2027 if (entry->node != node)
2028 continue;
15a7d175
UD
2029 subexp_len = entry->subexp_to - entry->subexp_from;
2030 to_idx = str_idx + subexp_len;
2031 dst_node = (subexp_len ? dfa->nexts[node]
2032 : dfa->edests[node].elems[0]);
2033
6291ee3c
UD
2034 if (to_idx > sctx->last_str_idx
2035 || sctx->sifted_states[to_idx] == NULL
2036 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
2037 dst_node)
e3a87852 2038 || check_dst_limits (mctx, &sctx->limits, node,
6291ee3c 2039 str_idx, dst_node, to_idx))
15a7d175 2040 continue;
15a7d175
UD
2041 {
2042 re_dfastate_t *cur_state;
2043 entry->flag = 0;
2044 for (disabled_idx = enabled_idx + 1;
2045 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
2046 {
2047 struct re_backref_cache_entry *entry2;
2048 entry2 = mctx->bkref_ents + disabled_idx;
6291ee3c
UD
2049 if (entry2->str_idx > str_idx)
2050 break;
2051 entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
15a7d175
UD
2052 }
2053
2054 if (local_sctx.sifted_states == NULL)
2055 {
2056 local_sctx = *sctx;
2057 err = re_node_set_init_copy (&local_sctx.limits,
2058 &sctx->limits);
2059 if (BE (err != REG_NOERROR, 0))
1b2c2628 2060 goto free_return;
15a7d175
UD
2061 }
2062 local_sctx.last_node = node;
2063 local_sctx.last_str_idx = str_idx;
2064 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2065 if (BE (err < 0, 0))
1b2c2628
UD
2066 {
2067 err = REG_ESPACE;
2068 goto free_return;
2069 }
15a7d175 2070 cur_state = local_sctx.sifted_states[str_idx];
e3a87852 2071 err = sift_states_backward (mctx, &local_sctx);
15a7d175 2072 if (BE (err != REG_NOERROR, 0))
1b2c2628 2073 goto free_return;
15a7d175
UD
2074 if (sctx->limited_states != NULL)
2075 {
2076 err = merge_state_array (dfa, sctx->limited_states,
2077 local_sctx.sifted_states,
2078 str_idx + 1);
2079 if (BE (err != REG_NOERROR, 0))
2080 goto free_return;
2081 }
2082 local_sctx.sifted_states[str_idx] = cur_state;
2083 re_node_set_remove (&local_sctx.limits, enabled_idx);
485d775d
UD
2084 /* We must not use the variable entry here, since
2085 mctx->bkref_ents might be realloced. */
2086 mctx->bkref_ents[enabled_idx].flag = 1;
15a7d175
UD
2087 }
2088 }
6291ee3c
UD
2089 enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2090 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
15a7d175
UD
2091 {
2092 struct re_backref_cache_entry *entry;
2093 entry = mctx->bkref_ents + enabled_idx;
6291ee3c
UD
2094 if (entry->str_idx > str_idx)
2095 break;
2096 if (entry->node == node)
15a7d175
UD
2097 entry->flag = 0;
2098 }
2099 }
0742e48e 2100 }
1b2c2628
UD
2101 err = REG_NOERROR;
2102 free_return:
0742e48e
UD
2103 if (local_sctx.sifted_states != NULL)
2104 {
0742e48e
UD
2105 re_node_set_free (&local_sctx.limits);
2106 }
2107
1b2c2628 2108 return err;
0742e48e
UD
2109}
2110
2111
2112#ifdef RE_ENABLE_I18N
2113static int
e3a87852 2114sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
0742e48e
UD
2115 const re_match_context_t *mctx;
2116 re_sift_context_t *sctx;
2117 int node_idx, str_idx, max_str_idx;
2118{
e3a87852 2119 re_dfa_t *const dfa = mctx->dfa;
0742e48e
UD
2120 int naccepted;
2121 /* Check the node can accept `multi byte'. */
56b168be 2122 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
0742e48e
UD
2123 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2124 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
15a7d175 2125 dfa->nexts[node_idx]))
0742e48e 2126 /* The node can't accept the `multi byte', or the
bb3f4825 2127 destination was already thrown away, then the node
0742e48e
UD
2128 could't accept the current input `multi byte'. */
2129 naccepted = 0;
2130 /* Otherwise, it is sure that the node could accept
2131 `naccepted' bytes input. */
2132 return naccepted;
2133}
2134#endif /* RE_ENABLE_I18N */
2135
3b0bdc72
UD
2136\f
2137/* Functions for state transition. */
2138
2139/* Return the next state to which the current state STATE will transit by
2140 accepting the current input byte, and update STATE_LOG if necessary.
2141 If STATE can accept a multibyte char/collating element/back reference
2142 update the destination of STATE_LOG. */
2143
2144static re_dfastate_t *
e3a87852 2145transit_state (err, mctx, state)
a9388965 2146 reg_errcode_t *err;
a9388965 2147 re_match_context_t *mctx;
612546c6 2148 re_dfastate_t *state;
3b0bdc72 2149{
e3a87852 2150 re_dfa_t *const dfa = mctx->dfa;
58845a70 2151 re_dfastate_t **trtable;
3b0bdc72
UD
2152 unsigned char ch;
2153
56b168be
UD
2154 if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
2155 || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
2156 && mctx->input.valid_len < mctx->input.len))
612546c6
UD
2157 {
2158 *err = extend_buffers (mctx);
2159 if (BE (*err != REG_NOERROR, 0))
15a7d175 2160 return NULL;
612546c6
UD
2161 }
2162
434d3784 2163#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2164 /* If the current state can accept multibyte. */
2165 if (state->accept_mb)
15a7d175 2166 {
e3a87852 2167 *err = transit_state_mb (mctx, state);
15a7d175
UD
2168 if (BE (*err != REG_NOERROR, 0))
2169 return NULL;
2170 }
434d3784 2171#endif /* RE_ENABLE_I18N */
3b0bdc72 2172
4c595adb
UD
2173 /* Then decide the next state with the single byte. */
2174 if (1)
2175 {
2176 /* Use transition table */
2177 ch = re_string_fetch_byte (&mctx->input);
2178 trtable = state->trtable;
2179 if (trtable == NULL)
2180 {
2181 trtable = build_trtable (dfa, state);
2182 if (trtable == NULL)
c13c99fa 2183 {
4c595adb
UD
2184 *err = REG_ESPACE;
2185 return NULL;
15a7d175 2186 }
4c595adb
UD
2187 }
2188 if (BE (state->word_trtable, 0))
2189 {
2190 unsigned int context;
2191 context
2192 = re_string_context_at (&mctx->input,
2193 re_string_cur_idx (&mctx->input) - 1,
2194 mctx->eflags);
2195 if (IS_WORD_CONTEXT (context))
2196 return trtable[ch + SBC_MAX];
c13c99fa 2197 else
4c595adb 2198 return trtable[ch];
15a7d175 2199 }
3b0bdc72 2200 else
4c595adb 2201 return trtable[ch];
3b0bdc72 2202 }
4c595adb
UD
2203#if 0
2204 else
2205 /* don't use transition table */
2206 return transit_state_sb (err, mctx, state);
2207#endif
2208}
3b0bdc72 2209
4c595adb 2210/* Update the state_log if we need */
58845a70
UD
2211re_dfastate_t *
2212merge_state_with_log (err, mctx, next_state)
2213 reg_errcode_t *err;
2214 re_match_context_t *mctx;
2215 re_dfastate_t *next_state;
2216{
2217 re_dfa_t *const dfa = mctx->dfa;
4c595adb
UD
2218 int cur_idx = re_string_cur_idx (&mctx->input);
2219
2220 if (cur_idx > mctx->state_log_top)
3b0bdc72 2221 {
4c595adb
UD
2222 mctx->state_log[cur_idx] = next_state;
2223 mctx->state_log_top = cur_idx;
2224 }
2225 else if (mctx->state_log[cur_idx] == 0)
2226 {
2227 mctx->state_log[cur_idx] = next_state;
2228 }
2229 else
2230 {
2231 re_dfastate_t *pstate;
2232 unsigned int context;
2233 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2234 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2235 the destination of a multibyte char/collating element/
2236 back reference. Then the next state is the union set of
2237 these destinations and the results of the transition table. */
2238 pstate = mctx->state_log[cur_idx];
2239 log_nodes = pstate->entrance_nodes;
2240 if (next_state != NULL)
2241 {
2242 table_nodes = next_state->entrance_nodes;
2243 *err = re_node_set_init_union (&next_nodes, table_nodes,
15a7d175 2244 log_nodes);
4c595adb
UD
2245 if (BE (*err != REG_NOERROR, 0))
2246 return NULL;
2247 }
2248 else
2249 next_nodes = *log_nodes;
2250 /* Note: We already add the nodes of the initial state,
2251 then we don't need to add them here. */
2252
2253 context = re_string_context_at (&mctx->input,
2254 re_string_cur_idx (&mctx->input) - 1,
2255 mctx->eflags);
2256 next_state = mctx->state_log[cur_idx]
2257 = re_acquire_state_context (err, dfa, &next_nodes, context);
2258 /* We don't need to check errors here, since the return value of
2259 this function is next_state and ERR is already set. */
2260
2261 if (table_nodes != NULL)
2262 re_node_set_free (&next_nodes);
6291ee3c
UD
2263 }
2264
bb3f4825 2265 if (BE (dfa->nbackref, 0) && next_state != NULL)
6291ee3c 2266 {
bb3f4825
UD
2267 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2268 later. We must check them here, since the back references in the
2269 next state might use them. */
e3a87852 2270 *err = check_subexp_matching_top (mctx, &next_state->nodes,
6291ee3c
UD
2271 cur_idx);
2272 if (BE (*err != REG_NOERROR, 0))
2273 return NULL;
6291ee3c 2274
bb3f4825
UD
2275 /* If the next state has back references. */
2276 if (next_state->has_backref)
2277 {
e3a87852 2278 *err = transit_state_bkref (mctx, &next_state->nodes);
bb3f4825
UD
2279 if (BE (*err != REG_NOERROR, 0))
2280 return NULL;
2281 next_state = mctx->state_log[cur_idx];
2282 }
3b0bdc72 2283 }
4c595adb 2284
3b0bdc72
UD
2285 return next_state;
2286}
2287
4c595adb
UD
2288/* Skip bytes in the input that correspond to part of a
2289 multi-byte match, then look in the log for a state
2290 from which to restart matching. */
2291re_dfastate_t *
2292find_recover_state (err, mctx)
2293 reg_errcode_t *err;
2294 re_match_context_t *mctx;
2295{
2296 re_dfastate_t *cur_state = NULL;
2297 do
2298 {
2299 int max = mctx->state_log_top;
2300 int cur_str_idx = re_string_cur_idx (&mctx->input);
2301
2302 do
2303 {
2304 if (++cur_str_idx > max)
2305 return NULL;
2306 re_string_skip_bytes (&mctx->input, 1);
2307 }
2308 while (mctx->state_log[cur_str_idx] == NULL);
2309
2310 cur_state = merge_state_with_log (err, mctx, NULL);
2311 }
2312 while (err == REG_NOERROR && cur_state == NULL);
2313 return cur_state;
2314}
2315
3b0bdc72
UD
2316/* Helper functions for transit_state. */
2317
6291ee3c
UD
2318/* From the node set CUR_NODES, pick up the nodes whose types are
2319 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2320 expression. And register them to use them later for evaluating the
2321 correspoding back references. */
2322
2323static reg_errcode_t
e3a87852 2324check_subexp_matching_top (mctx, cur_nodes, str_idx)
6291ee3c
UD
2325 re_match_context_t *mctx;
2326 re_node_set *cur_nodes;
2327 int str_idx;
2328{
e3a87852 2329 re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
2330 int node_idx;
2331 reg_errcode_t err;
2332
2333 /* TODO: This isn't efficient.
2334 Because there might be more than one nodes whose types are
2335 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2336 nodes.
2337 E.g. RE: (a){2} */
2338 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2339 {
2340 int node = cur_nodes->elems[node_idx];
2341 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
ce859332 2342 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
6291ee3c
UD
2343 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2344 {
2345 err = match_ctx_add_subtop (mctx, node, str_idx);
2346 if (BE (err != REG_NOERROR, 0))
2347 return err;
2348 }
2349 }
2350 return REG_NOERROR;
2351}
2352
c13c99fa 2353#if 0
3b0bdc72
UD
2354/* Return the next state to which the current state STATE will transit by
2355 accepting the current input byte. */
2356
2357static re_dfastate_t *
e3a87852 2358transit_state_sb (err, mctx, state)
a9388965 2359 reg_errcode_t *err;
a9388965 2360 re_match_context_t *mctx;
e3a87852 2361 re_dfastate_t *state;
3b0bdc72 2362{
e3a87852 2363 re_dfa_t *const dfa = mctx->dfa;
3b0bdc72
UD
2364 re_node_set next_nodes;
2365 re_dfastate_t *next_state;
56b168be 2366 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72
UD
2367 unsigned int context;
2368
a9388965 2369 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
bc15410e 2370 if (BE (*err != REG_NOERROR, 0))
a9388965 2371 return NULL;
3b0bdc72
UD
2372 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2373 {
2374 int cur_node = state->nodes.elems[node_cnt];
e3a87852 2375 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
15a7d175
UD
2376 {
2377 *err = re_node_set_merge (&next_nodes,
2378 dfa->eclosures + dfa->nexts[cur_node]);
2379 if (BE (*err != REG_NOERROR, 0))
2380 {
2381 re_node_set_free (&next_nodes);
2382 return NULL;
2383 }
2384 }
3b0bdc72 2385 }
56b168be 2386 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
a9388965
UD
2387 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2388 /* We don't need to check errors here, since the return value of
2389 this function is next_state and ERR is already set. */
2390
3b0bdc72 2391 re_node_set_free (&next_nodes);
56b168be 2392 re_string_skip_bytes (&mctx->input, 1);
3b0bdc72
UD
2393 return next_state;
2394}
c13c99fa 2395#endif
3b0bdc72 2396
434d3784 2397#ifdef RE_ENABLE_I18N
a9388965 2398static reg_errcode_t
e3a87852 2399transit_state_mb (mctx, pstate)
3b0bdc72 2400 re_match_context_t *mctx;
e3a87852 2401 re_dfastate_t *pstate;
3b0bdc72 2402{
e3a87852 2403 re_dfa_t *const dfa = mctx->dfa;
a9388965 2404 reg_errcode_t err;
3b0bdc72
UD
2405 int i;
2406
2407 for (i = 0; i < pstate->nodes.nelem; ++i)
2408 {
2409 re_node_set dest_nodes, *new_nodes;
2410 int cur_node_idx = pstate->nodes.elems[i];
2411 int naccepted = 0, dest_idx;
2412 unsigned int context;
2413 re_dfastate_t *dest_state;
2414
485d775d 2415 if (dfa->nodes[cur_node_idx].constraint)
15a7d175 2416 {
56b168be
UD
2417 context = re_string_context_at (&mctx->input,
2418 re_string_cur_idx (&mctx->input),
2419 mctx->eflags);
15a7d175
UD
2420 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2421 context))
2422 continue;
2423 }
3b0bdc72 2424
ad7f28c2 2425 /* How many bytes the node can accept? */
3b0bdc72 2426 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
56b168be
UD
2427 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2428 re_string_cur_idx (&mctx->input));
3b0bdc72 2429 if (naccepted == 0)
15a7d175 2430 continue;
3b0bdc72
UD
2431
2432 /* The node can accepts `naccepted' bytes. */
56b168be 2433 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
0742e48e 2434 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
15a7d175 2435 : mctx->max_mb_elem_len);
7c1be3ec 2436 err = clean_state_log_if_needed (mctx, dest_idx);
612546c6 2437 if (BE (err != REG_NOERROR, 0))
15a7d175 2438 return err;
3b0bdc72
UD
2439#ifdef DEBUG
2440 assert (dfa->nexts[cur_node_idx] != -1);
2441#endif
2442 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
15a7d175 2443 then we use pstate->nodes.elems[i] instead. */
3b0bdc72
UD
2444 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2445
612546c6 2446 dest_state = mctx->state_log[dest_idx];
3b0bdc72 2447 if (dest_state == NULL)
15a7d175 2448 dest_nodes = *new_nodes;
3b0bdc72 2449 else
15a7d175
UD
2450 {
2451 err = re_node_set_init_union (&dest_nodes,
2452 dest_state->entrance_nodes, new_nodes);
2453 if (BE (err != REG_NOERROR, 0))
2454 return err;
2455 }
56b168be 2456 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
612546c6 2457 mctx->state_log[dest_idx]
15a7d175 2458 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
3b0bdc72 2459 if (dest_state != NULL)
15a7d175 2460 re_node_set_free (&dest_nodes);
1b2c2628 2461 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
15a7d175 2462 return err;
3b0bdc72 2463 }
a9388965 2464 return REG_NOERROR;
3b0bdc72 2465}
434d3784 2466#endif /* RE_ENABLE_I18N */
3b0bdc72 2467
a9388965 2468static reg_errcode_t
e3a87852 2469transit_state_bkref (mctx, nodes)
3b0bdc72 2470 re_match_context_t *mctx;
e3a87852 2471 const re_node_set *nodes;
3b0bdc72 2472{
e3a87852 2473 re_dfa_t *const dfa = mctx->dfa;
a9388965 2474 reg_errcode_t err;
0742e48e 2475 int i;
56b168be 2476 int cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72
UD
2477
2478 for (i = 0; i < nodes->nelem; ++i)
2479 {
6291ee3c 2480 int dest_str_idx, prev_nelem, bkc_idx;
3b0bdc72
UD
2481 int node_idx = nodes->elems[i];
2482 unsigned int context;
fe9434bb 2483 const re_token_t *node = dfa->nodes + node_idx;
3b0bdc72
UD
2484 re_node_set *new_dest_nodes;
2485
2486 /* Check whether `node' is a backreference or not. */
6291ee3c 2487 if (node->type != OP_BACK_REF)
15a7d175 2488 continue;
485d775d
UD
2489
2490 if (node->constraint)
15a7d175 2491 {
56b168be
UD
2492 context = re_string_context_at (&mctx->input, cur_str_idx,
2493 mctx->eflags);
15a7d175
UD
2494 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2495 continue;
2496 }
3b0bdc72
UD
2497
2498 /* `node' is a backreference.
15a7d175 2499 Check the substring which the substring matched. */
6291ee3c 2500 bkc_idx = mctx->nbkref_ents;
e3a87852 2501 err = get_subexp (mctx, node_idx, cur_str_idx);
612546c6 2502 if (BE (err != REG_NOERROR, 0))
15a7d175 2503 goto free_return;
3b0bdc72
UD
2504
2505 /* And add the epsilon closures (which is `new_dest_nodes') of
15a7d175 2506 the backreference to appropriate state_log. */
3b0bdc72
UD
2507#ifdef DEBUG
2508 assert (dfa->nexts[node_idx] != -1);
2509#endif
6291ee3c 2510 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
15a7d175
UD
2511 {
2512 int subexp_len;
2513 re_dfastate_t *dest_state;
2514 struct re_backref_cache_entry *bkref_ent;
2515 bkref_ent = mctx->bkref_ents + bkc_idx;
2516 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2517 continue;
2518 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2519 new_dest_nodes = (subexp_len == 0
2520 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2521 : dfa->eclosures + dfa->nexts[node_idx]);
2522 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2523 - bkref_ent->subexp_from);
56b168be
UD
2524 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2525 mctx->eflags);
15a7d175
UD
2526 dest_state = mctx->state_log[dest_str_idx];
2527 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2528 : mctx->state_log[cur_str_idx]->nodes.nelem);
2529 /* Add `new_dest_node' to state_log. */
2530 if (dest_state == NULL)
2531 {
2532 mctx->state_log[dest_str_idx]
2533 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2534 context);
2535 if (BE (mctx->state_log[dest_str_idx] == NULL
2536 && err != REG_NOERROR, 0))
2537 goto free_return;
2538 }
2539 else
2540 {
2541 re_node_set dest_nodes;
2542 err = re_node_set_init_union (&dest_nodes,
2543 dest_state->entrance_nodes,
2544 new_dest_nodes);
2545 if (BE (err != REG_NOERROR, 0))
2546 {
2547 re_node_set_free (&dest_nodes);
2548 goto free_return;
2549 }
2550 mctx->state_log[dest_str_idx]
2551 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2552 re_node_set_free (&dest_nodes);
2553 if (BE (mctx->state_log[dest_str_idx] == NULL
2554 && err != REG_NOERROR, 0))
2555 goto free_return;
2556 }
2557 /* We need to check recursively if the backreference can epsilon
2558 transit. */
2559 if (subexp_len == 0
2560 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2561 {
e3a87852 2562 err = check_subexp_matching_top (mctx, new_dest_nodes,
6291ee3c
UD
2563 cur_str_idx);
2564 if (BE (err != REG_NOERROR, 0))
2565 goto free_return;
e3a87852 2566 err = transit_state_bkref (mctx, new_dest_nodes);
15a7d175
UD
2567 if (BE (err != REG_NOERROR, 0))
2568 goto free_return;
2569 }
2570 }
3b0bdc72 2571 }
1b2c2628
UD
2572 err = REG_NOERROR;
2573 free_return:
1b2c2628 2574 return err;
3b0bdc72
UD
2575}
2576
6291ee3c
UD
2577/* Enumerate all the candidates which the backreference BKREF_NODE can match
2578 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2579 Note that we might collect inappropriate candidates here.
2580 However, the cost of checking them strictly here is too high, then we
2581 delay these checking for prune_impossible_nodes(). */
3b0bdc72 2582
6291ee3c 2583static reg_errcode_t
e3a87852 2584get_subexp (mctx, bkref_node, bkref_str_idx)
6291ee3c
UD
2585 re_match_context_t *mctx;
2586 int bkref_node, bkref_str_idx;
3b0bdc72 2587{
e3a87852 2588 re_dfa_t *const dfa = mctx->dfa;
6291ee3c 2589 int subexp_num, sub_top_idx;
56b168be 2590 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
6291ee3c
UD
2591 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2592 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2593 for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2594 {
fe9434bb
UD
2595 const struct re_backref_cache_entry *entry
2596 = &mctx->bkref_ents[cache_idx];
6291ee3c
UD
2597 if (entry->str_idx > bkref_str_idx)
2598 break;
2599 if (entry->node == bkref_node)
2600 return REG_NOERROR; /* We already checked it. */
2601 }
2602 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2603
2604 /* For each sub expression */
2605 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2606 {
2607 reg_errcode_t err;
2608 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2609 re_sub_match_last_t *sub_last;
7c1be3ec 2610 int sub_last_idx, sl_str, bkref_str_off;
6291ee3c
UD
2611
2612 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2613 continue; /* It isn't related. */
2614
2615 sl_str = sub_top->str_idx;
7c1be3ec 2616 bkref_str_off = bkref_str_idx;
6291ee3c
UD
2617 /* At first, check the last node of sub expressions we already
2618 evaluated. */
2619 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2620 {
2621 int sl_str_diff;
2622 sub_last = sub_top->lasts[sub_last_idx];
2623 sl_str_diff = sub_last->str_idx - sl_str;
2624 /* The matched string by the sub expression match with the substring
2625 at the back reference? */
c1baba0f
UD
2626 if (sl_str_diff > 0)
2627 {
2628 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2629 {
2630 /* Not enough chars for a successful match. */
2631 if (bkref_str_off + sl_str_diff > mctx->input.len)
2632 break;
2633
2634 err = clean_state_log_if_needed (mctx,
2635 bkref_str_off
2636 + sl_str_diff);
2637 if (BE (err != REG_NOERROR, 0))
2638 return err;
2639 buf = (const char *) re_string_get_buffer (&mctx->input);
2640 }
2641 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2642 break; /* We don't need to search this sub expression any more. */
2643 }
7c1be3ec 2644 bkref_str_off += sl_str_diff;
6291ee3c 2645 sl_str += sl_str_diff;
e3a87852 2646 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
6291ee3c 2647 bkref_str_idx);
3ee363e2 2648
7c1be3ec
UD
2649 /* Reload buf, since the preceding call might have reallocated
2650 the buffer. */
56b168be 2651 buf = (const char *) re_string_get_buffer (&mctx->input);
3ee363e2 2652
6291ee3c
UD
2653 if (err == REG_NOMATCH)
2654 continue;
2655 if (BE (err != REG_NOERROR, 0))
2656 return err;
2657 }
7c1be3ec 2658
6291ee3c
UD
2659 if (sub_last_idx < sub_top->nlasts)
2660 continue;
2661 if (sub_last_idx > 0)
2662 ++sl_str;
2663 /* Then, search for the other last nodes of the sub expression. */
2664 for (; sl_str <= bkref_str_idx; ++sl_str)
2665 {
2666 int cls_node, sl_str_off;
fe9434bb 2667 const re_node_set *nodes;
6291ee3c
UD
2668 sl_str_off = sl_str - sub_top->str_idx;
2669 /* The matched string by the sub expression match with the substring
2670 at the back reference? */
c1baba0f
UD
2671 if (sl_str_off > 0)
2672 {
2673 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2674 {
2675 /* If we are at the end of the input, we cannot match. */
2676 if (bkref_str_off >= mctx->input.len)
2677 break;
2678
2679 err = extend_buffers (mctx);
2680 if (BE (err != REG_NOERROR, 0))
2681 return err;
2682
2683 buf = (const char *) re_string_get_buffer (&mctx->input);
2684 }
2685 if (buf [bkref_str_off++] != buf[sl_str - 1])
2686 break; /* We don't need to search this sub expression
2687 any more. */
2688 }
6291ee3c
UD
2689 if (mctx->state_log[sl_str] == NULL)
2690 continue;
2691 /* Does this state have a ')' of the sub expression? */
2692 nodes = &mctx->state_log[sl_str]->nodes;
0ce7f49c 2693 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
6291ee3c
UD
2694 if (cls_node == -1)
2695 continue; /* No. */
2696 if (sub_top->path == NULL)
2697 {
2698 sub_top->path = calloc (sizeof (state_array_t),
2699 sl_str - sub_top->str_idx + 1);
2700 if (sub_top->path == NULL)
2701 return REG_ESPACE;
2702 }
2703 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2704 in the current context? */
e3a87852 2705 err = check_arrival (mctx, sub_top->path, sub_top->node,
0ce7f49c 2706 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
6291ee3c
UD
2707 if (err == REG_NOMATCH)
2708 continue;
2709 if (BE (err != REG_NOERROR, 0))
2710 return err;
2711 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2712 if (BE (sub_last == NULL, 0))
2713 return REG_ESPACE;
e3a87852 2714 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
6291ee3c
UD
2715 bkref_str_idx);
2716 if (err == REG_NOMATCH)
2717 continue;
2718 }
2719 }
2720 return REG_NOERROR;
2721}
2722
2723/* Helper functions for get_subexp(). */
2724
2725/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2726 If it can arrive, register the sub expression expressed with SUB_TOP
2727 and SUB_LAST. */
2728
2729static reg_errcode_t
e3a87852 2730get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
6291ee3c 2731 re_match_context_t *mctx;
fe9434bb 2732 const re_sub_match_top_t *sub_top;
6291ee3c
UD
2733 re_sub_match_last_t *sub_last;
2734 int bkref_node, bkref_str;
2735{
2736 reg_errcode_t err;
2737 int to_idx;
2738 /* Can the subexpression arrive the back reference? */
e3a87852 2739 err = check_arrival (mctx, &sub_last->path, sub_last->node,
0ce7f49c 2740 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
6291ee3c
UD
2741 if (err != REG_NOERROR)
2742 return err;
2743 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2744 sub_last->str_idx);
2745 if (BE (err != REG_NOERROR, 0))
2746 return err;
2747 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
c1baba0f 2748 return clean_state_log_if_needed (mctx, to_idx);
6291ee3c
UD
2749}
2750
2751/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2752 Search '(' if FL_OPEN, or search ')' otherwise.
2753 TODO: This function isn't efficient...
2754 Because there might be more than one nodes whose types are
2755 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2756 nodes.
2757 E.g. RE: (a){2} */
2758
2759static int
0ce7f49c 2760find_subexp_node (dfa, nodes, subexp_idx, type)
fe9434bb
UD
2761 const re_dfa_t *dfa;
2762 const re_node_set *nodes;
0ce7f49c 2763 int subexp_idx, type;
6291ee3c
UD
2764{
2765 int cls_idx;
2766 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2767 {
2768 int cls_node = nodes->elems[cls_idx];
fe9434bb 2769 const re_token_t *node = dfa->nodes + cls_node;
0ce7f49c 2770 if (node->type == type
6291ee3c
UD
2771 && node->opr.idx == subexp_idx)
2772 return cls_node;
2773 }
2774 return -1;
2775}
2776
2777/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2778 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2779 heavily reused.
2780 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2781
2782static reg_errcode_t
e3a87852 2783check_arrival (mctx, path, top_node, top_str, last_node, last_str,
0ce7f49c 2784 type)
6291ee3c
UD
2785 re_match_context_t *mctx;
2786 state_array_t *path;
0ce7f49c 2787 int top_node, top_str, last_node, last_str, type;
6291ee3c 2788{
e3a87852 2789 re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
2790 reg_errcode_t err;
2791 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2792 re_dfastate_t *cur_state = NULL;
2793 re_node_set *cur_nodes, next_nodes;
2794 re_dfastate_t **backup_state_log;
2795 unsigned int context;
2796
2797 subexp_num = dfa->nodes[top_node].opr.idx;
2798 /* Extend the buffer if we need. */
951d6408 2799 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
6291ee3c
UD
2800 {
2801 re_dfastate_t **new_array;
2802 int old_alloc = path->alloc;
2803 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2804 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2805 if (new_array == NULL)
951d6408
UD
2806 {
2807 path->alloc = old_alloc;
2808 return REG_ESPACE;
2809 }
6291ee3c
UD
2810 path->array = new_array;
2811 memset (new_array + old_alloc, '\0',
2812 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2813 }
2814
2815 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2816
2817 /* Temporary modify MCTX. */
2818 backup_state_log = mctx->state_log;
56b168be 2819 backup_cur_idx = mctx->input.cur_idx;
6291ee3c 2820 mctx->state_log = path->array;
56b168be 2821 mctx->input.cur_idx = str_idx;
6291ee3c
UD
2822
2823 /* Setup initial node set. */
56b168be 2824 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
6291ee3c
UD
2825 if (str_idx == top_str)
2826 {
2827 err = re_node_set_init_1 (&next_nodes, top_node);
2828 if (BE (err != REG_NOERROR, 0))
2829 return err;
0ce7f49c 2830 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
6291ee3c
UD
2831 if (BE (err != REG_NOERROR, 0))
2832 {
2833 re_node_set_free (&next_nodes);
2834 return err;
2835 }
2836 }
2837 else
2838 {
2839 cur_state = mctx->state_log[str_idx];
2840 if (cur_state && cur_state->has_backref)
2841 {
2842 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2843 if (BE ( err != REG_NOERROR, 0))
2844 return err;
2845 }
2846 else
2847 re_node_set_init_empty (&next_nodes);
2848 }
2849 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2850 {
2851 if (next_nodes.nelem)
2852 {
e3a87852 2853 err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
0ce7f49c 2854 subexp_num, type);
6291ee3c
UD
2855 if (BE ( err != REG_NOERROR, 0))
2856 {
2857 re_node_set_free (&next_nodes);
2858 return err;
2859 }
2860 }
2861 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2862 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2863 {
2864 re_node_set_free (&next_nodes);
2865 return err;
2866 }
2867 mctx->state_log[str_idx] = cur_state;
2868 }
2869
2870 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2871 {
2872 re_node_set_empty (&next_nodes);
2873 if (mctx->state_log[str_idx + 1])
2874 {
2875 err = re_node_set_merge (&next_nodes,
2876 &mctx->state_log[str_idx + 1]->nodes);
2877 if (BE (err != REG_NOERROR, 0))
2878 {
2879 re_node_set_free (&next_nodes);
2880 return err;
2881 }
2882 }
2883 if (cur_state)
2884 {
e3a87852 2885 err = check_arrival_add_next_nodes (mctx, str_idx,
46bf9de7 2886 &cur_state->nodes, &next_nodes);
6291ee3c
UD
2887 if (BE (err != REG_NOERROR, 0))
2888 {
2889 re_node_set_free (&next_nodes);
2890 return err;
2891 }
2892 }
2893 ++str_idx;
2894 if (next_nodes.nelem)
2895 {
0ce7f49c 2896 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
6291ee3c
UD
2897 if (BE (err != REG_NOERROR, 0))
2898 {
2899 re_node_set_free (&next_nodes);
2900 return err;
2901 }
e3a87852 2902 err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
0ce7f49c 2903 subexp_num, type);
6291ee3c
UD
2904 if (BE ( err != REG_NOERROR, 0))
2905 {
2906 re_node_set_free (&next_nodes);
2907 return err;
2908 }
2909 }
56b168be 2910 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
6291ee3c
UD
2911 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2912 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2913 {
2914 re_node_set_free (&next_nodes);
2915 return err;
2916 }
2917 mctx->state_log[str_idx] = cur_state;
2918 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2919 }
2920 re_node_set_free (&next_nodes);
2921 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2922 : &mctx->state_log[last_str]->nodes);
2923 path->next_idx = str_idx;
2924
2925 /* Fix MCTX. */
2926 mctx->state_log = backup_state_log;
56b168be 2927 mctx->input.cur_idx = backup_cur_idx;
6291ee3c 2928
6291ee3c 2929 /* Then check the current node set has the node LAST_NODE. */
c0d5034e
UD
2930 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2931 return REG_NOERROR;
2932
2933 return REG_NOMATCH;
6291ee3c
UD
2934}
2935
2936/* Helper functions for check_arrival. */
2937
2938/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2939 to NEXT_NODES.
2940 TODO: This function is similar to the functions transit_state*(),
2941 however this function has many additional works.
2942 Can't we unify them? */
2943
2944static reg_errcode_t
e3a87852 2945check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
6291ee3c
UD
2946 re_match_context_t *mctx;
2947 int str_idx;
2948 re_node_set *cur_nodes, *next_nodes;
2949{
e3a87852 2950 re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
2951 int cur_idx;
2952 reg_errcode_t err;
2953 re_node_set union_set;
2954 re_node_set_init_empty (&union_set);
2955 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2956 {
2957 int naccepted = 0;
2958 int cur_node = cur_nodes->elems[cur_idx];
2959 re_token_type_t type = dfa->nodes[cur_node].type;
46bf9de7 2960 if (IS_EPSILON_NODE (type))
6291ee3c
UD
2961 continue;
2962#ifdef RE_ENABLE_I18N
2963 /* If the node may accept `multi byte'. */
2964 if (ACCEPT_MB_NODE (type))
2965 {
56b168be 2966 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
6291ee3c
UD
2967 str_idx);
2968 if (naccepted > 1)
2969 {
2970 re_dfastate_t *dest_state;
2971 int next_node = dfa->nexts[cur_node];
2972 int next_idx = str_idx + naccepted;
2973 dest_state = mctx->state_log[next_idx];
2974 re_node_set_empty (&union_set);
2975 if (dest_state)
2976 {
2977 err = re_node_set_merge (&union_set, &dest_state->nodes);
2978 if (BE (err != REG_NOERROR, 0))
2979 {
2980 re_node_set_free (&union_set);
2981 return err;
2982 }
6291ee3c 2983 }
44777771
UD
2984 err = re_node_set_insert (&union_set, next_node);
2985 if (BE (err < 0, 0))
6291ee3c 2986 {
44777771
UD
2987 re_node_set_free (&union_set);
2988 return REG_ESPACE;
6291ee3c
UD
2989 }
2990 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2991 &union_set);
2992 if (BE (mctx->state_log[next_idx] == NULL
2993 && err != REG_NOERROR, 0))
2994 {
2995 re_node_set_free (&union_set);
2996 return err;
2997 }
2998 }
2999 }
3000#endif /* RE_ENABLE_I18N */
3001 if (naccepted
e3a87852 3002 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
6291ee3c
UD
3003 {
3004 err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3005 if (BE (err < 0, 0))
3006 {
3007 re_node_set_free (&union_set);
3008 return REG_ESPACE;
3009 }
3010 }
3011 }
3012 re_node_set_free (&union_set);
3013 return REG_NOERROR;
3014}
3015
3016/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3017 CUR_NODES, however exclude the nodes which are:
3018 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3019 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3020*/
3021
3022static reg_errcode_t
0ce7f49c 3023check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
6291ee3c
UD
3024 re_dfa_t *dfa;
3025 re_node_set *cur_nodes;
0ce7f49c 3026 int ex_subexp, type;
6291ee3c
UD
3027{
3028 reg_errcode_t err;
3029 int idx, outside_node;
3030 re_node_set new_nodes;
3031#ifdef DEBUG
3032 assert (cur_nodes->nelem);
3033#endif
3034 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3035 if (BE (err != REG_NOERROR, 0))
3036 return err;
3037 /* Create a new node set NEW_NODES with the nodes which are epsilon
3038 closures of the node in CUR_NODES. */
3039
3040 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3041 {
3042 int cur_node = cur_nodes->elems[idx];
3043 re_node_set *eclosure = dfa->eclosures + cur_node;
0ce7f49c 3044 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
6291ee3c
UD
3045 if (outside_node == -1)
3046 {
3047 /* There are no problematic nodes, just merge them. */
3048 err = re_node_set_merge (&new_nodes, eclosure);
3049 if (BE (err != REG_NOERROR, 0))
3050 {
3051 re_node_set_free (&new_nodes);
3052 return err;
3053 }
3054 }
3055 else
3056 {
3057 /* There are problematic nodes, re-calculate incrementally. */
3058 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
0ce7f49c 3059 ex_subexp, type);
6291ee3c
UD
3060 if (BE (err != REG_NOERROR, 0))
3061 {
3062 re_node_set_free (&new_nodes);
3063 return err;
3064 }
3065 }
3066 }
3067 re_node_set_free (cur_nodes);
3068 *cur_nodes = new_nodes;
3069 return REG_NOERROR;
3070}
3071
3072/* Helper function for check_arrival_expand_ecl.
3073 Check incrementally the epsilon closure of TARGET, and if it isn't
3074 problematic append it to DST_NODES. */
3075
3076static reg_errcode_t
0ce7f49c 3077check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
6291ee3c 3078 re_dfa_t *dfa;
0ce7f49c 3079 int target, ex_subexp, type;
6291ee3c
UD
3080 re_node_set *dst_nodes;
3081{
0ce7f49c 3082 int cur_node;
6291ee3c
UD
3083 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3084 {
3085 int err;
6291ee3c 3086
0ce7f49c 3087 if (dfa->nodes[cur_node].type == type
6291ee3c
UD
3088 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3089 {
0ce7f49c 3090 if (type == OP_CLOSE_SUBEXP)
6291ee3c
UD
3091 {
3092 err = re_node_set_insert (dst_nodes, cur_node);
3093 if (BE (err == -1, 0))
3094 return REG_ESPACE;
3095 }
3096 break;
3097 }
3098 err = re_node_set_insert (dst_nodes, cur_node);
3099 if (BE (err == -1, 0))
3100 return REG_ESPACE;
3101 if (dfa->edests[cur_node].nelem == 0)
3102 break;
3103 if (dfa->edests[cur_node].nelem == 2)
3104 {
3105 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3106 dfa->edests[cur_node].elems[1],
0ce7f49c 3107 ex_subexp, type);
6291ee3c
UD
3108 if (BE (err != REG_NOERROR, 0))
3109 return err;
3110 }
3111 cur_node = dfa->edests[cur_node].elems[0];
3112 }
3113 return REG_NOERROR;
3114}
3115
3116
3117/* For all the back references in the current state, calculate the
3118 destination of the back references by the appropriate entry
3119 in MCTX->BKREF_ENTS. */
3120
3121static reg_errcode_t
e3a87852 3122expand_bkref_cache (mctx, cur_nodes, cur_str, last_str, subexp_num,
0ce7f49c 3123 type)
6291ee3c 3124 re_match_context_t *mctx;
0ce7f49c 3125 int cur_str, last_str, subexp_num, type;
6291ee3c
UD
3126 re_node_set *cur_nodes;
3127{
e3a87852 3128 re_dfa_t *const dfa = mctx->dfa;
6291ee3c 3129 reg_errcode_t err;
6291ee3c
UD
3130 int cache_idx, cache_idx_start;
3131 /* The current state. */
3132
3133 cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3134 for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3135 {
3136 int to_idx, next_node;
3137 struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3138 if (ent->str_idx > cur_str)
3139 break;
3140 /* Is this entry ENT is appropriate? */
3141 if (!re_node_set_contains (cur_nodes, ent->node))
3142 continue; /* No. */
3143
3144 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3145 /* Calculate the destination of the back reference, and append it
3146 to MCTX->STATE_LOG. */
3147 if (to_idx == cur_str)
3148 {
3149 /* The backreference did epsilon transit, we must re-check all the
3150 node in the current state. */
3151 re_node_set new_dests;
3152 reg_errcode_t err2, err3;
3153 next_node = dfa->edests[ent->node].elems[0];
3154 if (re_node_set_contains (cur_nodes, next_node))
3155 continue;
3156 err = re_node_set_init_1 (&new_dests, next_node);
0ce7f49c 3157 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
6291ee3c
UD
3158 err3 = re_node_set_merge (cur_nodes, &new_dests);
3159 re_node_set_free (&new_dests);
3160 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3161 || err3 != REG_NOERROR, 0))
3162 {
3163 err = (err != REG_NOERROR ? err
3164 : (err2 != REG_NOERROR ? err2 : err3));
3165 return err;
3166 }
3167 /* TODO: It is still inefficient... */
3168 cache_idx = cache_idx_start - 1;
3169 continue;
3170 }
3171 else
3172 {
3173 re_node_set union_set;
3174 next_node = dfa->nexts[ent->node];
3175 if (mctx->state_log[to_idx])
3176 {
3177 int ret;
3178 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3179 next_node))
3180 continue;
3181 err = re_node_set_init_copy (&union_set,
3182 &mctx->state_log[to_idx]->nodes);
3183 ret = re_node_set_insert (&union_set, next_node);
3184 if (BE (err != REG_NOERROR || ret < 0, 0))
3185 {
3186 re_node_set_free (&union_set);
3187 err = err != REG_NOERROR ? err : REG_ESPACE;
3188 return err;
3189 }
3190 }
3191 else
3192 {
3193 err = re_node_set_init_1 (&union_set, next_node);
3194 if (BE (err != REG_NOERROR, 0))
3195 return err;
3196 }
3197 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3198 re_node_set_free (&union_set);
3199 if (BE (mctx->state_log[to_idx] == NULL
3200 && err != REG_NOERROR, 0))
3201 return err;
3202 }
3203 }
3204 return REG_NOERROR;
3205}
3206
3207/* Build transition table for the state.
3208 Return the new table if succeeded, otherwise return NULL. */
3209
3210static re_dfastate_t **
56b168be
UD
3211build_trtable (dfa, state)
3212 re_dfa_t *dfa;
c13c99fa 3213 re_dfastate_t *state;
6291ee3c
UD
3214{
3215 reg_errcode_t err;
3ce12656
UD
3216 int i, j, ch;
3217 unsigned int elem, mask;
6291ee3c
UD
3218 int dests_node_malloced = 0, dest_states_malloced = 0;
3219 int ndests; /* Number of the destination states from `state'. */
3220 re_dfastate_t **trtable;
3221 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3222 re_node_set follows, *dests_node;
3223 bitset *dests_ch;
3224 bitset acceptable;
3b0bdc72
UD
3225
3226 /* We build DFA states which corresponds to the destination nodes
3227 from `state'. `dests_node[i]' represents the nodes which i-th
3228 destination state contains, and `dests_ch[i]' represents the
3229 characters which i-th destination state accepts. */
05dab910
RM
3230#ifdef _LIBC
3231 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3232 dests_node = (re_node_set *)
3233 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3234 else
3235#endif
3236 {
3237 dests_node = (re_node_set *)
3238 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3239 if (BE (dests_node == NULL, 0))
3240 return NULL;
3241 dests_node_malloced = 1;
3242 }
3243 dests_ch = (bitset *) (dests_node + SBC_MAX);
3b0bdc72
UD
3244
3245 /* Initialize transiton table. */
c13c99fa 3246 state->word_trtable = 0;
3b0bdc72
UD
3247
3248 /* At first, group all nodes belonging to `state' into several
3249 destinations. */
56b168be 3250 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
bc15410e 3251 if (BE (ndests <= 0, 0))
3b0bdc72 3252 {
05dab910
RM
3253 if (dests_node_malloced)
3254 free (dests_node);
a9388965 3255 /* Return NULL in case of an error, trtable otherwise. */
05dab910 3256 if (ndests == 0)
c13c99fa 3257 {
3ce12656
UD
3258 state->trtable = (re_dfastate_t **)
3259 calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3260 return state->trtable;
c13c99fa 3261 }
05dab910 3262 return NULL;
3b0bdc72
UD
3263 }
3264
a9388965 3265 err = re_node_set_alloc (&follows, ndests + 1);
05dab910
RM
3266 if (BE (err != REG_NOERROR, 0))
3267 goto out_free;
3268
3269#ifdef _LIBC
3270 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3271 + ndests * 3 * sizeof (re_dfastate_t *)))
3272 dest_states = (re_dfastate_t **)
3273 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3274 else
3275#endif
3276 {
3277 dest_states = (re_dfastate_t **)
3278 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3279 if (BE (dest_states == NULL, 0))
3280 {
3281out_free:
3282 if (dest_states_malloced)
3283 free (dest_states);
3284 re_node_set_free (&follows);
3285 for (i = 0; i < ndests; ++i)
3286 re_node_set_free (dests_node + i);
05dab910
RM
3287 if (dests_node_malloced)
3288 free (dests_node);
3289 return NULL;
3290 }
3291 dest_states_malloced = 1;
3292 }
3293 dest_states_word = dest_states + ndests;
3294 dest_states_nl = dest_states_word + ndests;
3295 bitset_empty (acceptable);
a9388965 3296
3b0bdc72
UD
3297 /* Then build the states for all destinations. */
3298 for (i = 0; i < ndests; ++i)
3299 {
3300 int next_node;
3301 re_node_set_empty (&follows);
3302 /* Merge the follows of this destination states. */
3303 for (j = 0; j < dests_node[i].nelem; ++j)
15a7d175
UD
3304 {
3305 next_node = dfa->nexts[dests_node[i].elems[j]];
3306 if (next_node != -1)
3307 {
3308 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3309 if (BE (err != REG_NOERROR, 0))
05dab910 3310 goto out_free;
15a7d175
UD
3311 }
3312 }
a9388965 3313 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
bc15410e 3314 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
15a7d175 3315 goto out_free;
3b0bdc72 3316 /* If the new state has context constraint,
15a7d175 3317 build appropriate states for these contexts. */
3b0bdc72 3318 if (dest_states[i]->has_constraint)
15a7d175
UD
3319 {
3320 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3321 CONTEXT_WORD);
3322 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3323 goto out_free;
3ce12656
UD
3324
3325 if (dest_states[i] != dest_states_word[i]
3326 && dfa->mb_cur_max > 1)
3327 state->word_trtable = 1;
3328
15a7d175
UD
3329 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3330 CONTEXT_NEWLINE);
3331 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3332 goto out_free;
3ce12656 3333 }
3b0bdc72 3334 else
15a7d175
UD
3335 {
3336 dest_states_word[i] = dest_states[i];
3337 dest_states_nl[i] = dest_states[i];
3338 }
3b0bdc72
UD
3339 bitset_merge (acceptable, dests_ch[i]);
3340 }
3341
3ce12656
UD
3342 if (!BE (state->word_trtable, 0))
3343 {
3344 /* We don't care about whether the following character is a word
3345 character, or we are in a single-byte character set so we can
3346 discern by looking at the character code: allocate a
3347 256-entry transition table. */
3348 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3349 if (BE (trtable == NULL, 0))
3350 goto out_free;
3351
3352 /* For all characters ch...: */
3353 for (i = 0; i < BITSET_UINTS; ++i)
3354 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3355 elem;
3356 mask <<= 1, elem >>= 1, ++ch)
3357 if (BE (elem & 1, 0))
3358 {
3359 /* There must be exactly one destination which accepts
3360 character ch. See group_nodes_into_DFAstates. */
3361 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3362 ;
6b6557e8 3363
3ce12656 3364 /* j-th destination accepts the word character ch. */
56b168be 3365 if (dfa->word_char[i] & mask)
3ce12656
UD
3366 trtable[ch] = dest_states_word[j];
3367 else
3368 trtable[ch] = dest_states[j];
3369 }
3370 }
3371 else
3372 {
3373 /* We care about whether the following character is a word
3374 character, and we are in a multi-byte character set: discern
3375 by looking at the character code: build two 256-entry
3376 transition tables, one starting at trtable[0] and one
3377 starting at trtable[SBC_MAX]. */
3378 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3379 2 * SBC_MAX);
3380 if (BE (trtable == NULL, 0))
3381 goto out_free;
6b6557e8 3382
3ce12656
UD
3383 /* For all characters ch...: */
3384 for (i = 0; i < BITSET_UINTS; ++i)
3385 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3386 elem;
3387 mask <<= 1, elem >>= 1, ++ch)
3388 if (BE (elem & 1, 0))
3389 {
3390 /* There must be exactly one destination which accepts
3391 character ch. See group_nodes_into_DFAstates. */
3392 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3393 ;
6b6557e8 3394
3ce12656
UD
3395 /* j-th destination accepts the word character ch. */
3396 trtable[ch] = dest_states[j];
3397 trtable[ch + SBC_MAX] = dest_states_word[j];
3398 }
3399 }
6b6557e8 3400
3b0bdc72 3401 /* new line */
c202c2c5
UD
3402 if (bitset_contain (acceptable, NEWLINE_CHAR))
3403 {
3404 /* The current state accepts newline character. */
3ce12656
UD
3405 for (j = 0; j < ndests; ++j)
3406 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
15a7d175
UD
3407 {
3408 /* k-th destination accepts newline character. */
3ce12656 3409 trtable[NEWLINE_CHAR] = dest_states_nl[j];
c13c99fa 3410 if (state->word_trtable)
3ce12656 3411 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
15a7d175
UD
3412 /* There must be only one destination which accepts
3413 newline. See group_nodes_into_DFAstates. */
3414 break;
3415 }
c202c2c5 3416 }
3b0bdc72 3417
05dab910
RM
3418 if (dest_states_malloced)
3419 free (dest_states);
3b0bdc72
UD
3420
3421 re_node_set_free (&follows);
3422 for (i = 0; i < ndests; ++i)
3423 re_node_set_free (dests_node + i);
3424
05dab910
RM
3425 if (dests_node_malloced)
3426 free (dests_node);
3b0bdc72 3427
c13c99fa 3428 state->trtable = trtable;
3b0bdc72
UD
3429 return trtable;
3430}
3431
3432/* Group all nodes belonging to STATE into several destinations.
3433 Then for all destinations, set the nodes belonging to the destination
3434 to DESTS_NODE[i] and set the characters accepted by the destination
3435 to DEST_CH[i]. This function return the number of destinations. */
3436
3437static int
56b168be
UD
3438group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3439 re_dfa_t *dfa;
3b0bdc72
UD
3440 const re_dfastate_t *state;
3441 re_node_set *dests_node;
3442 bitset *dests_ch;
3443{
a9388965 3444 reg_errcode_t err;
3b0bdc72
UD
3445 int i, j, k;
3446 int ndests; /* Number of the destinations from `state'. */
3447 bitset accepts; /* Characters a node can accept. */
3448 const re_node_set *cur_nodes = &state->nodes;
3449 bitset_empty (accepts);
3450 ndests = 0;
3451
3452 /* For all the nodes belonging to `state', */
3453 for (i = 0; i < cur_nodes->nelem; ++i)
3454 {
3b0bdc72
UD
3455 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3456 re_token_type_t type = node->type;
485d775d 3457 unsigned int constraint = node->constraint;
3b0bdc72
UD
3458
3459 /* Enumerate all single byte character this node can accept. */
3460 if (type == CHARACTER)
15a7d175 3461 bitset_set (accepts, node->opr.c);
3b0bdc72 3462 else if (type == SIMPLE_BRACKET)
15a7d175
UD
3463 {
3464 bitset_merge (accepts, node->opr.sbcset);
3465 }
3b0bdc72 3466 else if (type == OP_PERIOD)
15a7d175 3467 {
65e6becf
UD
3468#ifdef RE_ENABLE_I18N
3469 if (dfa->mb_cur_max > 1)
3470 bitset_merge (accepts, dfa->sb_char);
3471 else
0ce7f49c 3472#endif
65e6becf 3473 bitset_set_all (accepts);
56b168be 3474 if (!(dfa->syntax & RE_DOT_NEWLINE))
15a7d175 3475 bitset_clear (accepts, '\n');
56b168be 3476 if (dfa->syntax & RE_DOT_NOT_NULL)
15a7d175
UD
3477 bitset_clear (accepts, '\0');
3478 }
c0d5034e 3479#ifdef RE_ENABLE_I18N
ad7f28c2
UD
3480 else if (type == OP_UTF8_PERIOD)
3481 {
3482 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
56b168be 3483 if (!(dfa->syntax & RE_DOT_NEWLINE))
ad7f28c2 3484 bitset_clear (accepts, '\n');
56b168be 3485 if (dfa->syntax & RE_DOT_NOT_NULL)
ad7f28c2
UD
3486 bitset_clear (accepts, '\0');
3487 }
c0d5034e 3488#endif
3b0bdc72 3489 else
15a7d175 3490 continue;
3b0bdc72
UD
3491
3492 /* Check the `accepts' and sift the characters which are not
15a7d175 3493 match it the context. */
3b0bdc72 3494 if (constraint)
15a7d175 3495 {
15a7d175
UD
3496 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3497 {
3498 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3499 bitset_empty (accepts);
3500 if (accepts_newline)
3501 bitset_set (accepts, NEWLINE_CHAR);
3502 else
3503 continue;
3504 }
66b110e8
UD
3505 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3506 {
3507 bitset_empty (accepts);
3508 continue;
3509 }
c13c99fa 3510
66b110e8 3511 if (constraint & NEXT_WORD_CONSTRAINT)
65e6becf 3512 {
457beec8 3513 unsigned int any_set = 0;
1cef7b3c
UD
3514 if (type == CHARACTER && !node->word_char)
3515 {
3516 bitset_empty (accepts);
3517 continue;
3518 }
65e6becf
UD
3519#ifdef RE_ENABLE_I18N
3520 if (dfa->mb_cur_max > 1)
3521 for (j = 0; j < BITSET_UINTS; ++j)
457beec8 3522 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
65e6becf
UD
3523 else
3524#endif
3525 for (j = 0; j < BITSET_UINTS; ++j)
457beec8
UD
3526 any_set |= (accepts[j] &= dfa->word_char[j]);
3527 if (!any_set)
3528 continue;
65e6becf 3529 }
66b110e8 3530 if (constraint & NEXT_NOTWORD_CONSTRAINT)
65e6becf 3531 {
457beec8 3532 unsigned int any_set = 0;
1cef7b3c
UD
3533 if (type == CHARACTER && node->word_char)
3534 {
3535 bitset_empty (accepts);
3536 continue;
3537 }
65e6becf
UD
3538#ifdef RE_ENABLE_I18N
3539 if (dfa->mb_cur_max > 1)
3540 for (j = 0; j < BITSET_UINTS; ++j)
457beec8 3541 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
65e6becf
UD
3542 else
3543#endif
3544 for (j = 0; j < BITSET_UINTS; ++j)
457beec8
UD
3545 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3546 if (!any_set)
3547 continue;
65e6becf 3548 }
15a7d175 3549 }
3b0bdc72
UD
3550
3551 /* Then divide `accepts' into DFA states, or create a new
457beec8 3552 state. Above, we make sure that accepts is not empty. */
3b0bdc72 3553 for (j = 0; j < ndests; ++j)
15a7d175
UD
3554 {
3555 bitset intersec; /* Intersection sets, see below. */
3556 bitset remains;
3557 /* Flags, see below. */
3558 int has_intersec, not_subset, not_consumed;
3559
3560 /* Optimization, skip if this state doesn't accept the character. */
3561 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3562 continue;
3563
3564 /* Enumerate the intersection set of this state and `accepts'. */
3565 has_intersec = 0;
3566 for (k = 0; k < BITSET_UINTS; ++k)
3567 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3568 /* And skip if the intersection set is empty. */
3569 if (!has_intersec)
3570 continue;
3571
3572 /* Then check if this state is a subset of `accepts'. */
3573 not_subset = not_consumed = 0;
3574 for (k = 0; k < BITSET_UINTS; ++k)
3575 {
3576 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3577 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3578 }
3579
3580 /* If this state isn't a subset of `accepts', create a
3581 new group state, which has the `remains'. */
3582 if (not_subset)
3583 {
3584 bitset_copy (dests_ch[ndests], remains);
3585 bitset_copy (dests_ch[j], intersec);
3586 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3587 if (BE (err != REG_NOERROR, 0))
3588 goto error_return;
3589 ++ndests;
3590 }
3591
3592 /* Put the position in the current group. */
3593 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3594 if (BE (err < 0, 0))
3595 goto error_return;
3596
3597 /* If all characters are consumed, go to next node. */
3598 if (!not_consumed)
3599 break;
3600 }
3b0bdc72
UD
3601 /* Some characters remain, create a new group. */
3602 if (j == ndests)
15a7d175
UD
3603 {
3604 bitset_copy (dests_ch[ndests], accepts);
3605 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3606 if (BE (err != REG_NOERROR, 0))
3607 goto error_return;
3608 ++ndests;
3609 bitset_empty (accepts);
3610 }
3b0bdc72
UD
3611 }
3612 return ndests;
1b2c2628
UD
3613 error_return:
3614 for (j = 0; j < ndests; ++j)
3615 re_node_set_free (dests_node + j);
3616 return -1;
3b0bdc72
UD
3617}
3618
434d3784
UD
3619#ifdef RE_ENABLE_I18N
3620/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3621 Return the number of the bytes the node accepts.
3622 STR_IDX is the current index of the input string.
3623
3624 This function handles the nodes which can accept one character, or
3625 one collating element like '.', '[a-z]', opposite to the other nodes
3626 can only accept one byte. */
3b0bdc72
UD
3627
3628static int
56b168be
UD
3629check_node_accept_bytes (dfa, node_idx, input, str_idx)
3630 re_dfa_t *dfa;
3b0bdc72
UD
3631 int node_idx, str_idx;
3632 const re_string_t *input;
3633{
3b0bdc72 3634 const re_token_t *node = dfa->nodes + node_idx;
ad7f28c2 3635 int char_len, elem_len;
434d3784 3636 int i;
ad7f28c2
UD
3637
3638 if (BE (node->type == OP_UTF8_PERIOD, 0))
3639 {
3640 unsigned char c = re_string_byte_at (input, str_idx), d;
3641 if (BE (c < 0xc2, 1))
3642 return 0;
3643
3644 if (str_idx + 2 > input->len)
3645 return 0;
3646
3647 d = re_string_byte_at (input, str_idx + 1);
3648 if (c < 0xe0)
3649 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3650 else if (c < 0xf0)
3651 {
3652 char_len = 3;
3653 if (c == 0xe0 && d < 0xa0)
3654 return 0;
3655 }
3656 else if (c < 0xf8)
3657 {
3658 char_len = 4;
3659 if (c == 0xf0 && d < 0x90)
3660 return 0;
3661 }
3662 else if (c < 0xfc)
3663 {
3664 char_len = 5;
3665 if (c == 0xf8 && d < 0x88)
3666 return 0;
3667 }
3668 else if (c < 0xfe)
3669 {
3670 char_len = 6;
3671 if (c == 0xfc && d < 0x84)
3672 return 0;
3673 }
3674 else
3675 return 0;
3676
3677 if (str_idx + char_len > input->len)
3678 return 0;
3679
3680 for (i = 1; i < char_len; ++i)
3681 {
3682 d = re_string_byte_at (input, str_idx + i);
3683 if (d < 0x80 || d > 0xbf)
3684 return 0;
3685 }
3686 return char_len;
3687 }
3688
3689 char_len = re_string_char_size_at (input, str_idx);
3b0bdc72
UD
3690 if (node->type == OP_PERIOD)
3691 {
ad7f28c2
UD
3692 if (char_len <= 1)
3693 return 0;
3694 /* FIXME: I don't think this if is needed, as both '\n'
3695 and '\0' are char_len == 1. */
434d3784 3696 /* '.' accepts any one character except the following two cases. */
56b168be 3697 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
15a7d175 3698 re_string_byte_at (input, str_idx) == '\n') ||
56b168be 3699 ((dfa->syntax & RE_DOT_NOT_NULL) &&
15a7d175
UD
3700 re_string_byte_at (input, str_idx) == '\0'))
3701 return 0;
3b0bdc72
UD
3702 return char_len;
3703 }
ad7f28c2
UD
3704
3705 elem_len = re_string_elem_size_at (input, str_idx);
6c2a04a7 3706 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
ad7f28c2
UD
3707 return 0;
3708
3709 if (node->type == COMPLEX_BRACKET)
3b0bdc72
UD
3710 {
3711 const re_charset_t *cset = node->opr.mbcset;
434d3784 3712# ifdef _LIBC
85c54a32
UD
3713 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3714 + str_idx);
5f93cd52
UD
3715 int j;
3716 uint32_t nrules;
434d3784
UD
3717# endif /* _LIBC */
3718 int match_len = 0;
3719 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
15a7d175 3720 ? re_string_wchar_at (input, str_idx) : 0);
434d3784
UD
3721
3722 /* match with multibyte character? */
3723 for (i = 0; i < cset->nmbchars; ++i)
15a7d175
UD
3724 if (wc == cset->mbchars[i])
3725 {
3726 match_len = char_len;
3727 goto check_node_accept_bytes_match;
3728 }
434d3784
UD
3729 /* match with character_class? */
3730 for (i = 0; i < cset->nchar_classes; ++i)
15a7d175
UD
3731 {
3732 wctype_t wt = cset->char_classes[i];
3733 if (__iswctype (wc, wt))
3734 {
3735 match_len = char_len;
3736 goto check_node_accept_bytes_match;
3737 }
3738 }
434d3784
UD
3739
3740# ifdef _LIBC
5f93cd52 3741 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3b0bdc72 3742 if (nrules != 0)
15a7d175
UD
3743 {
3744 unsigned int in_collseq = 0;
3745 const int32_t *table, *indirect;
3746 const unsigned char *weights, *extra;
3747 const char *collseqwc;
3748 int32_t idx;
3749 /* This #include defines a local function! */
434d3784 3750# include <locale/weight.h>
3b0bdc72 3751
15a7d175
UD
3752 /* match with collating_symbol? */
3753 if (cset->ncoll_syms)
3754 extra = (const unsigned char *)
3755 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3756 for (i = 0; i < cset->ncoll_syms; ++i)
3757 {
3758 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3759 /* Compare the length of input collating element and
3760 the length of current collating element. */
3761 if (*coll_sym != elem_len)
3762 continue;
3763 /* Compare each bytes. */
3764 for (j = 0; j < *coll_sym; j++)
3765 if (pin[j] != coll_sym[1 + j])
3766 break;
3767 if (j == *coll_sym)
3768 {
3769 /* Match if every bytes is equal. */
3770 match_len = j;
3771 goto check_node_accept_bytes_match;
3772 }
3773 }
3774
3775 if (cset->nranges)
3776 {
3777 if (elem_len <= char_len)
3778 {
3779 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
25337753 3780 in_collseq = __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
3781 }
3782 else
3783 in_collseq = find_collation_sequence_value (pin, elem_len);
3784 }
3785 /* match with range expression? */
3786 for (i = 0; i < cset->nranges; ++i)
3787 if (cset->range_starts[i] <= in_collseq
3788 && in_collseq <= cset->range_ends[i])
3789 {
3790 match_len = elem_len;
3791 goto check_node_accept_bytes_match;
3792 }
3793
3794 /* match with equivalence_class? */
3795 if (cset->nequiv_classes)
3796 {
3797 const unsigned char *cp = pin;
3798 table = (const int32_t *)
3799 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3800 weights = (const unsigned char *)
3801 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3802 extra = (const unsigned char *)
3803 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3804 indirect = (const int32_t *)
3805 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3806 idx = findidx (&cp);
3807 if (idx > 0)
3808 for (i = 0; i < cset->nequiv_classes; ++i)
3809 {
3810 int32_t equiv_class_idx = cset->equiv_classes[i];
3811 size_t weight_len = weights[idx];
3812 if (weight_len == weights[equiv_class_idx])
3813 {
3814 int cnt = 0;
3815 while (cnt <= weight_len
3816 && (weights[equiv_class_idx + 1 + cnt]
3817 == weights[idx + 1 + cnt]))
3818 ++cnt;
3819 if (cnt > weight_len)
3820 {
3821 match_len = elem_len;
3822 goto check_node_accept_bytes_match;
3823 }
3824 }
3825 }
3826 }
3827 }
434d3784
UD
3828 else
3829# endif /* _LIBC */
15a7d175
UD
3830 {
3831 /* match with range expression? */
92b27c74 3832#if __GNUC__ >= 2
15a7d175 3833 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
92b27c74 3834#else
15a7d175
UD
3835 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3836 cmp_buf[2] = wc;
92b27c74 3837#endif
15a7d175
UD
3838 for (i = 0; i < cset->nranges; ++i)
3839 {
3840 cmp_buf[0] = cset->range_starts[i];
3841 cmp_buf[4] = cset->range_ends[i];
3842 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3843 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3844 {
3845 match_len = char_len;
3846 goto check_node_accept_bytes_match;
3847 }
3848 }
3849 }
434d3784
UD
3850 check_node_accept_bytes_match:
3851 if (!cset->non_match)
15a7d175 3852 return match_len;
434d3784 3853 else
15a7d175
UD
3854 {
3855 if (match_len > 0)
3856 return 0;
3857 else
3858 return (elem_len > char_len) ? elem_len : char_len;
3859 }
3b0bdc72
UD
3860 }
3861 return 0;
3862}
3863
434d3784 3864# ifdef _LIBC
3b0bdc72
UD
3865static unsigned int
3866find_collation_sequence_value (mbs, mbs_len)
c202c2c5 3867 const unsigned char *mbs;
3b0bdc72
UD
3868 size_t mbs_len;
3869{
3870 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3871 if (nrules == 0)
3872 {
3873 if (mbs_len == 1)
15a7d175
UD
3874 {
3875 /* No valid character. Match it as a single byte character. */
3876 const unsigned char *collseq = (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3878 return collseq[mbs[0]];
3879 }
3b0bdc72
UD
3880 return UINT_MAX;
3881 }
3882 else
3883 {
3884 int32_t idx;
3885 const unsigned char *extra = (const unsigned char *)
15a7d175 3886 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6c2a04a7
UD
3887 int32_t extrasize = (const unsigned char *)
3888 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3b0bdc72 3889
6c2a04a7 3890 for (idx = 0; idx < extrasize;)
15a7d175
UD
3891 {
3892 int mbs_cnt, found = 0;
3893 int32_t elem_mbs_len;
3894 /* Skip the name of collating element name. */
3895 idx = idx + extra[idx] + 1;
3896 elem_mbs_len = extra[idx++];
3897 if (mbs_len == elem_mbs_len)
3898 {
3899 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3900 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3901 break;
3902 if (mbs_cnt == elem_mbs_len)
3903 /* Found the entry. */
3904 found = 1;
3905 }
3906 /* Skip the byte sequence of the collating element. */
3907 idx += elem_mbs_len;
3908 /* Adjust for the alignment. */
3909 idx = (idx + 3) & ~3;
3910 /* Skip the collation sequence value. */
3911 idx += sizeof (uint32_t);
3912 /* Skip the wide char sequence of the collating element. */
3913 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3914 /* If we found the entry, return the sequence value. */
3915 if (found)
3916 return *(uint32_t *) (extra + idx);
3917 /* Skip the collation sequence value. */
3918 idx += sizeof (uint32_t);
3919 }
6c2a04a7 3920 return UINT_MAX;
3b0bdc72
UD
3921 }
3922}
434d3784
UD
3923# endif /* _LIBC */
3924#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3925
3926/* Check whether the node accepts the byte which is IDX-th
3927 byte of the INPUT. */
3928
3929static int
e3a87852 3930check_node_accept (mctx, node, idx)
612546c6 3931 const re_match_context_t *mctx;
e3a87852 3932 const re_token_t *node;
612546c6 3933 int idx;
3b0bdc72 3934{
e3a87852 3935 re_dfa_t *const dfa = mctx->dfa;
3b0bdc72 3936 unsigned char ch;
485d775d 3937 if (node->constraint)
3b0bdc72
UD
3938 {
3939 /* The node has constraints. Check whether the current context
15a7d175 3940 satisfies the constraints. */
56b168be
UD
3941 unsigned int context = re_string_context_at (&mctx->input, idx,
3942 mctx->eflags);
3b0bdc72 3943 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
15a7d175 3944 return 0;
3b0bdc72 3945 }
56b168be 3946 ch = re_string_byte_at (&mctx->input, idx);
ad7f28c2
UD
3947 switch (node->type)
3948 {
3949 case CHARACTER:
3950 return node->opr.c == ch;
3951 case SIMPLE_BRACKET:
3952 return bitset_contain (node->opr.sbcset, ch);
c0d5034e 3953#ifdef RE_ENABLE_I18N
ad7f28c2
UD
3954 case OP_UTF8_PERIOD:
3955 if (ch >= 0x80)
3956 return 0;
3957 /* FALLTHROUGH */
c0d5034e 3958#endif
ad7f28c2 3959 case OP_PERIOD:
56b168be
UD
3960 return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
3961 || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
ad7f28c2
UD
3962 default:
3963 return 0;
3964 }
3b0bdc72 3965}
612546c6
UD
3966
3967/* Extend the buffers, if the buffers have run out. */
3968
3969static reg_errcode_t
3970extend_buffers (mctx)
3971 re_match_context_t *mctx;
3972{
3973 reg_errcode_t ret;
56b168be 3974 re_string_t *pstr = &mctx->input;
612546c6
UD
3975
3976 /* Double the lengthes of the buffers. */
3977 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3978 if (BE (ret != REG_NOERROR, 0))
3979 return ret;
3980
3981 if (mctx->state_log != NULL)
3982 {
3983 /* And double the length of state_log. */
951d6408
UD
3984 /* XXX We have no indication of the size of this buffer. If this
3985 allocation fail we have no indication that the state_log array
3986 does not have the right size. */
3987 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3988 pstr->bufs_len + 1);
1b2c2628 3989 if (BE (new_array == NULL, 0))
15a7d175 3990 return REG_ESPACE;
1b2c2628 3991 mctx->state_log = new_array;
612546c6
UD
3992 }
3993
3994 /* Then reconstruct the buffers. */
3995 if (pstr->icase)
3996 {
3997#ifdef RE_ENABLE_I18N
3c0fb574 3998 if (pstr->mb_cur_max > 1)
bb3f4825
UD
3999 {
4000 ret = build_wcs_upper_buffer (pstr);
4001 if (BE (ret != REG_NOERROR, 0))
4002 return ret;
4003 }
612546c6
UD
4004 else
4005#endif /* RE_ENABLE_I18N */
15a7d175 4006 build_upper_buffer (pstr);
612546c6
UD
4007 }
4008 else
4009 {
4010#ifdef RE_ENABLE_I18N
3c0fb574 4011 if (pstr->mb_cur_max > 1)
15a7d175 4012 build_wcs_buffer (pstr);
612546c6
UD
4013 else
4014#endif /* RE_ENABLE_I18N */
15a7d175
UD
4015 {
4016 if (pstr->trans != NULL)
4017 re_string_translate_buffer (pstr);
15a7d175 4018 }
612546c6
UD
4019 }
4020 return REG_NOERROR;
4021}
4022
3b0bdc72
UD
4023\f
4024/* Functions for matching context. */
4025
6291ee3c
UD
4026/* Initialize MCTX. */
4027
a9388965 4028static reg_errcode_t
56b168be 4029match_ctx_init (mctx, eflags, n)
3b0bdc72 4030 re_match_context_t *mctx;
612546c6 4031 int eflags, n;
3b0bdc72
UD
4032{
4033 mctx->eflags = eflags;
612546c6 4034 mctx->match_last = -1;
3b0bdc72 4035 if (n > 0)
a9388965
UD
4036 {
4037 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
6291ee3c
UD
4038 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4039 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
15a7d175 4040 return REG_ESPACE;
a9388965 4041 }
56b168be
UD
4042 /* Already zero-ed by the caller.
4043 else
4044 mctx->bkref_ents = NULL;
4045 mctx->nbkref_ents = 0;
4046 mctx->nsub_tops = 0; */
3b0bdc72 4047 mctx->abkref_ents = n;
6291ee3c 4048 mctx->max_mb_elem_len = 1;
6291ee3c 4049 mctx->asub_tops = n;
a9388965 4050 return REG_NOERROR;
3b0bdc72
UD
4051}
4052
6291ee3c
UD
4053/* Clean the entries which depend on the current input in MCTX.
4054 This function must be invoked when the matcher changes the start index
4055 of the input, or changes the input string. */
4056
4057static void
4058match_ctx_clean (mctx)
4059 re_match_context_t *mctx;
4060{
4061 match_ctx_free_subtops (mctx);
4062 mctx->nsub_tops = 0;
4063 mctx->nbkref_ents = 0;
4064}
4065
4066/* Free all the memory associated with MCTX. */
4067
3b0bdc72
UD
4068static void
4069match_ctx_free (mctx)
4070 re_match_context_t *mctx;
4071{
6291ee3c
UD
4072 match_ctx_free_subtops (mctx);
4073 re_free (mctx->sub_tops);
3b0bdc72
UD
4074 re_free (mctx->bkref_ents);
4075}
4076
6291ee3c
UD
4077/* Free all the memory associated with MCTX->SUB_TOPS. */
4078
4079static void
4080match_ctx_free_subtops (mctx)
4081 re_match_context_t *mctx;
4082{
4083 int st_idx;
4084 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4085 {
4086 int sl_idx;
4087 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4088 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4089 {
4090 re_sub_match_last_t *last = top->lasts[sl_idx];
4091 re_free (last->path.array);
4092 re_free (last);
4093 }
4094 re_free (top->lasts);
4095 if (top->path)
4096 {
4097 re_free (top->path->array);
4098 re_free (top->path);
4099 }
4100 free (top);
4101 }
4102}
4103
4104/* Add a new backreference entry to MCTX.
4105 Note that we assume that caller never call this function with duplicate
4106 entry, and call with STR_IDX which isn't smaller than any existing entry.
4107*/
3b0bdc72 4108
a9388965 4109static reg_errcode_t
0742e48e 4110match_ctx_add_entry (mctx, node, str_idx, from, to)
6291ee3c
UD
4111 re_match_context_t *mctx;
4112 int node, str_idx, from, to;
3b0bdc72
UD
4113{
4114 if (mctx->nbkref_ents >= mctx->abkref_ents)
4115 {
1b2c2628
UD
4116 struct re_backref_cache_entry* new_entry;
4117 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
15a7d175 4118 mctx->abkref_ents * 2);
1b2c2628 4119 if (BE (new_entry == NULL, 0))
15a7d175
UD
4120 {
4121 re_free (mctx->bkref_ents);
4122 return REG_ESPACE;
4123 }
1b2c2628 4124 mctx->bkref_ents = new_entry;
3b0bdc72 4125 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
15a7d175 4126 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3b0bdc72
UD
4127 mctx->abkref_ents *= 2;
4128 }
4129 mctx->bkref_ents[mctx->nbkref_ents].node = node;
0742e48e
UD
4130 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4131 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4132 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4133 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
4134 if (mctx->max_mb_elem_len < to - from)
4135 mctx->max_mb_elem_len = to - from;
a9388965 4136 return REG_NOERROR;
3b0bdc72 4137}
0742e48e 4138
6291ee3c
UD
4139/* Search for the first entry which has the same str_idx.
4140 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4141
4142static int
4143search_cur_bkref_entry (mctx, str_idx)
4144 re_match_context_t *mctx;
4145 int str_idx;
4146{
4147 int left, right, mid;
4148 right = mctx->nbkref_ents;
4149 for (left = 0; left < right;)
4150 {
4151 mid = (left + right) / 2;
4152 if (mctx->bkref_ents[mid].str_idx < str_idx)
4153 left = mid + 1;
4154 else
4155 right = mid;
4156 }
4157 return left;
4158}
4159
0742e48e
UD
4160static void
4161match_ctx_clear_flag (mctx)
4162 re_match_context_t *mctx;
4163{
4164 int i;
4165 for (i = 0; i < mctx->nbkref_ents; ++i)
951d6408 4166 mctx->bkref_ents[i].flag = 0;
0742e48e
UD
4167}
4168
6291ee3c
UD
4169/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4170 at STR_IDX. */
4171
4172static reg_errcode_t
4173match_ctx_add_subtop (mctx, node, str_idx)
4174 re_match_context_t *mctx;
4175 int node, str_idx;
4176{
4177#ifdef DEBUG
4178 assert (mctx->sub_tops != NULL);
4179 assert (mctx->asub_tops > 0);
4180#endif
951d6408 4181 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
6291ee3c 4182 {
951d6408
UD
4183 int new_asub_tops = mctx->asub_tops * 2;
4184 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4185 re_sub_match_top_t *,
4186 new_asub_tops);
6291ee3c
UD
4187 if (BE (new_array == NULL, 0))
4188 return REG_ESPACE;
4189 mctx->sub_tops = new_array;
951d6408 4190 mctx->asub_tops = new_asub_tops;
6291ee3c
UD
4191 }
4192 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
951d6408 4193 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
6291ee3c
UD
4194 return REG_ESPACE;
4195 mctx->sub_tops[mctx->nsub_tops]->node = node;
4196 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4197 return REG_NOERROR;
4198}
4199
4200/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4201 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4202
4203static re_sub_match_last_t *
4204match_ctx_add_sublast (subtop, node, str_idx)
4205 re_sub_match_top_t *subtop;
4206 int node, str_idx;
4207{
4208 re_sub_match_last_t *new_entry;
951d6408 4209 if (BE (subtop->nlasts == subtop->alasts, 0))
6291ee3c 4210 {
951d6408
UD
4211 int new_alasts = 2 * subtop->alasts + 1;
4212 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4213 re_sub_match_last_t *,
4214 new_alasts);
6291ee3c
UD
4215 if (BE (new_array == NULL, 0))
4216 return NULL;
4217 subtop->lasts = new_array;
951d6408 4218 subtop->alasts = new_alasts;
6291ee3c
UD
4219 }
4220 new_entry = calloc (1, sizeof (re_sub_match_last_t));
951d6408
UD
4221 if (BE (new_entry != NULL, 1))
4222 {
4223 subtop->lasts[subtop->nlasts] = new_entry;
4224 new_entry->node = node;
4225 new_entry->str_idx = str_idx;
4226 ++subtop->nlasts;
4227 }
6291ee3c
UD
4228 return new_entry;
4229}
4230
0742e48e
UD
4231static void
4232sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
15a7d175 4233 check_subexp)
0742e48e
UD
4234 re_sift_context_t *sctx;
4235 re_dfastate_t **sifted_sts, **limited_sts;
4236 int last_node, last_str_idx, check_subexp;
4237{
4238 sctx->sifted_states = sifted_sts;
4239 sctx->limited_states = limited_sts;
4240 sctx->last_node = last_node;
4241 sctx->last_str_idx = last_str_idx;
4242 sctx->check_subexp = check_subexp;
485d775d
UD
4243 sctx->cur_bkref = -1;
4244 sctx->cls_subexp_idx = -1;
0742e48e
UD
4245 re_node_set_init_empty (&sctx->limits);
4246}