]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regexec.c
Update.
[thirdparty/glibc.git] / posix / regexec.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
56b168be 2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
a9388965 21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
56b168be 22 int n) internal_function;
3ce12656
UD
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
e3a87852
UD
25static void match_ctx_free_subtops (re_match_context_t *mctx)
26 internal_function;
a9388965 27static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
e3a87852
UD
28 int str_idx, int from, int to)
29 internal_function;
30static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
31 internal_function;
3ce12656 32static void match_ctx_clear_flag (re_match_context_t *mctx) internal_function;
6291ee3c 33static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
3ce12656 34 int str_idx) internal_function;
6291ee3c 35static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
e3a87852
UD
36 int node, int str_idx)
37 internal_function;
0742e48e 38static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
15a7d175 39 re_dfastate_t **limited_sts, int last_node,
e3a87852
UD
40 int last_str_idx, int check_subexp)
41 internal_function;
a9388965 42static reg_errcode_t re_search_internal (const regex_t *preg,
15a7d175
UD
43 const char *string, int length,
44 int start, int range, int stop,
45 size_t nmatch, regmatch_t pmatch[],
3ce12656 46 int eflags) internal_function;
ac3d553b 47static int re_search_2_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
48 const char *string1, int length1,
49 const char *string2, int length2,
50 int start, int range, struct re_registers *regs,
3ce12656 51 int stop, int ret_len) internal_function;
ac3d553b 52static int re_search_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
53 const char *string, int length, int start,
54 int range, int stop, struct re_registers *regs,
3ce12656 55 int ret_len) internal_function;
ac3d553b 56static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
3ce12656 57 int nregs, int regs_allocated) internal_function;
bb3f4825 58static inline re_dfastate_t *acquire_init_state_context
e3a87852
UD
59 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
60 __attribute ((always_inline)) internal_function;
61static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
62 internal_function;
15bad1a5
UD
63static int check_matching (re_match_context_t *mctx, int fl_longest_match,
64 int *p_match_first)
e3a87852 65 internal_function;
3b0bdc72 66static int check_halt_node_context (const re_dfa_t *dfa, int node,
3ce12656 67 unsigned int context) internal_function;
e3a87852
UD
68static int check_halt_state_context (const re_match_context_t *mctx,
69 const re_dfastate_t *state, int idx)
70 internal_function;
6b6557e8
UD
71static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
72 regmatch_t *prev_idx_match, int cur_node,
3ce12656 73 int cur_idx, int nmatch) internal_function;
e3a87852
UD
74static int proceed_next_node (const re_match_context_t *mctx,
75 int nregs, regmatch_t *regs,
15a7d175 76 int *pidx, int node, re_node_set *eps_via_nodes,
3ce12656 77 struct re_fail_stack_t *fs) internal_function;
1b2c2628 78static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
15a7d175
UD
79 int str_idx, int *dests, int nregs,
80 regmatch_t *regs,
3ce12656 81 re_node_set *eps_via_nodes) internal_function;
a3022b82 82static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
3ce12656 83 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
612546c6 84static reg_errcode_t set_regs (const regex_t *preg,
15a7d175
UD
85 const re_match_context_t *mctx,
86 size_t nmatch, regmatch_t *pmatch,
3ce12656
UD
87 int fl_backtrack) internal_function;
88static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
485d775d 89
434d3784 90#ifdef RE_ENABLE_I18N
e3a87852 91static int sift_states_iter_mb (const re_match_context_t *mctx,
15a7d175 92 re_sift_context_t *sctx,
3ce12656 93 int node_idx, int str_idx, int max_str_idx) internal_function;
434d3784 94#endif /* RE_ENABLE_I18N */
e3a87852 95static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
3ce12656 96 re_sift_context_t *sctx) internal_function;
e3a87852 97static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
15a7d175
UD
98 re_sift_context_t *sctx,
99 int str_idx,
3ce12656 100 re_node_set *dest_nodes) internal_function;
0742e48e 101static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
15a7d175 102 re_node_set *dest_nodes,
3ce12656 103 const re_node_set *candidates) internal_function;
0742e48e 104static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
15a7d175 105 re_node_set *dest_nodes,
3ce12656 106 const re_node_set *and_nodes) internal_function;
e3a87852
UD
107static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110static int check_dst_limits_calc_pos (re_match_context_t *mctx,
15a7d175 111 int limit, re_node_set *eclosures,
3ce12656 112 int subexp_idx, int node, int str_idx) internal_function;
0742e48e 113static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
15a7d175
UD
114 re_node_set *dest_nodes,
115 const re_node_set *candidates,
116 re_node_set *limits,
117 struct re_backref_cache_entry *bkref_ents,
3ce12656 118 int str_idx) internal_function;
e3a87852 119static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
15a7d175 120 re_sift_context_t *sctx,
3ce12656 121 int str_idx, re_node_set *dest_nodes) internal_function;
7c1be3ec
UD
122static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
123 int next_state_log_idx) internal_function;
0742e48e 124static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
3ce12656 125 re_dfastate_t **src, int num) internal_function;
4c595adb
UD
126static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
e3a87852 128static re_dfastate_t *transit_state (reg_errcode_t *err,
15a7d175 129 re_match_context_t *mctx,
3ce12656 130 re_dfastate_t *state) internal_function;
4c595adb
UD
131static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state) internal_function;
e3a87852 134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
6291ee3c 135 re_node_set *cur_nodes,
3ce12656 136 int str_idx) internal_function;
c13c99fa 137#if 0
e3a87852
UD
138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate) internal_function;
c13c99fa 141#endif
434d3784 142#ifdef RE_ENABLE_I18N
e3a87852
UD
143static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
434d3784 145#endif /* RE_ENABLE_I18N */
e3a87852
UD
146static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
147 const re_node_set *nodes) internal_function;
148static reg_errcode_t get_subexp (re_match_context_t *mctx,
3ce12656 149 int bkref_node, int bkref_str_idx) internal_function;
e3a87852 150static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
fe9434bb 151 const re_sub_match_top_t *sub_top,
6291ee3c 152 re_sub_match_last_t *sub_last,
3ce12656 153 int bkref_node, int bkref_str) internal_function;
fe9434bb 154static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
3ce12656 155 int subexp_idx, int type) internal_function;
e3a87852 156static reg_errcode_t check_arrival (re_match_context_t *mctx,
6291ee3c
UD
157 state_array_t *path, int top_node,
158 int top_str, int last_node, int last_str,
3ce12656 159 int type) internal_function;
e3a87852 160static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
6291ee3c
UD
161 int str_idx,
162 re_node_set *cur_nodes,
3ce12656 163 re_node_set *next_nodes) internal_function;
6291ee3c
UD
164static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
165 re_node_set *cur_nodes,
3ce12656 166 int ex_subexp, int type) internal_function;
6291ee3c
UD
167static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
168 re_node_set *dst_nodes,
169 int target, int ex_subexp,
3ce12656 170 int type) internal_function;
e3a87852 171static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
6291ee3c
UD
172 re_node_set *cur_nodes, int cur_str,
173 int last_str, int subexp_num,
3ce12656 174 int type) internal_function;
56b168be 175static re_dfastate_t **build_trtable (re_dfa_t *dfa,
3ce12656 176 re_dfastate_t *state) internal_function;
434d3784 177#ifdef RE_ENABLE_I18N
56b168be 178static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3ce12656 179 const re_string_t *input, int idx) internal_function;
434d3784 180# ifdef _LIBC
c202c2c5 181static unsigned int find_collation_sequence_value (const unsigned char *mbs,
3ce12656 182 size_t name_len) internal_function;
434d3784
UD
183# endif /* _LIBC */
184#endif /* RE_ENABLE_I18N */
56b168be 185static int group_nodes_into_DFAstates (re_dfa_t *dfa,
15a7d175
UD
186 const re_dfastate_t *state,
187 re_node_set *states_node,
3ce12656 188 bitset *states_ch) internal_function;
e3a87852
UD
189static int check_node_accept (const re_match_context_t *mctx,
190 const re_token_t *node, int idx) internal_function;
3ce12656 191static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
3b0bdc72
UD
192\f
193/* Entry point for POSIX code. */
194
195/* regexec searches for a given pattern, specified by PREG, in the
196 string STRING.
197
198 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
199 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
200 least NMATCH elements, and we set them to the offsets of the
201 corresponding matched substrings.
202
203 EFLAGS specifies `execution flags' which affect matching: if
204 REG_NOTBOL is set, then ^ does not match at the beginning of the
205 string; if REG_NOTEOL is set, then $ does not match at the end.
206
207 We return 0 if we find a match and REG_NOMATCH if not. */
208
209int
210regexec (preg, string, nmatch, pmatch, eflags)
92b27c74
UD
211 const regex_t *__restrict preg;
212 const char *__restrict string;
3b0bdc72
UD
213 size_t nmatch;
214 regmatch_t pmatch[];
215 int eflags;
216{
a9388965 217 reg_errcode_t err;
3b0bdc72
UD
218 int length = strlen (string);
219 if (preg->no_sub)
ac3d553b 220 err = re_search_internal (preg, string, length, 0, length, length, 0,
15a7d175 221 NULL, eflags);
3b0bdc72 222 else
ac3d553b 223 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
15a7d175 224 pmatch, eflags);
a9388965 225 return err != REG_NOERROR;
3b0bdc72
UD
226}
227#ifdef _LIBC
228weak_alias (__regexec, regexec)
229#endif
230
231/* Entry points for GNU code. */
232
ac3d553b
UD
233/* re_match, re_search, re_match_2, re_search_2
234
235 The former two functions operate on STRING with length LENGTH,
236 while the later two operate on concatenation of STRING1 and STRING2
237 with lengths LENGTH1 and LENGTH2, respectively.
238
239 re_match() matches the compiled pattern in BUFP against the string,
240 starting at index START.
241
242 re_search() first tries matching at index START, then it tries to match
243 starting from index START + 1, and so on. The last start position tried
244 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
245 way as re_match().)
246
247 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
248 the first STOP characters of the concatenation of the strings should be
249 concerned.
250
251 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
252 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
253 computed relative to the concatenation, not relative to the individual
254 strings.)
255
256 On success, re_match* functions return the length of the match, re_search*
257 return the position of the start of the match. Return value -1 means no
258 match was found and -2 indicates an internal error. */
3b0bdc72
UD
259
260int
ac3d553b
UD
261re_match (bufp, string, length, start, regs)
262 struct re_pattern_buffer *bufp;
3b0bdc72
UD
263 const char *string;
264 int length, start;
265 struct re_registers *regs;
266{
ac3d553b 267 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
3b0bdc72
UD
268}
269#ifdef _LIBC
270weak_alias (__re_match, re_match)
271#endif
272
ac3d553b
UD
273int
274re_search (bufp, string, length, start, range, regs)
275 struct re_pattern_buffer *bufp;
276 const char *string;
277 int length, start, range;
278 struct re_registers *regs;
279{
280 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
281}
282#ifdef _LIBC
283weak_alias (__re_search, re_search)
284#endif
3b0bdc72
UD
285
286int
ac3d553b
UD
287re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
288 struct re_pattern_buffer *bufp;
289 const char *string1, *string2;
290 int length1, length2, start, stop;
291 struct re_registers *regs;
3b0bdc72 292{
ac3d553b 293 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 294 start, 0, regs, stop, 1);
3b0bdc72
UD
295}
296#ifdef _LIBC
297weak_alias (__re_match_2, re_match_2)
298#endif
299
3b0bdc72 300int
ac3d553b
UD
301re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
302 struct re_pattern_buffer *bufp;
303 const char *string1, *string2;
304 int length1, length2, start, range, stop;
305 struct re_registers *regs;
306{
307 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 308 start, range, regs, stop, 0);
ac3d553b
UD
309}
310#ifdef _LIBC
311weak_alias (__re_search_2, re_search_2)
312#endif
313
314static int
315re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
15a7d175 316 stop, ret_len)
ac3d553b
UD
317 struct re_pattern_buffer *bufp;
318 const char *string1, *string2;
319 int length1, length2, start, range, stop, ret_len;
320 struct re_registers *regs;
321{
322 const char *str;
323 int rval;
324 int len = length1 + length2;
325 int free_str = 0;
326
327 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
328 return -2;
329
330 /* Concatenate the strings. */
331 if (length2 > 0)
332 if (length1 > 0)
333 {
15a7d175
UD
334 char *s = re_malloc (char, len);
335
336 if (BE (s == NULL, 0))
337 return -2;
338 memcpy (s, string1, length1);
339 memcpy (s + length1, string2, length2);
340 str = s;
341 free_str = 1;
ac3d553b
UD
342 }
343 else
344 str = string2;
345 else
346 str = string1;
347
348 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
15a7d175 349 ret_len);
ac3d553b 350 if (free_str)
1b2c2628 351 re_free ((char *) str);
ac3d553b
UD
352 return rval;
353}
354
355/* The parameters have the same meaning as those of re_search.
356 Additional parameters:
357 If RET_LEN is nonzero the length of the match is returned (re_match style);
358 otherwise the position of the match is returned. */
359
360static int
361re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
362 struct re_pattern_buffer *bufp;
363 const char *string;
364 int length, start, range, stop, ret_len;
365 struct re_registers *regs;
3b0bdc72 366{
a9388965 367 reg_errcode_t result;
3b0bdc72 368 regmatch_t *pmatch;
ac3d553b
UD
369 int nregs, rval;
370 int eflags = 0;
371
372 /* Check for out-of-range. */
a877402c 373 if (BE (start < 0 || start > length, 0))
ac3d553b
UD
374 return -1;
375 if (BE (start + range > length, 0))
376 range = length - start;
a877402c
UD
377 else if (BE (start + range < 0, 0))
378 range = -start;
3b0bdc72
UD
379
380 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
381 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
382
ac3d553b
UD
383 /* Compile fastmap if we haven't yet. */
384 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
385 re_compile_fastmap (bufp);
386
387 if (BE (bufp->no_sub, 0))
388 regs = NULL;
3b0bdc72
UD
389
390 /* We need at least 1 register. */
ac3d553b
UD
391 if (regs == NULL)
392 nregs = 1;
393 else if (BE (bufp->regs_allocated == REGS_FIXED &&
15a7d175 394 regs->num_regs < bufp->re_nsub + 1, 0))
ac3d553b
UD
395 {
396 nregs = regs->num_regs;
397 if (BE (nregs < 1, 0))
15a7d175
UD
398 {
399 /* Nothing can be copied to regs. */
400 regs = NULL;
401 nregs = 1;
402 }
ac3d553b
UD
403 }
404 else
405 nregs = bufp->re_nsub + 1;
3b0bdc72 406 pmatch = re_malloc (regmatch_t, nregs);
bc15410e
UD
407 if (BE (pmatch == NULL, 0))
408 return -2;
3b0bdc72 409
ac3d553b 410 result = re_search_internal (bufp, string, length, start, range, stop,
15a7d175 411 nregs, pmatch, eflags);
3b0bdc72 412
ac3d553b
UD
413 rval = 0;
414
415 /* I hope we needn't fill ther regs with -1's when no match was found. */
416 if (result != REG_NOERROR)
417 rval = -1;
418 else if (regs != NULL)
3b0bdc72 419 {
ac3d553b
UD
420 /* If caller wants register contents data back, copy them. */
421 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
15a7d175 422 bufp->regs_allocated);
ac3d553b 423 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
15a7d175 424 rval = -2;
3b0bdc72
UD
425 }
426
ac3d553b 427 if (BE (rval == 0, 1))
3b0bdc72 428 {
ac3d553b 429 if (ret_len)
15a7d175
UD
430 {
431 assert (pmatch[0].rm_so == start);
432 rval = pmatch[0].rm_eo - start;
433 }
ac3d553b 434 else
15a7d175 435 rval = pmatch[0].rm_so;
3b0bdc72 436 }
3b0bdc72
UD
437 re_free (pmatch);
438 return rval;
439}
3b0bdc72 440
ac3d553b
UD
441static unsigned
442re_copy_regs (regs, pmatch, nregs, regs_allocated)
3b0bdc72 443 struct re_registers *regs;
ac3d553b
UD
444 regmatch_t *pmatch;
445 int nregs, regs_allocated;
3b0bdc72 446{
ac3d553b
UD
447 int rval = REGS_REALLOCATE;
448 int i;
449 int need_regs = nregs + 1;
450 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
451 uses. */
452
453 /* Have the register data arrays been allocated? */
454 if (regs_allocated == REGS_UNALLOCATED)
44777771 455 { /* No. So allocate them with malloc. */
a96c63ed
UD
456 regs->start = re_malloc (regoff_t, need_regs);
457 regs->end = re_malloc (regoff_t, need_regs);
458 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
15a7d175 459 return REGS_UNALLOCATED;
ac3d553b
UD
460 regs->num_regs = need_regs;
461 }
462 else if (regs_allocated == REGS_REALLOCATE)
463 { /* Yes. If we need more elements than were already
15a7d175
UD
464 allocated, reallocate them. If we need fewer, just
465 leave it alone. */
951d6408 466 if (BE (need_regs > regs->num_regs, 0))
15a7d175 467 {
44777771
UD
468 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
469 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
470 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
a96c63ed 471 return REGS_UNALLOCATED;
44777771
UD
472 regs->start = new_start;
473 regs->end = new_end;
15a7d175
UD
474 regs->num_regs = need_regs;
475 }
ac3d553b
UD
476 }
477 else
478 {
479 assert (regs_allocated == REGS_FIXED);
480 /* This function may not be called with REGS_FIXED and nregs too big. */
481 assert (regs->num_regs >= nregs);
482 rval = REGS_FIXED;
483 }
484
485 /* Copy the regs. */
486 for (i = 0; i < nregs; ++i)
487 {
488 regs->start[i] = pmatch[i].rm_so;
489 regs->end[i] = pmatch[i].rm_eo;
490 }
491 for ( ; i < regs->num_regs; ++i)
492 regs->start[i] = regs->end[i] = -1;
493
494 return rval;
3b0bdc72 495}
3b0bdc72
UD
496
497/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
498 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
499 this memory for recording register information. STARTS and ENDS
500 must be allocated using the malloc library routine, and must each
501 be at least NUM_REGS * sizeof (regoff_t) bytes long.
502
503 If NUM_REGS == 0, then subsequent matches should allocate their own
504 register data.
505
506 Unless this function is called, the first search or match using
507 PATTERN_BUFFER will allocate its own register data, without
508 freeing the old data. */
509
510void
511re_set_registers (bufp, regs, num_regs, starts, ends)
512 struct re_pattern_buffer *bufp;
513 struct re_registers *regs;
514 unsigned num_regs;
515 regoff_t *starts, *ends;
516{
517 if (num_regs)
518 {
519 bufp->regs_allocated = REGS_REALLOCATE;
520 regs->num_regs = num_regs;
521 regs->start = starts;
522 regs->end = ends;
523 }
524 else
525 {
526 bufp->regs_allocated = REGS_UNALLOCATED;
527 regs->num_regs = 0;
528 regs->start = regs->end = (regoff_t *) 0;
529 }
530}
531#ifdef _LIBC
532weak_alias (__re_set_registers, re_set_registers)
533#endif
534\f
535/* Entry points compatible with 4.2 BSD regex library. We don't define
536 them unless specifically requested. */
537
538#if defined _REGEX_RE_COMP || defined _LIBC
539int
540# ifdef _LIBC
541weak_function
542# endif
543re_exec (s)
544 const char *s;
545{
546 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
547}
548#endif /* _REGEX_RE_COMP */
549\f
550static re_node_set empty_set;
551
552/* Internal entry point. */
553
554/* Searches for a compiled pattern PREG in the string STRING, whose
555 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
556 mingings with regexec. START, and RANGE have the same meanings
557 with re_search.
a9388965
UD
558 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
559 otherwise return the error code.
3b0bdc72
UD
560 Note: We assume front end functions already check ranges.
561 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
562
a9388965 563static reg_errcode_t
ac3d553b 564re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
15a7d175 565 eflags)
3b0bdc72
UD
566 const regex_t *preg;
567 const char *string;
ac3d553b 568 int length, start, range, stop, eflags;
3b0bdc72
UD
569 size_t nmatch;
570 regmatch_t pmatch[];
571{
a9388965 572 reg_errcode_t err;
3b0bdc72 573 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
612546c6 574 int left_lim, right_lim, incr;
3b0bdc72 575 int fl_longest_match, match_first, match_last = -1;
1b2c2628 576 int fast_translate, sb;
e3a87852
UD
577#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
578 re_match_context_t mctx = { .dfa = dfa };
579#else
3b0bdc72 580 re_match_context_t mctx;
e3a87852 581#endif
1b2c2628
UD
582 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
583 && range && !preg->can_be_null) ? preg->fastmap : NULL);
3b0bdc72 584
e3a87852
UD
585#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
586 memset (&mctx, '\0', sizeof (re_match_context_t));
587 mctx.dfa = dfa;
588#endif
589
3b0bdc72 590 /* Check if the DFA haven't been compiled. */
bc15410e 591 if (BE (preg->used == 0 || dfa->init_state == NULL
15a7d175
UD
592 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
593 || dfa->init_state_begbuf == NULL, 0))
a9388965 594 return REG_NOMATCH;
3b0bdc72 595
c70f81dd
UD
596#ifdef DEBUG
597 /* We assume front-end functions already check them. */
598 assert (start + range >= 0 && start + range <= length);
599#endif
600
601 /* If initial states with non-begbuf contexts have no elements,
602 the regex must be anchored. If preg->newline_anchor is set,
603 we'll never use init_state_nl, so do not check it. */
604 if (dfa->init_state->nodes.nelem == 0
605 && dfa->init_state_word->nodes.nelem == 0
606 && (dfa->init_state_nl->nodes.nelem == 0
607 || !preg->newline_anchor))
608 {
609 if (start != 0 && start + range != 0)
610 return REG_NOMATCH;
611 start = range = 0;
612 }
613
3b0bdc72
UD
614 re_node_set_init_empty (&empty_set);
615
616 /* We must check the longest matching, if nmatch > 0. */
6291ee3c 617 fl_longest_match = (nmatch != 0 || dfa->nbackref);
3b0bdc72 618
56b168be 619 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
f0c7c524 620 preg->translate, preg->syntax & RE_ICASE, dfa);
612546c6 621 if (BE (err != REG_NOERROR, 0))
1b2c2628 622 goto free_return;
56b168be
UD
623 mctx.input.stop = stop;
624 mctx.input.raw_stop = stop;
625 mctx.input.newline_anchor = preg->newline_anchor;
612546c6 626
56b168be 627 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
612546c6 628 if (BE (err != REG_NOERROR, 0))
1b2c2628 629 goto free_return;
612546c6 630
3b0bdc72
UD
631 /* We will log all the DFA states through which the dfa pass,
632 if nmatch > 1, or this dfa has "multibyte node", which is a
633 back-reference or a node which can accept multibyte character or
634 multi character collating element. */
635 if (nmatch > 1 || dfa->has_mb_node)
a9388965 636 {
56b168be 637 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
612546c6 638 if (BE (mctx.state_log == NULL, 0))
15a7d175
UD
639 {
640 err = REG_ESPACE;
641 goto free_return;
642 }
a9388965 643 }
3b0bdc72 644 else
612546c6 645 mctx.state_log = NULL;
3b0bdc72 646
612546c6 647 match_first = start;
56b168be
UD
648 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
649 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
612546c6 650
3b0bdc72 651 /* Check incrementally whether of not the input string match. */
612546c6
UD
652 incr = (range < 0) ? -1 : 1;
653 left_lim = (range < 0) ? start + range : start;
654 right_lim = (range < 0) ? start : start + range;
3c0fb574 655 sb = dfa->mb_cur_max == 1;
1b2c2628 656 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
612546c6
UD
657
658 for (;;)
3b0bdc72 659 {
612546c6 660 /* At first get the current byte from input string. */
1b2c2628
UD
661 if (fastmap)
662 {
663 if (BE (fast_translate, 1))
664 {
665 unsigned RE_TRANSLATE_TYPE t
666 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
667 if (BE (range >= 0, 1))
668 {
669 if (BE (t != NULL, 0))
670 {
671 while (BE (match_first < right_lim, 1)
672 && !fastmap[t[(unsigned char) string[match_first]]])
673 ++match_first;
674 }
675 else
676 {
677 while (BE (match_first < right_lim, 1)
678 && !fastmap[(unsigned char) string[match_first]])
679 ++match_first;
680 }
681 if (BE (match_first == right_lim, 0))
682 {
683 int ch = match_first >= length
684 ? 0 : (unsigned char) string[match_first];
685 if (!fastmap[t ? t[ch] : ch])
686 break;
687 }
688 }
689 else
690 {
691 while (match_first >= left_lim)
692 {
693 int ch = match_first >= length
694 ? 0 : (unsigned char) string[match_first];
695 if (fastmap[t ? t[ch] : ch])
696 break;
697 --match_first;
698 }
699 if (match_first < left_lim)
700 break;
701 }
702 }
703 else
704 {
705 int ch;
706
707 do
708 {
709 /* In this case, we can't determine easily the current byte,
710 since it might be a component byte of a multibyte
711 character. Then we use the constructed buffer
712 instead. */
713 /* If MATCH_FIRST is out of the valid range, reconstruct the
714 buffers. */
56b168be
UD
715 if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
716 <= match_first
717 || match_first < mctx.input.raw_mbs_idx)
1b2c2628 718 {
56b168be
UD
719 err = re_string_reconstruct (&mctx.input, match_first,
720 eflags);
1b2c2628
UD
721 if (BE (err != REG_NOERROR, 0))
722 goto free_return;
723 }
724 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
725 Note that MATCH_FIRST must not be smaller than 0. */
726 ch = ((match_first >= length) ? 0
56b168be
UD
727 : re_string_byte_at (&mctx.input,
728 match_first
729 - mctx.input.raw_mbs_idx));
1b2c2628
UD
730 if (fastmap[ch])
731 break;
732 match_first += incr;
733 }
734 while (match_first >= left_lim && match_first <= right_lim);
735 if (! fastmap[ch])
736 break;
737 }
738 }
612546c6 739
1b2c2628 740 /* Reconstruct the buffers so that the matcher can assume that
fe9434bb 741 the matching starts from the beginning of the buffer. */
56b168be 742 err = re_string_reconstruct (&mctx.input, match_first, eflags);
1b2c2628
UD
743 if (BE (err != REG_NOERROR, 0))
744 goto free_return;
3b0bdc72 745#ifdef RE_ENABLE_I18N
1b2c2628 746 /* Eliminate it when it is a component of a multibyte character
15a7d175 747 and isn't the head of a multibyte character. */
56b168be 748 if (sb || re_string_first_byte (&mctx.input, 0))
3b0bdc72 749#endif
1b2c2628
UD
750 {
751 /* It seems to be appropriate one, then use the matcher. */
752 /* We assume that the matching starts from 0. */
753 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
15bad1a5
UD
754 match_last = check_matching (&mctx, fl_longest_match,
755 range >= 0 ? &match_first : NULL);
1b2c2628
UD
756 if (match_last != -1)
757 {
758 if (BE (match_last == -2, 0))
759 {
760 err = REG_ESPACE;
761 goto free_return;
762 }
763 else
6291ee3c
UD
764 {
765 mctx.match_last = match_last;
766 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
767 {
768 re_dfastate_t *pstate = mctx.state_log[match_last];
e3a87852
UD
769 mctx.last_node = check_halt_state_context (&mctx, pstate,
770 match_last);
6291ee3c
UD
771 }
772 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
773 || dfa->nbackref)
774 {
e3a87852 775 err = prune_impossible_nodes (&mctx);
6291ee3c
UD
776 if (err == REG_NOERROR)
777 break;
778 if (BE (err != REG_NOMATCH, 0))
779 goto free_return;
97fd3a30 780 match_last = -1;
6291ee3c
UD
781 }
782 else
fe9434bb 783 break; /* We found a match. */
6291ee3c 784 }
1b2c2628 785 }
6291ee3c 786 match_ctx_clean (&mctx);
1b2c2628 787 }
3b0bdc72 788 /* Update counter. */
612546c6
UD
789 match_first += incr;
790 if (match_first < left_lim || right_lim < match_first)
15a7d175 791 break;
3b0bdc72
UD
792 }
793
794 /* Set pmatch[] if we need. */
795 if (match_last != -1 && nmatch > 0)
796 {
797 int reg_idx;
798
799 /* Initialize registers. */
97fd3a30 800 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
15a7d175 801 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
3b0bdc72
UD
802
803 /* Set the points where matching start/end. */
612546c6 804 pmatch[0].rm_so = 0;
6291ee3c 805 pmatch[0].rm_eo = mctx.match_last;
3b0bdc72
UD
806
807 if (!preg->no_sub && nmatch > 1)
15a7d175 808 {
15a7d175
UD
809 err = set_regs (preg, &mctx, nmatch, pmatch,
810 dfa->has_plural_match && dfa->nbackref > 0);
811 if (BE (err != REG_NOERROR, 0))
812 goto free_return;
813 }
612546c6
UD
814
815 /* At last, add the offset to the each registers, since we slided
97fd3a30
UD
816 the buffers so that we could assume that the matching starts
817 from 0. */
612546c6 818 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
15a7d175
UD
819 if (pmatch[reg_idx].rm_so != -1)
820 {
c0d5034e 821#ifdef RE_ENABLE_I18N
56b168be 822 if (BE (mctx.input.offsets_needed != 0, 0))
bb3f4825 823 {
56b168be
UD
824 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
825 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
bb3f4825 826 else
56b168be
UD
827 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
828 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
829 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
bb3f4825 830 else
56b168be 831 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
bb3f4825 832 }
c0d5034e 833#else
56b168be 834 assert (mctx.input.offsets_needed == 0);
c0d5034e 835#endif
15a7d175
UD
836 pmatch[reg_idx].rm_so += match_first;
837 pmatch[reg_idx].rm_eo += match_first;
838 }
3b0bdc72 839 }
1b2c2628
UD
840 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
841 free_return:
612546c6 842 re_free (mctx.state_log);
3b0bdc72
UD
843 if (dfa->nbackref)
844 match_ctx_free (&mctx);
56b168be 845 re_string_destruct (&mctx.input);
1b2c2628 846 return err;
3b0bdc72
UD
847}
848
6291ee3c 849static reg_errcode_t
e3a87852 850prune_impossible_nodes (mctx)
6291ee3c
UD
851 re_match_context_t *mctx;
852{
e3a87852 853 re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
854 int halt_node, match_last;
855 reg_errcode_t ret;
6291ee3c
UD
856 re_dfastate_t **sifted_states;
857 re_dfastate_t **lim_states = NULL;
858 re_sift_context_t sctx;
859#ifdef DEBUG
860 assert (mctx->state_log != NULL);
861#endif
862 match_last = mctx->match_last;
863 halt_node = mctx->last_node;
864 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
865 if (BE (sifted_states == NULL, 0))
866 {
867 ret = REG_ESPACE;
868 goto free_return;
869 }
870 if (dfa->nbackref)
871 {
872 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
873 if (BE (lim_states == NULL, 0))
874 {
875 ret = REG_ESPACE;
876 goto free_return;
877 }
878 while (1)
879 {
880 memset (lim_states, '\0',
881 sizeof (re_dfastate_t *) * (match_last + 1));
882 match_ctx_clear_flag (mctx);
883 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
884 match_last, 0);
e3a87852 885 ret = sift_states_backward (mctx, &sctx);
6291ee3c
UD
886 re_node_set_free (&sctx.limits);
887 if (BE (ret != REG_NOERROR, 0))
888 goto free_return;
889 if (sifted_states[0] != NULL || lim_states[0] != NULL)
890 break;
891 do
892 {
893 --match_last;
894 if (match_last < 0)
895 {
896 ret = REG_NOMATCH;
897 goto free_return;
898 }
97fd3a30
UD
899 } while (mctx->state_log[match_last] == NULL
900 || !mctx->state_log[match_last]->halt);
e3a87852 901 halt_node = check_halt_state_context (mctx,
6291ee3c 902 mctx->state_log[match_last],
e3a87852 903 match_last);
6291ee3c
UD
904 }
905 ret = merge_state_array (dfa, sifted_states, lim_states,
906 match_last + 1);
907 re_free (lim_states);
908 lim_states = NULL;
909 if (BE (ret != REG_NOERROR, 0))
910 goto free_return;
911 }
912 else
913 {
914 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
915 match_last, 0);
e3a87852 916 ret = sift_states_backward (mctx, &sctx);
6291ee3c
UD
917 re_node_set_free (&sctx.limits);
918 if (BE (ret != REG_NOERROR, 0))
919 goto free_return;
920 }
921 re_free (mctx->state_log);
922 mctx->state_log = sifted_states;
923 sifted_states = NULL;
924 mctx->last_node = halt_node;
925 mctx->match_last = match_last;
926 ret = REG_NOERROR;
927 free_return:
928 re_free (sifted_states);
929 re_free (lim_states);
930 return ret;
931}
932
a9388965 933/* Acquire an initial state and return it.
3b0bdc72
UD
934 We must select appropriate initial state depending on the context,
935 since initial states may have constraints like "\<", "^", etc.. */
936
bb3f4825 937static inline re_dfastate_t *
e3a87852 938acquire_init_state_context (err, mctx, idx)
a9388965 939 reg_errcode_t *err;
612546c6
UD
940 const re_match_context_t *mctx;
941 int idx;
3b0bdc72 942{
e3a87852 943 re_dfa_t *const dfa = mctx->dfa;
3b0bdc72
UD
944 if (dfa->init_state->has_constraint)
945 {
946 unsigned int context;
56b168be 947 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
3b0bdc72 948 if (IS_WORD_CONTEXT (context))
15a7d175 949 return dfa->init_state_word;
3b0bdc72 950 else if (IS_ORDINARY_CONTEXT (context))
15a7d175 951 return dfa->init_state;
3b0bdc72 952 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
15a7d175 953 return dfa->init_state_begbuf;
3b0bdc72 954 else if (IS_NEWLINE_CONTEXT (context))
15a7d175 955 return dfa->init_state_nl;
3b0bdc72 956 else if (IS_BEGBUF_CONTEXT (context))
15a7d175
UD
957 {
958 /* It is relatively rare case, then calculate on demand. */
4c595adb
UD
959 return re_acquire_state_context (err, dfa,
960 dfa->init_state->entrance_nodes,
961 context);
15a7d175 962 }
3b0bdc72 963 else
15a7d175
UD
964 /* Must not happen? */
965 return dfa->init_state;
3b0bdc72
UD
966 }
967 else
968 return dfa->init_state;
969}
970
971/* Check whether the regular expression match input string INPUT or not,
a9388965
UD
972 and return the index where the matching end, return -1 if not match,
973 or return -2 in case of an error.
612546c6 974 FL_LONGEST_MATCH means we want the POSIX longest matching.
15bad1a5
UD
975 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
976 next place where we may want to try matching.
612546c6
UD
977 Note that the matcher assume that the maching starts from the current
978 index of the buffer. */
3b0bdc72
UD
979
980static int
15bad1a5 981check_matching (mctx, fl_longest_match, p_match_first)
3b0bdc72 982 re_match_context_t *mctx;
c13c99fa 983 int fl_longest_match;
15bad1a5 984 int *p_match_first;
3b0bdc72 985{
e3a87852 986 re_dfa_t *const dfa = mctx->dfa;
a9388965 987 reg_errcode_t err;
612546c6
UD
988 int match = 0;
989 int match_last = -1;
56b168be 990 int cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72 991 re_dfastate_t *cur_state;
4c595adb
UD
992 int at_init_state = p_match_first != NULL;
993 int next_start_idx = cur_str_idx;
3b0bdc72 994
4c595adb 995 err = REG_NOERROR;
e3a87852 996 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
4c595adb 997 /* An initial state must not be NULL (invalid). */
bc15410e 998 if (BE (cur_state == NULL, 0))
4c595adb
UD
999 {
1000 assert (err == REG_ESPACE);
1001 return -2;
1002 }
0742e48e 1003
4c595adb 1004 if (mctx->state_log != NULL)
6291ee3c 1005 {
4c595adb 1006 mctx->state_log[cur_str_idx] = cur_state;
6291ee3c 1007
4c595adb
UD
1008 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1009 later. E.g. Processing back references. */
1010 if (BE (dfa->nbackref, 0))
bb3f4825 1011 {
4c595adb
UD
1012 at_init_state = 0;
1013 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
bb3f4825
UD
1014 if (BE (err != REG_NOERROR, 0))
1015 return err;
4c595adb
UD
1016
1017 if (cur_state->has_backref)
1018 {
1019 err = transit_state_bkref (mctx, &cur_state->nodes);
1020 if (BE (err != REG_NOERROR, 0))
1021 return err;
1022 }
bb3f4825 1023 }
0742e48e
UD
1024 }
1025
3b0bdc72 1026 /* If the RE accepts NULL string. */
bb3f4825 1027 if (BE (cur_state->halt, 0))
3b0bdc72
UD
1028 {
1029 if (!cur_state->has_constraint
e3a87852 1030 || check_halt_state_context (mctx, cur_state, cur_str_idx))
15a7d175
UD
1031 {
1032 if (!fl_longest_match)
1033 return cur_str_idx;
1034 else
1035 {
1036 match_last = cur_str_idx;
1037 match = 1;
1038 }
1039 }
3b0bdc72
UD
1040 }
1041
56b168be 1042 while (!re_string_eoi (&mctx->input))
3b0bdc72 1043 {
15bad1a5 1044 re_dfastate_t *old_state = cur_state;
e3a87852 1045 cur_state = transit_state (&err, mctx, cur_state);
4c595adb
UD
1046 if (mctx->state_log != NULL)
1047 cur_state = merge_state_with_log (&err, mctx, cur_state);
15bad1a5 1048
4c595adb 1049 if (cur_state == NULL)
15a7d175 1050 {
4c595adb
UD
1051 /* Reached the invalid state or an error. Try to recover a valid
1052 state using the state log, if available and if we have not
1053 already found a valid (even if not the longest) match. */
15a7d175
UD
1054 if (BE (err != REG_NOERROR, 0))
1055 return -2;
4c595adb
UD
1056
1057 if (mctx->state_log == NULL
1058 || (match && !fl_longest_match)
1059 || (cur_state = find_recover_state (&err, mctx)) == NULL)
15a7d175 1060 break;
4c595adb
UD
1061 }
1062
1063 if (at_init_state)
1064 {
1065 if (old_state == cur_state)
1066 next_start_idx = re_string_cur_idx (&mctx->input);
c13c99fa 1067 else
4c595adb 1068 at_init_state = 0;
15a7d175 1069 }
3b0bdc72 1070
4c595adb 1071 if (cur_state->halt)
15a7d175 1072 {
4c595adb 1073 /* Reached a halt state.
15a7d175
UD
1074 Check the halt state can satisfy the current context. */
1075 if (!cur_state->has_constraint
e3a87852 1076 || check_halt_state_context (mctx, cur_state,
56b168be 1077 re_string_cur_idx (&mctx->input)))
15a7d175
UD
1078 {
1079 /* We found an appropriate halt state. */
56b168be 1080 match_last = re_string_cur_idx (&mctx->input);
15a7d175
UD
1081 match = 1;
1082 if (!fl_longest_match)
1083 break;
1084 }
1085 }
3b0bdc72 1086 }
15bad1a5 1087
4c595adb
UD
1088 if (match_last == -1 && p_match_first)
1089 *p_match_first += next_start_idx;
15bad1a5 1090
3b0bdc72
UD
1091 return match_last;
1092}
1093
1094/* Check NODE match the current context. */
1095
1096static int check_halt_node_context (dfa, node, context)
1097 const re_dfa_t *dfa;
1098 int node;
1099 unsigned int context;
1100{
3b0bdc72 1101 re_token_type_t type = dfa->nodes[node].type;
485d775d
UD
1102 unsigned int constraint = dfa->nodes[node].constraint;
1103 if (type != END_OF_RE)
3b0bdc72 1104 return 0;
485d775d
UD
1105 if (!constraint)
1106 return 1;
1107 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
3b0bdc72
UD
1108 return 0;
1109 return 1;
1110}
1111
1112/* Check the halt state STATE match the current context.
1113 Return 0 if not match, if the node, STATE has, is a halt node and
1114 match the context, return the node. */
1115
1116static int
e3a87852 1117check_halt_state_context (mctx, state, idx)
612546c6 1118 const re_match_context_t *mctx;
e3a87852 1119 const re_dfastate_t *state;
612546c6 1120 int idx;
3b0bdc72 1121{
3b0bdc72
UD
1122 int i;
1123 unsigned int context;
1124#ifdef DEBUG
1125 assert (state->halt);
1126#endif
56b168be 1127 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
3b0bdc72 1128 for (i = 0; i < state->nodes.nelem; ++i)
e3a87852 1129 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
3b0bdc72
UD
1130 return state->nodes.elems[i];
1131 return 0;
1132}
1133
a9388965
UD
1134/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1135 corresponding to the DFA).
1136 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1137 of errors. */
3b0bdc72
UD
1138
1139static int
e3a87852 1140proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
3b0bdc72 1141 const re_match_context_t *mctx;
e3a87852 1142 regmatch_t *regs;
a3022b82 1143 int nregs, *pidx, node;
3b0bdc72 1144 re_node_set *eps_via_nodes;
a3022b82 1145 struct re_fail_stack_t *fs;
3b0bdc72 1146{
e3a87852 1147 re_dfa_t *const dfa = mctx->dfa;
485d775d 1148 int i, err, dest_node;
81c64d40 1149 dest_node = -1;
3b0bdc72
UD
1150 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1151 {
485d775d
UD
1152 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1153 int ndest, dest_nodes[2];
a9388965 1154 err = re_node_set_insert (eps_via_nodes, node);
bc15410e 1155 if (BE (err < 0, 0))
44777771 1156 return -2;
a3022b82 1157 /* Pick up valid destinations. */
485d775d 1158 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
15a7d175
UD
1159 {
1160 int candidate = dfa->edests[node].elems[i];
1161 if (!re_node_set_contains (cur_nodes, candidate))
1162 continue;
1163 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1164 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1165 ++ndest;
1166 }
a3022b82 1167 if (ndest <= 1)
15a7d175 1168 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
a3022b82
UD
1169 /* In order to avoid infinite loop like "(a*)*". */
1170 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
15a7d175 1171 return dest_nodes[1];
44777771
UD
1172 if (fs != NULL
1173 && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1174 eps_via_nodes))
1175 return -2;
a3022b82 1176 return dest_nodes[0];
3b0bdc72
UD
1177 }
1178 else
1179 {
485d775d 1180 int naccepted = 0;
3b0bdc72 1181 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72 1182
434d3784 1183#ifdef RE_ENABLE_I18N
3b0bdc72 1184 if (ACCEPT_MB_NODE (type))
56b168be 1185 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
434d3784
UD
1186 else
1187#endif /* RE_ENABLE_I18N */
1188 if (type == OP_BACK_REF)
15a7d175
UD
1189 {
1190 int subexp_idx = dfa->nodes[node].opr.idx;
1191 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1192 if (fs != NULL)
1193 {
1194 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1195 return -1;
1196 else if (naccepted)
1197 {
56b168be 1198 char *buf = (char *) re_string_get_buffer (&mctx->input);
6291ee3c
UD
1199 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1200 naccepted) != 0)
15a7d175
UD
1201 return -1;
1202 }
1203 }
1204
1205 if (naccepted == 0)
1206 {
1207 err = re_node_set_insert (eps_via_nodes, node);
1208 if (BE (err < 0, 0))
1209 return -2;
1210 dest_node = dfa->edests[node].elems[0];
1211 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1212 dest_node))
1213 return dest_node;
1214 }
1215 }
3b0bdc72
UD
1216
1217 if (naccepted != 0
e3a87852 1218 || check_node_accept (mctx, dfa->nodes + node, *pidx))
15a7d175
UD
1219 {
1220 dest_node = dfa->nexts[node];
1221 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1222 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1223 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1224 dest_node)))
1225 return -1;
1226 re_node_set_empty (eps_via_nodes);
1227 return dest_node;
1228 }
3b0bdc72 1229 }
a3022b82
UD
1230 return -1;
1231}
1232
1233static reg_errcode_t
1234push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1235 struct re_fail_stack_t *fs;
1236 int str_idx, *dests, nregs;
1237 regmatch_t *regs;
1238 re_node_set *eps_via_nodes;
1239{
1240 reg_errcode_t err;
1241 int num = fs->num++;
1242 if (fs->num == fs->alloc)
1243 {
1b2c2628 1244 struct re_fail_stack_ent_t *new_array;
1b2c2628 1245 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
44777771 1246 * fs->alloc * 2));
1b2c2628 1247 if (new_array == NULL)
15a7d175 1248 return REG_ESPACE;
44777771 1249 fs->alloc *= 2;
1b2c2628 1250 fs->stack = new_array;
a3022b82
UD
1251 }
1252 fs->stack[num].idx = str_idx;
1253 fs->stack[num].node = dests[1];
1254 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
5a299c96
UD
1255 if (fs->stack[num].regs == NULL)
1256 return REG_ESPACE;
a3022b82
UD
1257 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1258 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1259 return err;
1260}
1b2c2628 1261
a3022b82
UD
1262static int
1263pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1264 struct re_fail_stack_t *fs;
1265 int *pidx, nregs;
1266 regmatch_t *regs;
1267 re_node_set *eps_via_nodes;
1268{
1269 int num = --fs->num;
1270 assert (num >= 0);
44777771 1271 *pidx = fs->stack[num].idx;
a3022b82
UD
1272 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1273 re_node_set_free (eps_via_nodes);
485d775d 1274 re_free (fs->stack[num].regs);
a3022b82
UD
1275 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1276 return fs->stack[num].node;
3b0bdc72
UD
1277}
1278
1279/* Set the positions where the subexpressions are starts/ends to registers
1280 PMATCH.
1281 Note: We assume that pmatch[0] is already set, and
97fd3a30 1282 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
3b0bdc72 1283
a9388965 1284static reg_errcode_t
a3022b82
UD
1285set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1286 const regex_t *preg;
1287 const re_match_context_t *mctx;
1288 size_t nmatch;
1289 regmatch_t *pmatch;
1290 int fl_backtrack;
3b0bdc72 1291{
6b6557e8 1292 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
81c64d40 1293 int idx, cur_node, real_nmatch;
3b0bdc72 1294 re_node_set eps_via_nodes;
a3022b82 1295 struct re_fail_stack_t *fs;
6b6557e8
UD
1296 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1297 regmatch_t *prev_idx_match;
1298
3b0bdc72
UD
1299#ifdef DEBUG
1300 assert (nmatch > 1);
612546c6 1301 assert (mctx->state_log != NULL);
3b0bdc72 1302#endif
a3022b82
UD
1303 if (fl_backtrack)
1304 {
1305 fs = &fs_body;
1306 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
6b6557e8
UD
1307 if (fs->stack == NULL)
1308 return REG_ESPACE;
a3022b82
UD
1309 }
1310 else
1311 fs = NULL;
6b6557e8 1312
3b0bdc72
UD
1313 cur_node = dfa->init_node;
1314 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1315 re_node_set_init_empty (&eps_via_nodes);
6b6557e8
UD
1316
1317 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1318 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1319
3b0bdc72
UD
1320 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1321 {
6b6557e8
UD
1322 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1323
a3022b82 1324 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
15a7d175
UD
1325 {
1326 int reg_idx;
1327 if (fs)
1328 {
1329 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1330 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1331 break;
1332 if (reg_idx == nmatch)
1333 {
1334 re_node_set_free (&eps_via_nodes);
1335 return free_fail_stack_return (fs);
1336 }
1337 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1338 &eps_via_nodes);
1339 }
1340 else
1341 {
1342 re_node_set_free (&eps_via_nodes);
1343 return REG_NOERROR;
1344 }
1345 }
3b0bdc72
UD
1346
1347 /* Proceed to next node. */
e3a87852 1348 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
15a7d175 1349 &eps_via_nodes, fs);
a3022b82 1350
bc15410e 1351 if (BE (cur_node < 0, 0))
15a7d175 1352 {
44777771
UD
1353 if (BE (cur_node == -2, 0))
1354 {
1355 re_node_set_free (&eps_via_nodes);
1356 free_fail_stack_return (fs);
1357 return REG_ESPACE;
1358 }
15a7d175
UD
1359 if (fs)
1360 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1361 &eps_via_nodes);
1362 else
1363 {
1364 re_node_set_free (&eps_via_nodes);
1365 return REG_NOMATCH;
1366 }
1367 }
3b0bdc72
UD
1368 }
1369 re_node_set_free (&eps_via_nodes);
485d775d
UD
1370 return free_fail_stack_return (fs);
1371}
1372
1373static reg_errcode_t
1374free_fail_stack_return (fs)
1375 struct re_fail_stack_t *fs;
1376{
1377 if (fs)
1378 {
1379 int fs_idx;
1380 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
15a7d175
UD
1381 {
1382 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1383 re_free (fs->stack[fs_idx].regs);
1384 }
485d775d
UD
1385 re_free (fs->stack);
1386 }
a9388965 1387 return REG_NOERROR;
3b0bdc72
UD
1388}
1389
81c64d40 1390static void
6b6557e8 1391update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
81c64d40 1392 re_dfa_t *dfa;
6b6557e8 1393 regmatch_t *pmatch, *prev_idx_match;
81c64d40
UD
1394 int cur_node, cur_idx, nmatch;
1395{
1396 int type = dfa->nodes[cur_node].type;
81c64d40
UD
1397 if (type == OP_OPEN_SUBEXP)
1398 {
6b6557e8
UD
1399 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1400
81c64d40 1401 /* We are at the first node of this sub expression. */
6b6557e8
UD
1402 if (reg_num < nmatch)
1403 {
1404 pmatch[reg_num].rm_so = cur_idx;
1405 pmatch[reg_num].rm_eo = -1;
1406 }
81c64d40
UD
1407 }
1408 else if (type == OP_CLOSE_SUBEXP)
6b6557e8
UD
1409 {
1410 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1411 if (reg_num < nmatch)
1412 {
1413 /* We are at the last node of this sub expression. */
1414 if (pmatch[reg_num].rm_so < cur_idx)
1415 {
1416 pmatch[reg_num].rm_eo = cur_idx;
1417 /* This is a non-empty match or we are not inside an optional
1418 subexpression. Accept this right away. */
1419 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1420 }
1421 else
1422 {
1423 if (dfa->nodes[cur_node].opt_subexp
1424 && prev_idx_match[reg_num].rm_so != -1)
1425 /* We transited through an empty match for an optional
1426 subexpression, like (a?)*, and this is not the subexp's
1427 first match. Copy back the old content of the registers
1428 so that matches of an inner subexpression are undone as
1429 well, like in ((a?))*. */
1430 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1431 else
1432 /* We completed a subexpression, but it may be part of
1433 an optional one, so do not update PREV_IDX_MATCH. */
1434 pmatch[reg_num].rm_eo = cur_idx;
1435 }
1436 }
1437 }
0742e48e 1438}
81c64d40 1439
0742e48e 1440/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
612546c6
UD
1441 and sift the nodes in each states according to the following rules.
1442 Updated state_log will be wrote to STATE_LOG.
3b0bdc72
UD
1443
1444 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1445 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
15a7d175
UD
1446 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1447 the LAST_NODE, we throw away the node `a'.
612546c6 1448 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
15a7d175
UD
1449 string `s' and transit to `b':
1450 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1451 away the node `a'.
1452 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
bb3f4825 1453 thrown away, we throw away the node `a'.
d0b96fc4 1454 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
15a7d175
UD
1455 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1456 node `a'.
bb3f4825 1457 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
15a7d175 1458 we throw away the node `a'. */
3b0bdc72
UD
1459
1460#define STATE_NODE_CONTAINS(state,node) \
1461 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1462
a9388965 1463static reg_errcode_t
e3a87852 1464sift_states_backward (mctx, sctx)
0742e48e
UD
1465 re_match_context_t *mctx;
1466 re_sift_context_t *sctx;
3b0bdc72 1467{
e3a87852 1468 re_dfa_t *const dfa = mctx->dfa;
a9388965 1469 reg_errcode_t err;
0742e48e
UD
1470 int null_cnt = 0;
1471 int str_idx = sctx->last_str_idx;
1472 re_node_set cur_dest;
1473 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
3b0bdc72
UD
1474
1475#ifdef DEBUG
612546c6 1476 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
3b0bdc72 1477#endif
0742e48e 1478 cur_src = &mctx->state_log[str_idx]->nodes;
3b0bdc72
UD
1479
1480 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1481 transit to the last_node and the last_node itself. */
0742e48e 1482 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
bc15410e 1483 if (BE (err != REG_NOERROR, 0))
a9388965 1484 return err;
e3a87852 1485 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
0742e48e 1486 if (BE (err != REG_NOERROR, 0))
1b2c2628 1487 goto free_return;
3b0bdc72
UD
1488
1489 /* Then check each states in the state_log. */
612546c6 1490 while (str_idx > 0)
3b0bdc72 1491 {
0742e48e 1492 int i, ret;
3b0bdc72 1493 /* Update counters. */
0742e48e
UD
1494 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1495 if (null_cnt > mctx->max_mb_elem_len)
15a7d175
UD
1496 {
1497 memset (sctx->sifted_states, '\0',
1498 sizeof (re_dfastate_t *) * str_idx);
1499 re_node_set_free (&cur_dest);
1500 return REG_NOERROR;
1501 }
0742e48e 1502 re_node_set_empty (&cur_dest);
3b0bdc72 1503 --str_idx;
0742e48e 1504 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1505 : &mctx->state_log[str_idx]->nodes);
3b0bdc72
UD
1506
1507 /* Then build the next sifted state.
15a7d175
UD
1508 We build the next sifted state on `cur_dest', and update
1509 `sifted_states[str_idx]' with `cur_dest'.
1510 Note:
1511 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1512 `cur_src' points the node_set of the old `state_log[str_idx]'. */
0742e48e 1513 for (i = 0; i < cur_src->nelem; i++)
15a7d175
UD
1514 {
1515 int prev_node = cur_src->elems[i];
1516 int naccepted = 0;
1517 re_token_type_t type = dfa->nodes[prev_node].type;
0742e48e 1518
46bf9de7 1519 if (IS_EPSILON_NODE (type))
15a7d175 1520 continue;
434d3784 1521#ifdef RE_ENABLE_I18N
15a7d175
UD
1522 /* If the node may accept `multi byte'. */
1523 if (ACCEPT_MB_NODE (type))
e3a87852 1524 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
15a7d175 1525 str_idx, sctx->last_str_idx);
3b0bdc72 1526
434d3784 1527#endif /* RE_ENABLE_I18N */
15a7d175
UD
1528 /* We don't check backreferences here.
1529 See update_cur_sifted_state(). */
1530
1531 if (!naccepted
e3a87852 1532 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
15a7d175
UD
1533 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1534 dfa->nexts[prev_node]))
1535 naccepted = 1;
1536
1537 if (naccepted == 0)
1538 continue;
1539
1540 if (sctx->limits.nelem)
1541 {
1542 int to_idx = str_idx + naccepted;
e3a87852 1543 if (check_dst_limits (mctx, &sctx->limits,
15a7d175
UD
1544 dfa->nexts[prev_node], to_idx,
1545 prev_node, str_idx))
1546 continue;
1547 }
1548 ret = re_node_set_insert (&cur_dest, prev_node);
1549 if (BE (ret == -1, 0))
1550 {
1551 err = REG_ESPACE;
1552 goto free_return;
1553 }
1554 }
3b0bdc72 1555
0742e48e 1556 /* Add all the nodes which satisfy the following conditions:
15a7d175
UD
1557 - It can epsilon transit to a node in CUR_DEST.
1558 - It is in CUR_SRC.
1559 And update state_log. */
e3a87852 1560 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
0742e48e 1561 if (BE (err != REG_NOERROR, 0))
15a7d175 1562 goto free_return;
3b0bdc72 1563 }
1b2c2628
UD
1564 err = REG_NOERROR;
1565 free_return:
0742e48e 1566 re_node_set_free (&cur_dest);
1b2c2628 1567 return err;
3b0bdc72
UD
1568}
1569
1570/* Helper functions. */
1571
25337753 1572static reg_errcode_t
7c1be3ec 1573clean_state_log_if_needed (mctx, next_state_log_idx)
3b0bdc72
UD
1574 re_match_context_t *mctx;
1575 int next_state_log_idx;
1576{
1577 int top = mctx->state_log_top;
612546c6 1578
56b168be
UD
1579 if (next_state_log_idx >= mctx->input.bufs_len
1580 || (next_state_log_idx >= mctx->input.valid_len
1581 && mctx->input.valid_len < mctx->input.len))
612546c6
UD
1582 {
1583 reg_errcode_t err;
1584 err = extend_buffers (mctx);
1585 if (BE (err != REG_NOERROR, 0))
15a7d175 1586 return err;
612546c6
UD
1587 }
1588
3b0bdc72
UD
1589 if (top < next_state_log_idx)
1590 {
612546c6 1591 memset (mctx->state_log + top + 1, '\0',
15a7d175 1592 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
3b0bdc72
UD
1593 mctx->state_log_top = next_state_log_idx;
1594 }
612546c6 1595 return REG_NOERROR;
3b0bdc72
UD
1596}
1597
1b2c2628
UD
1598static reg_errcode_t
1599merge_state_array (dfa, dst, src, num)
0742e48e
UD
1600 re_dfa_t *dfa;
1601 re_dfastate_t **dst;
1602 re_dfastate_t **src;
1603 int num;
3b0bdc72 1604{
0742e48e
UD
1605 int st_idx;
1606 reg_errcode_t err;
1607 for (st_idx = 0; st_idx < num; ++st_idx)
1608 {
1609 if (dst[st_idx] == NULL)
15a7d175 1610 dst[st_idx] = src[st_idx];
0742e48e 1611 else if (src[st_idx] != NULL)
15a7d175
UD
1612 {
1613 re_node_set merged_set;
1614 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1615 &src[st_idx]->nodes);
1616 if (BE (err != REG_NOERROR, 0))
1617 return err;
1618 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1619 re_node_set_free (&merged_set);
1620 if (BE (err != REG_NOERROR, 0))
1621 return err;
1622 }
0742e48e
UD
1623 }
1624 return REG_NOERROR;
3b0bdc72
UD
1625}
1626
0742e48e 1627static reg_errcode_t
e3a87852 1628update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
0742e48e
UD
1629 re_match_context_t *mctx;
1630 re_sift_context_t *sctx;
1631 int str_idx;
1632 re_node_set *dest_nodes;
3b0bdc72 1633{
e3a87852 1634 re_dfa_t *const dfa = mctx->dfa;
0742e48e 1635 reg_errcode_t err;
0742e48e
UD
1636 const re_node_set *candidates;
1637 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1638 : &mctx->state_log[str_idx]->nodes);
0742e48e
UD
1639
1640 /* At first, add the nodes which can epsilon transit to a node in
1641 DEST_NODE. */
485d775d
UD
1642 if (dest_nodes->nelem)
1643 {
1644 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1645 if (BE (err != REG_NOERROR, 0))
1646 return err;
1647 }
0742e48e
UD
1648
1649 /* Then, check the limitations in the current sift_context. */
485d775d 1650 if (dest_nodes->nelem && sctx->limits.nelem)
0742e48e
UD
1651 {
1652 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
15a7d175 1653 mctx->bkref_ents, str_idx);
0742e48e 1654 if (BE (err != REG_NOERROR, 0))
15a7d175 1655 return err;
0742e48e
UD
1656 }
1657
1658 /* Update state_log. */
1659 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1660 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1661 return err;
1662
0742e48e
UD
1663 if ((mctx->state_log[str_idx] != NULL
1664 && mctx->state_log[str_idx]->has_backref))
1665 {
e3a87852 1666 err = sift_states_bkref (mctx, sctx, str_idx, dest_nodes);
0742e48e 1667 if (BE (err != REG_NOERROR, 0))
15a7d175 1668 return err;
0742e48e
UD
1669 }
1670 return REG_NOERROR;
3b0bdc72
UD
1671}
1672
a9388965 1673static reg_errcode_t
0742e48e
UD
1674add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1675 re_dfa_t *dfa;
1676 re_node_set *dest_nodes;
1677 const re_node_set *candidates;
3b0bdc72 1678{
0742e48e
UD
1679 reg_errcode_t err;
1680 int src_idx;
1681 re_node_set src_copy;
1682
1683 err = re_node_set_init_copy (&src_copy, dest_nodes);
1684 if (BE (err != REG_NOERROR, 0))
1685 return err;
1686 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
3b0bdc72 1687 {
0742e48e 1688 err = re_node_set_add_intersect (dest_nodes, candidates,
15a7d175
UD
1689 dfa->inveclosures
1690 + src_copy.elems[src_idx]);
0742e48e 1691 if (BE (err != REG_NOERROR, 0))
15a7d175
UD
1692 {
1693 re_node_set_free (&src_copy);
1694 return err;
1695 }
0742e48e
UD
1696 }
1697 re_node_set_free (&src_copy);
1698 return REG_NOERROR;
1699}
3b0bdc72 1700
0742e48e
UD
1701static reg_errcode_t
1702sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1703 re_dfa_t *dfa;
1704 int node;
1705 re_node_set *dest_nodes;
1706 const re_node_set *candidates;
1707{
1708 int ecl_idx;
1709 reg_errcode_t err;
1710 re_node_set *inv_eclosure = dfa->inveclosures + node;
1711 re_node_set except_nodes;
1712 re_node_set_init_empty (&except_nodes);
1713 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1714 {
15a7d175
UD
1715 int cur_node = inv_eclosure->elems[ecl_idx];
1716 if (cur_node == node)
1717 continue;
1718 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1719 {
1720 int edst1 = dfa->edests[cur_node].elems[0];
1721 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1722 ? dfa->edests[cur_node].elems[1] : -1);
1723 if ((!re_node_set_contains (inv_eclosure, edst1)
1724 && re_node_set_contains (dest_nodes, edst1))
1725 || (edst2 > 0
1726 && !re_node_set_contains (inv_eclosure, edst2)
1727 && re_node_set_contains (dest_nodes, edst2)))
1728 {
1729 err = re_node_set_add_intersect (&except_nodes, candidates,
1730 dfa->inveclosures + cur_node);
1731 if (BE (err != REG_NOERROR, 0))
1732 {
1733 re_node_set_free (&except_nodes);
1734 return err;
1735 }
1736 }
1737 }
0742e48e
UD
1738 }
1739 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1740 {
15a7d175
UD
1741 int cur_node = inv_eclosure->elems[ecl_idx];
1742 if (!re_node_set_contains (&except_nodes, cur_node))
1743 {
1744 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1745 re_node_set_remove_at (dest_nodes, idx);
1746 }
0742e48e
UD
1747 }
1748 re_node_set_free (&except_nodes);
1749 return REG_NOERROR;
1750}
1751
a3022b82 1752static int
e3a87852 1753check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
a3022b82 1754 re_match_context_t *mctx;
e3a87852 1755 re_node_set *limits;
a3022b82
UD
1756 int dst_node, dst_idx, src_node, src_idx;
1757{
e3a87852 1758 re_dfa_t *const dfa = mctx->dfa;
a3022b82
UD
1759 int lim_idx, src_pos, dst_pos;
1760
1761 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1762 {
485d775d 1763 int subexp_idx;
a3022b82
UD
1764 struct re_backref_cache_entry *ent;
1765 ent = mctx->bkref_ents + limits->elems[lim_idx];
485d775d 1766 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
a3022b82 1767
e3a87852 1768 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
15a7d175
UD
1769 dfa->eclosures + dst_node,
1770 subexp_idx, dst_node, dst_idx);
e3a87852 1771 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
15a7d175
UD
1772 dfa->eclosures + src_node,
1773 subexp_idx, src_node, src_idx);
a3022b82
UD
1774
1775 /* In case of:
15a7d175
UD
1776 <src> <dst> ( <subexp> )
1777 ( <subexp> ) <src> <dst>
1778 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
a3022b82 1779 if (src_pos == dst_pos)
15a7d175 1780 continue; /* This is unrelated limitation. */
a3022b82 1781 else
15a7d175 1782 return 1;
a3022b82
UD
1783 }
1784 return 0;
1785}
1786
1787static int
e3a87852 1788check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node,
15a7d175 1789 str_idx)
a3022b82
UD
1790 re_match_context_t *mctx;
1791 re_node_set *eclosures;
d0b96fc4 1792 int limit, subexp_idx, from_node, str_idx;
a3022b82 1793{
e3a87852 1794 re_dfa_t *const dfa = mctx->dfa;
a3022b82 1795 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1cef7b3c 1796 int node_idx;
d0b96fc4
UD
1797
1798 /* If we are outside the range of the subexpression, return -1 or 1. */
1799 if (str_idx < lim->subexp_from)
1800 return -1;
1801
1802 if (lim->subexp_to < str_idx)
1803 return 1;
1804
1805 /* If we are within the subexpression, return 0. */
1806 if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
1807 return 0;
1808
1809 /* Else, we are on the boundary: examine the nodes on the epsilon
1810 closure. */
1cef7b3c
UD
1811 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1812 {
1813 int node = eclosures->elems[node_idx];
d0b96fc4
UD
1814 switch (dfa->nodes[node].type)
1815 {
1816 case OP_BACK_REF:
1cef7b3c
UD
1817 {
1818 int bi = search_cur_bkref_entry (mctx, str_idx);
1819 for (; bi < mctx->nbkref_ents; ++bi)
1820 {
1821 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
d0b96fc4 1822 int dst, cpos;
f0c7c524 1823
d0b96fc4
UD
1824 /* If this backreference goes beyond the point we're
1825 examining, don't go any further. */
1cef7b3c
UD
1826 if (ent->str_idx > str_idx)
1827 break;
d0b96fc4
UD
1828
1829 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1830 continue;
1831
1832 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1833 OP_CLOSE_SUBEXP cases below. But, if the
1834 destination node is the same node as the source
1835 node, don't recurse because it would cause an
1836 infinite loop: a regex that exhibits this behavior
1837 is ()\1*\1* */
1cef7b3c 1838 dst = dfa->edests[node].elems[0];
d0b96fc4
UD
1839 if (dst == from_node)
1840 {
1841 if (str_idx == lim->subexp_from)
1842 return -1;
1843 else /* if (str_idx == lim->subexp_to) */
1844 return 0;
1845 }
1846
e3a87852 1847 cpos = check_dst_limits_calc_pos (mctx, limit,
1cef7b3c
UD
1848 dfa->eclosures + dst,
1849 subexp_idx, dst,
1850 str_idx);
d0b96fc4
UD
1851
1852 if (cpos == -1 && str_idx == lim->subexp_from)
1853 return -1;
1854
1855 if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
1856 return 0;
1cef7b3c 1857 }
15a7d175
UD
1858 break;
1859 }
f0c7c524 1860
d0b96fc4
UD
1861 case OP_OPEN_SUBEXP:
1862 if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
1863 return -1;
1864 break;
f0c7c524 1865
d0b96fc4
UD
1866 case OP_CLOSE_SUBEXP:
1867 if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
1868 return 0;
1869 break;
1870
1871 default:
15a7d175
UD
1872 break;
1873 }
a3022b82 1874 }
d0b96fc4
UD
1875
1876 if (str_idx == lim->subexp_to)
1877 return 1;
1878 else
1879 return 0;
a3022b82
UD
1880}
1881
0742e48e
UD
1882/* Check the limitations of sub expressions LIMITS, and remove the nodes
1883 which are against limitations from DEST_NODES. */
1884
1885static reg_errcode_t
1886check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1887 re_dfa_t *dfa;
1888 re_node_set *dest_nodes;
1889 const re_node_set *candidates;
1890 re_node_set *limits;
1891 struct re_backref_cache_entry *bkref_ents;
1892 int str_idx;
1893{
1894 reg_errcode_t err;
1895 int node_idx, lim_idx;
1896
1897 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1898 {
485d775d 1899 int subexp_idx;
0742e48e
UD
1900 struct re_backref_cache_entry *ent;
1901 ent = bkref_ents + limits->elems[lim_idx];
1902
1903 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
15a7d175 1904 continue; /* This is unrelated limitation. */
0742e48e 1905
485d775d 1906 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
0742e48e 1907 if (ent->subexp_to == str_idx)
15a7d175
UD
1908 {
1909 int ops_node = -1;
1910 int cls_node = -1;
1911 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1912 {
1913 int node = dest_nodes->elems[node_idx];
46bf9de7 1914 re_token_type_t type = dfa->nodes[node].type;
15a7d175
UD
1915 if (type == OP_OPEN_SUBEXP
1916 && subexp_idx == dfa->nodes[node].opr.idx)
1917 ops_node = node;
1918 else if (type == OP_CLOSE_SUBEXP
1919 && subexp_idx == dfa->nodes[node].opr.idx)
1920 cls_node = node;
1921 }
1922
1923 /* Check the limitation of the open subexpression. */
1924 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1925 if (ops_node >= 0)
1926 {
46bf9de7
UD
1927 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1928 candidates);
15a7d175
UD
1929 if (BE (err != REG_NOERROR, 0))
1930 return err;
1931 }
46bf9de7 1932
15a7d175 1933 /* Check the limitation of the close subexpression. */
46bf9de7
UD
1934 if (cls_node >= 0)
1935 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1936 {
1937 int node = dest_nodes->elems[node_idx];
1938 if (!re_node_set_contains (dfa->inveclosures + node,
1939 cls_node)
1940 && !re_node_set_contains (dfa->eclosures + node,
1941 cls_node))
1942 {
1943 /* It is against this limitation.
1944 Remove it form the current sifted state. */
1945 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1946 candidates);
1947 if (BE (err != REG_NOERROR, 0))
1948 return err;
1949 --node_idx;
1950 }
1951 }
15a7d175 1952 }
0742e48e 1953 else /* (ent->subexp_to != str_idx) */
15a7d175
UD
1954 {
1955 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1956 {
1957 int node = dest_nodes->elems[node_idx];
46bf9de7 1958 re_token_type_t type = dfa->nodes[node].type;
15a7d175
UD
1959 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1960 {
1961 if (subexp_idx != dfa->nodes[node].opr.idx)
1962 continue;
1963 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1964 || (type == OP_OPEN_SUBEXP))
1965 {
1966 /* It is against this limitation.
1967 Remove it form the current sifted state. */
46bf9de7
UD
1968 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1969 candidates);
15a7d175
UD
1970 if (BE (err != REG_NOERROR, 0))
1971 return err;
1972 }
1973 }
1974 }
1975 }
0742e48e
UD
1976 }
1977 return REG_NOERROR;
1978}
1979
0742e48e 1980static reg_errcode_t
e3a87852 1981sift_states_bkref (mctx, sctx, str_idx, dest_nodes)
0742e48e
UD
1982 re_match_context_t *mctx;
1983 re_sift_context_t *sctx;
1984 int str_idx;
1985 re_node_set *dest_nodes;
1986{
e3a87852 1987 re_dfa_t *const dfa = mctx->dfa;
0742e48e 1988 reg_errcode_t err;
0742e48e
UD
1989 int node_idx, node;
1990 re_sift_context_t local_sctx;
1991 const re_node_set *candidates;
1992 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
15a7d175 1993 : &mctx->state_log[str_idx]->nodes);
0742e48e
UD
1994 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1995
1996 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
1997 {
56b168be 1998 int cur_bkref_idx = re_string_cur_idx (&mctx->input);
0742e48e
UD
1999 re_token_type_t type;
2000 node = candidates->elems[node_idx];
2001 type = dfa->nodes[node].type;
0742e48e 2002 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
15a7d175 2003 continue;
0742e48e
UD
2004 /* Avoid infinite loop for the REs like "()\1+". */
2005 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
15a7d175 2006 continue;
0742e48e 2007 if (type == OP_BACK_REF)
15a7d175 2008 {
6291ee3c
UD
2009 int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2010 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
15a7d175
UD
2011 {
2012 int disabled_idx, subexp_len, to_idx, dst_node;
2013 struct re_backref_cache_entry *entry;
2014 entry = mctx->bkref_ents + enabled_idx;
6291ee3c
UD
2015 if (entry->str_idx > str_idx)
2016 break;
2017 if (entry->node != node)
2018 continue;
15a7d175
UD
2019 subexp_len = entry->subexp_to - entry->subexp_from;
2020 to_idx = str_idx + subexp_len;
2021 dst_node = (subexp_len ? dfa->nexts[node]
2022 : dfa->edests[node].elems[0]);
2023
6291ee3c
UD
2024 if (to_idx > sctx->last_str_idx
2025 || sctx->sifted_states[to_idx] == NULL
2026 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
2027 dst_node)
e3a87852 2028 || check_dst_limits (mctx, &sctx->limits, node,
6291ee3c 2029 str_idx, dst_node, to_idx))
15a7d175 2030 continue;
15a7d175
UD
2031 {
2032 re_dfastate_t *cur_state;
2033 entry->flag = 0;
2034 for (disabled_idx = enabled_idx + 1;
2035 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
2036 {
2037 struct re_backref_cache_entry *entry2;
2038 entry2 = mctx->bkref_ents + disabled_idx;
6291ee3c
UD
2039 if (entry2->str_idx > str_idx)
2040 break;
2041 entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
15a7d175
UD
2042 }
2043
2044 if (local_sctx.sifted_states == NULL)
2045 {
2046 local_sctx = *sctx;
2047 err = re_node_set_init_copy (&local_sctx.limits,
2048 &sctx->limits);
2049 if (BE (err != REG_NOERROR, 0))
1b2c2628 2050 goto free_return;
15a7d175
UD
2051 }
2052 local_sctx.last_node = node;
2053 local_sctx.last_str_idx = str_idx;
2054 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2055 if (BE (err < 0, 0))
1b2c2628
UD
2056 {
2057 err = REG_ESPACE;
2058 goto free_return;
2059 }
15a7d175 2060 cur_state = local_sctx.sifted_states[str_idx];
e3a87852 2061 err = sift_states_backward (mctx, &local_sctx);
15a7d175 2062 if (BE (err != REG_NOERROR, 0))
1b2c2628 2063 goto free_return;
15a7d175
UD
2064 if (sctx->limited_states != NULL)
2065 {
2066 err = merge_state_array (dfa, sctx->limited_states,
2067 local_sctx.sifted_states,
2068 str_idx + 1);
2069 if (BE (err != REG_NOERROR, 0))
2070 goto free_return;
2071 }
2072 local_sctx.sifted_states[str_idx] = cur_state;
2073 re_node_set_remove (&local_sctx.limits, enabled_idx);
485d775d
UD
2074 /* We must not use the variable entry here, since
2075 mctx->bkref_ents might be realloced. */
2076 mctx->bkref_ents[enabled_idx].flag = 1;
15a7d175
UD
2077 }
2078 }
6291ee3c
UD
2079 enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2080 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
15a7d175
UD
2081 {
2082 struct re_backref_cache_entry *entry;
2083 entry = mctx->bkref_ents + enabled_idx;
6291ee3c
UD
2084 if (entry->str_idx > str_idx)
2085 break;
2086 if (entry->node == node)
15a7d175
UD
2087 entry->flag = 0;
2088 }
2089 }
0742e48e 2090 }
1b2c2628
UD
2091 err = REG_NOERROR;
2092 free_return:
0742e48e
UD
2093 if (local_sctx.sifted_states != NULL)
2094 {
0742e48e
UD
2095 re_node_set_free (&local_sctx.limits);
2096 }
2097
1b2c2628 2098 return err;
0742e48e
UD
2099}
2100
2101
2102#ifdef RE_ENABLE_I18N
2103static int
e3a87852 2104sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
0742e48e
UD
2105 const re_match_context_t *mctx;
2106 re_sift_context_t *sctx;
2107 int node_idx, str_idx, max_str_idx;
2108{
e3a87852 2109 re_dfa_t *const dfa = mctx->dfa;
0742e48e
UD
2110 int naccepted;
2111 /* Check the node can accept `multi byte'. */
56b168be 2112 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
0742e48e
UD
2113 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2114 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
15a7d175 2115 dfa->nexts[node_idx]))
0742e48e 2116 /* The node can't accept the `multi byte', or the
bb3f4825 2117 destination was already thrown away, then the node
0742e48e
UD
2118 could't accept the current input `multi byte'. */
2119 naccepted = 0;
2120 /* Otherwise, it is sure that the node could accept
2121 `naccepted' bytes input. */
2122 return naccepted;
2123}
2124#endif /* RE_ENABLE_I18N */
2125
3b0bdc72
UD
2126\f
2127/* Functions for state transition. */
2128
2129/* Return the next state to which the current state STATE will transit by
2130 accepting the current input byte, and update STATE_LOG if necessary.
2131 If STATE can accept a multibyte char/collating element/back reference
2132 update the destination of STATE_LOG. */
2133
2134static re_dfastate_t *
e3a87852 2135transit_state (err, mctx, state)
a9388965 2136 reg_errcode_t *err;
a9388965 2137 re_match_context_t *mctx;
612546c6 2138 re_dfastate_t *state;
3b0bdc72 2139{
e3a87852 2140 re_dfa_t *const dfa = mctx->dfa;
3b0bdc72
UD
2141 re_dfastate_t **trtable, *next_state;
2142 unsigned char ch;
2143
56b168be
UD
2144 if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
2145 || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
2146 && mctx->input.valid_len < mctx->input.len))
612546c6
UD
2147 {
2148 *err = extend_buffers (mctx);
2149 if (BE (*err != REG_NOERROR, 0))
15a7d175 2150 return NULL;
612546c6
UD
2151 }
2152
434d3784 2153#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2154 /* If the current state can accept multibyte. */
2155 if (state->accept_mb)
15a7d175 2156 {
e3a87852 2157 *err = transit_state_mb (mctx, state);
15a7d175
UD
2158 if (BE (*err != REG_NOERROR, 0))
2159 return NULL;
2160 }
434d3784 2161#endif /* RE_ENABLE_I18N */
3b0bdc72 2162
4c595adb
UD
2163 /* Then decide the next state with the single byte. */
2164 if (1)
2165 {
2166 /* Use transition table */
2167 ch = re_string_fetch_byte (&mctx->input);
2168 trtable = state->trtable;
2169 if (trtable == NULL)
2170 {
2171 trtable = build_trtable (dfa, state);
2172 if (trtable == NULL)
c13c99fa 2173 {
4c595adb
UD
2174 *err = REG_ESPACE;
2175 return NULL;
15a7d175 2176 }
4c595adb
UD
2177 }
2178 if (BE (state->word_trtable, 0))
2179 {
2180 unsigned int context;
2181 context
2182 = re_string_context_at (&mctx->input,
2183 re_string_cur_idx (&mctx->input) - 1,
2184 mctx->eflags);
2185 if (IS_WORD_CONTEXT (context))
2186 return trtable[ch + SBC_MAX];
c13c99fa 2187 else
4c595adb 2188 return trtable[ch];
15a7d175 2189 }
3b0bdc72 2190 else
4c595adb 2191 return trtable[ch];
3b0bdc72 2192 }
4c595adb
UD
2193#if 0
2194 else
2195 /* don't use transition table */
2196 return transit_state_sb (err, mctx, state);
2197#endif
2198}
3b0bdc72 2199
4c595adb
UD
2200/* Update the state_log if we need */
2201re_dfastate_t *
2202merge_state_with_log (err, mctx, next_state)
2203 reg_errcode_t *err;
2204 re_match_context_t *mctx;
2205 re_dfastate_t *next_state;
2206{
2207 re_dfa_t *const dfa = mctx->dfa;
2208 int cur_idx = re_string_cur_idx (&mctx->input);
2209
2210 if (cur_idx > mctx->state_log_top)
3b0bdc72 2211 {
4c595adb
UD
2212 mctx->state_log[cur_idx] = next_state;
2213 mctx->state_log_top = cur_idx;
2214 }
2215 else if (mctx->state_log[cur_idx] == 0)
2216 {
2217 mctx->state_log[cur_idx] = next_state;
2218 }
2219 else
2220 {
2221 re_dfastate_t *pstate;
2222 unsigned int context;
2223 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2224 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2225 the destination of a multibyte char/collating element/
2226 back reference. Then the next state is the union set of
2227 these destinations and the results of the transition table. */
2228 pstate = mctx->state_log[cur_idx];
2229 log_nodes = pstate->entrance_nodes;
2230 if (next_state != NULL)
2231 {
2232 table_nodes = next_state->entrance_nodes;
2233 *err = re_node_set_init_union (&next_nodes, table_nodes,
15a7d175 2234 log_nodes);
4c595adb
UD
2235 if (BE (*err != REG_NOERROR, 0))
2236 return NULL;
2237 }
2238 else
2239 next_nodes = *log_nodes;
2240 /* Note: We already add the nodes of the initial state,
2241 then we don't need to add them here. */
2242
2243 context = re_string_context_at (&mctx->input,
2244 re_string_cur_idx (&mctx->input) - 1,
2245 mctx->eflags);
2246 next_state = mctx->state_log[cur_idx]
2247 = re_acquire_state_context (err, dfa, &next_nodes, context);
2248 /* We don't need to check errors here, since the return value of
2249 this function is next_state and ERR is already set. */
2250
2251 if (table_nodes != NULL)
2252 re_node_set_free (&next_nodes);
6291ee3c
UD
2253 }
2254
bb3f4825 2255 if (BE (dfa->nbackref, 0) && next_state != NULL)
6291ee3c 2256 {
bb3f4825
UD
2257 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2258 later. We must check them here, since the back references in the
2259 next state might use them. */
e3a87852 2260 *err = check_subexp_matching_top (mctx, &next_state->nodes,
6291ee3c
UD
2261 cur_idx);
2262 if (BE (*err != REG_NOERROR, 0))
2263 return NULL;
6291ee3c 2264
bb3f4825
UD
2265 /* If the next state has back references. */
2266 if (next_state->has_backref)
2267 {
e3a87852 2268 *err = transit_state_bkref (mctx, &next_state->nodes);
bb3f4825
UD
2269 if (BE (*err != REG_NOERROR, 0))
2270 return NULL;
2271 next_state = mctx->state_log[cur_idx];
2272 }
3b0bdc72 2273 }
4c595adb 2274
3b0bdc72
UD
2275 return next_state;
2276}
2277
4c595adb
UD
2278/* Skip bytes in the input that correspond to part of a
2279 multi-byte match, then look in the log for a state
2280 from which to restart matching. */
2281re_dfastate_t *
2282find_recover_state (err, mctx)
2283 reg_errcode_t *err;
2284 re_match_context_t *mctx;
2285{
2286 re_dfastate_t *cur_state = NULL;
2287 do
2288 {
2289 int max = mctx->state_log_top;
2290 int cur_str_idx = re_string_cur_idx (&mctx->input);
2291
2292 do
2293 {
2294 if (++cur_str_idx > max)
2295 return NULL;
2296 re_string_skip_bytes (&mctx->input, 1);
2297 }
2298 while (mctx->state_log[cur_str_idx] == NULL);
2299
2300 cur_state = merge_state_with_log (err, mctx, NULL);
2301 }
2302 while (err == REG_NOERROR && cur_state == NULL);
2303 return cur_state;
2304}
2305
3b0bdc72
UD
2306/* Helper functions for transit_state. */
2307
6291ee3c
UD
2308/* From the node set CUR_NODES, pick up the nodes whose types are
2309 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2310 expression. And register them to use them later for evaluating the
2311 correspoding back references. */
2312
2313static reg_errcode_t
e3a87852 2314check_subexp_matching_top (mctx, cur_nodes, str_idx)
6291ee3c
UD
2315 re_match_context_t *mctx;
2316 re_node_set *cur_nodes;
2317 int str_idx;
2318{
e3a87852 2319 re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
2320 int node_idx;
2321 reg_errcode_t err;
2322
2323 /* TODO: This isn't efficient.
2324 Because there might be more than one nodes whose types are
2325 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2326 nodes.
2327 E.g. RE: (a){2} */
2328 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2329 {
2330 int node = cur_nodes->elems[node_idx];
2331 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
ce859332 2332 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
6291ee3c
UD
2333 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2334 {
2335 err = match_ctx_add_subtop (mctx, node, str_idx);
2336 if (BE (err != REG_NOERROR, 0))
2337 return err;
2338 }
2339 }
2340 return REG_NOERROR;
2341}
2342
c13c99fa 2343#if 0
3b0bdc72
UD
2344/* Return the next state to which the current state STATE will transit by
2345 accepting the current input byte. */
2346
2347static re_dfastate_t *
e3a87852 2348transit_state_sb (err, mctx, state)
a9388965 2349 reg_errcode_t *err;
a9388965 2350 re_match_context_t *mctx;
e3a87852 2351 re_dfastate_t *state;
3b0bdc72 2352{
e3a87852 2353 re_dfa_t *const dfa = mctx->dfa;
3b0bdc72
UD
2354 re_node_set next_nodes;
2355 re_dfastate_t *next_state;
56b168be 2356 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72
UD
2357 unsigned int context;
2358
a9388965 2359 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
bc15410e 2360 if (BE (*err != REG_NOERROR, 0))
a9388965 2361 return NULL;
3b0bdc72
UD
2362 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2363 {
2364 int cur_node = state->nodes.elems[node_cnt];
e3a87852 2365 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
15a7d175
UD
2366 {
2367 *err = re_node_set_merge (&next_nodes,
2368 dfa->eclosures + dfa->nexts[cur_node]);
2369 if (BE (*err != REG_NOERROR, 0))
2370 {
2371 re_node_set_free (&next_nodes);
2372 return NULL;
2373 }
2374 }
3b0bdc72 2375 }
56b168be 2376 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
a9388965
UD
2377 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2378 /* We don't need to check errors here, since the return value of
2379 this function is next_state and ERR is already set. */
2380
3b0bdc72 2381 re_node_set_free (&next_nodes);
56b168be 2382 re_string_skip_bytes (&mctx->input, 1);
3b0bdc72
UD
2383 return next_state;
2384}
c13c99fa 2385#endif
3b0bdc72 2386
434d3784 2387#ifdef RE_ENABLE_I18N
a9388965 2388static reg_errcode_t
e3a87852 2389transit_state_mb (mctx, pstate)
3b0bdc72 2390 re_match_context_t *mctx;
e3a87852 2391 re_dfastate_t *pstate;
3b0bdc72 2392{
e3a87852 2393 re_dfa_t *const dfa = mctx->dfa;
a9388965 2394 reg_errcode_t err;
3b0bdc72
UD
2395 int i;
2396
2397 for (i = 0; i < pstate->nodes.nelem; ++i)
2398 {
2399 re_node_set dest_nodes, *new_nodes;
2400 int cur_node_idx = pstate->nodes.elems[i];
2401 int naccepted = 0, dest_idx;
2402 unsigned int context;
2403 re_dfastate_t *dest_state;
2404
485d775d 2405 if (dfa->nodes[cur_node_idx].constraint)
15a7d175 2406 {
56b168be
UD
2407 context = re_string_context_at (&mctx->input,
2408 re_string_cur_idx (&mctx->input),
2409 mctx->eflags);
15a7d175
UD
2410 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2411 context))
2412 continue;
2413 }
3b0bdc72 2414
ad7f28c2 2415 /* How many bytes the node can accept? */
3b0bdc72 2416 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
56b168be
UD
2417 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2418 re_string_cur_idx (&mctx->input));
3b0bdc72 2419 if (naccepted == 0)
15a7d175 2420 continue;
3b0bdc72
UD
2421
2422 /* The node can accepts `naccepted' bytes. */
56b168be 2423 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
0742e48e 2424 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
15a7d175 2425 : mctx->max_mb_elem_len);
7c1be3ec 2426 err = clean_state_log_if_needed (mctx, dest_idx);
612546c6 2427 if (BE (err != REG_NOERROR, 0))
15a7d175 2428 return err;
3b0bdc72
UD
2429#ifdef DEBUG
2430 assert (dfa->nexts[cur_node_idx] != -1);
2431#endif
2432 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
15a7d175 2433 then we use pstate->nodes.elems[i] instead. */
3b0bdc72
UD
2434 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2435
612546c6 2436 dest_state = mctx->state_log[dest_idx];
3b0bdc72 2437 if (dest_state == NULL)
15a7d175 2438 dest_nodes = *new_nodes;
3b0bdc72 2439 else
15a7d175
UD
2440 {
2441 err = re_node_set_init_union (&dest_nodes,
2442 dest_state->entrance_nodes, new_nodes);
2443 if (BE (err != REG_NOERROR, 0))
2444 return err;
2445 }
56b168be 2446 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
612546c6 2447 mctx->state_log[dest_idx]
15a7d175 2448 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
3b0bdc72 2449 if (dest_state != NULL)
15a7d175 2450 re_node_set_free (&dest_nodes);
1b2c2628 2451 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
15a7d175 2452 return err;
3b0bdc72 2453 }
a9388965 2454 return REG_NOERROR;
3b0bdc72 2455}
434d3784 2456#endif /* RE_ENABLE_I18N */
3b0bdc72 2457
a9388965 2458static reg_errcode_t
e3a87852 2459transit_state_bkref (mctx, nodes)
3b0bdc72 2460 re_match_context_t *mctx;
e3a87852 2461 const re_node_set *nodes;
3b0bdc72 2462{
e3a87852 2463 re_dfa_t *const dfa = mctx->dfa;
a9388965 2464 reg_errcode_t err;
0742e48e 2465 int i;
56b168be 2466 int cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72
UD
2467
2468 for (i = 0; i < nodes->nelem; ++i)
2469 {
6291ee3c 2470 int dest_str_idx, prev_nelem, bkc_idx;
3b0bdc72
UD
2471 int node_idx = nodes->elems[i];
2472 unsigned int context;
fe9434bb 2473 const re_token_t *node = dfa->nodes + node_idx;
3b0bdc72
UD
2474 re_node_set *new_dest_nodes;
2475
2476 /* Check whether `node' is a backreference or not. */
6291ee3c 2477 if (node->type != OP_BACK_REF)
15a7d175 2478 continue;
485d775d
UD
2479
2480 if (node->constraint)
15a7d175 2481 {
56b168be
UD
2482 context = re_string_context_at (&mctx->input, cur_str_idx,
2483 mctx->eflags);
15a7d175
UD
2484 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2485 continue;
2486 }
3b0bdc72
UD
2487
2488 /* `node' is a backreference.
15a7d175 2489 Check the substring which the substring matched. */
6291ee3c 2490 bkc_idx = mctx->nbkref_ents;
e3a87852 2491 err = get_subexp (mctx, node_idx, cur_str_idx);
612546c6 2492 if (BE (err != REG_NOERROR, 0))
15a7d175 2493 goto free_return;
3b0bdc72
UD
2494
2495 /* And add the epsilon closures (which is `new_dest_nodes') of
15a7d175 2496 the backreference to appropriate state_log. */
3b0bdc72
UD
2497#ifdef DEBUG
2498 assert (dfa->nexts[node_idx] != -1);
2499#endif
6291ee3c 2500 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
15a7d175
UD
2501 {
2502 int subexp_len;
2503 re_dfastate_t *dest_state;
2504 struct re_backref_cache_entry *bkref_ent;
2505 bkref_ent = mctx->bkref_ents + bkc_idx;
2506 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2507 continue;
2508 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2509 new_dest_nodes = (subexp_len == 0
2510 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2511 : dfa->eclosures + dfa->nexts[node_idx]);
2512 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2513 - bkref_ent->subexp_from);
56b168be
UD
2514 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2515 mctx->eflags);
15a7d175
UD
2516 dest_state = mctx->state_log[dest_str_idx];
2517 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2518 : mctx->state_log[cur_str_idx]->nodes.nelem);
2519 /* Add `new_dest_node' to state_log. */
2520 if (dest_state == NULL)
2521 {
2522 mctx->state_log[dest_str_idx]
2523 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2524 context);
2525 if (BE (mctx->state_log[dest_str_idx] == NULL
2526 && err != REG_NOERROR, 0))
2527 goto free_return;
2528 }
2529 else
2530 {
2531 re_node_set dest_nodes;
2532 err = re_node_set_init_union (&dest_nodes,
2533 dest_state->entrance_nodes,
2534 new_dest_nodes);
2535 if (BE (err != REG_NOERROR, 0))
2536 {
2537 re_node_set_free (&dest_nodes);
2538 goto free_return;
2539 }
2540 mctx->state_log[dest_str_idx]
2541 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2542 re_node_set_free (&dest_nodes);
2543 if (BE (mctx->state_log[dest_str_idx] == NULL
2544 && err != REG_NOERROR, 0))
2545 goto free_return;
2546 }
2547 /* We need to check recursively if the backreference can epsilon
2548 transit. */
2549 if (subexp_len == 0
2550 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2551 {
e3a87852 2552 err = check_subexp_matching_top (mctx, new_dest_nodes,
6291ee3c
UD
2553 cur_str_idx);
2554 if (BE (err != REG_NOERROR, 0))
2555 goto free_return;
e3a87852 2556 err = transit_state_bkref (mctx, new_dest_nodes);
15a7d175
UD
2557 if (BE (err != REG_NOERROR, 0))
2558 goto free_return;
2559 }
2560 }
3b0bdc72 2561 }
1b2c2628
UD
2562 err = REG_NOERROR;
2563 free_return:
1b2c2628 2564 return err;
3b0bdc72
UD
2565}
2566
6291ee3c
UD
2567/* Enumerate all the candidates which the backreference BKREF_NODE can match
2568 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2569 Note that we might collect inappropriate candidates here.
2570 However, the cost of checking them strictly here is too high, then we
2571 delay these checking for prune_impossible_nodes(). */
3b0bdc72 2572
6291ee3c 2573static reg_errcode_t
e3a87852 2574get_subexp (mctx, bkref_node, bkref_str_idx)
6291ee3c
UD
2575 re_match_context_t *mctx;
2576 int bkref_node, bkref_str_idx;
3b0bdc72 2577{
e3a87852 2578 re_dfa_t *const dfa = mctx->dfa;
6291ee3c 2579 int subexp_num, sub_top_idx;
56b168be 2580 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
6291ee3c
UD
2581 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2582 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2583 for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2584 {
fe9434bb
UD
2585 const struct re_backref_cache_entry *entry
2586 = &mctx->bkref_ents[cache_idx];
6291ee3c
UD
2587 if (entry->str_idx > bkref_str_idx)
2588 break;
2589 if (entry->node == bkref_node)
2590 return REG_NOERROR; /* We already checked it. */
2591 }
2592 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2593
2594 /* For each sub expression */
2595 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2596 {
2597 reg_errcode_t err;
2598 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2599 re_sub_match_last_t *sub_last;
7c1be3ec 2600 int sub_last_idx, sl_str, bkref_str_off;
6291ee3c
UD
2601
2602 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2603 continue; /* It isn't related. */
2604
2605 sl_str = sub_top->str_idx;
7c1be3ec 2606 bkref_str_off = bkref_str_idx;
6291ee3c
UD
2607 /* At first, check the last node of sub expressions we already
2608 evaluated. */
2609 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2610 {
2611 int sl_str_diff;
2612 sub_last = sub_top->lasts[sub_last_idx];
2613 sl_str_diff = sub_last->str_idx - sl_str;
2614 /* The matched string by the sub expression match with the substring
2615 at the back reference? */
c1baba0f
UD
2616 if (sl_str_diff > 0)
2617 {
2618 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2619 {
2620 /* Not enough chars for a successful match. */
2621 if (bkref_str_off + sl_str_diff > mctx->input.len)
2622 break;
2623
2624 err = clean_state_log_if_needed (mctx,
2625 bkref_str_off
2626 + sl_str_diff);
2627 if (BE (err != REG_NOERROR, 0))
2628 return err;
2629 buf = (const char *) re_string_get_buffer (&mctx->input);
2630 }
2631 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2632 break; /* We don't need to search this sub expression any more. */
2633 }
7c1be3ec 2634 bkref_str_off += sl_str_diff;
6291ee3c 2635 sl_str += sl_str_diff;
e3a87852 2636 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
6291ee3c 2637 bkref_str_idx);
3ee363e2 2638
7c1be3ec
UD
2639 /* Reload buf, since the preceding call might have reallocated
2640 the buffer. */
56b168be 2641 buf = (const char *) re_string_get_buffer (&mctx->input);
3ee363e2 2642
6291ee3c
UD
2643 if (err == REG_NOMATCH)
2644 continue;
2645 if (BE (err != REG_NOERROR, 0))
2646 return err;
2647 }
7c1be3ec 2648
6291ee3c
UD
2649 if (sub_last_idx < sub_top->nlasts)
2650 continue;
2651 if (sub_last_idx > 0)
2652 ++sl_str;
2653 /* Then, search for the other last nodes of the sub expression. */
2654 for (; sl_str <= bkref_str_idx; ++sl_str)
2655 {
2656 int cls_node, sl_str_off;
fe9434bb 2657 const re_node_set *nodes;
6291ee3c
UD
2658 sl_str_off = sl_str - sub_top->str_idx;
2659 /* The matched string by the sub expression match with the substring
2660 at the back reference? */
c1baba0f
UD
2661 if (sl_str_off > 0)
2662 {
2663 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2664 {
2665 /* If we are at the end of the input, we cannot match. */
2666 if (bkref_str_off >= mctx->input.len)
2667 break;
2668
2669 err = extend_buffers (mctx);
2670 if (BE (err != REG_NOERROR, 0))
2671 return err;
2672
2673 buf = (const char *) re_string_get_buffer (&mctx->input);
2674 }
2675 if (buf [bkref_str_off++] != buf[sl_str - 1])
2676 break; /* We don't need to search this sub expression
2677 any more. */
2678 }
6291ee3c
UD
2679 if (mctx->state_log[sl_str] == NULL)
2680 continue;
2681 /* Does this state have a ')' of the sub expression? */
2682 nodes = &mctx->state_log[sl_str]->nodes;
0ce7f49c 2683 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
6291ee3c
UD
2684 if (cls_node == -1)
2685 continue; /* No. */
2686 if (sub_top->path == NULL)
2687 {
2688 sub_top->path = calloc (sizeof (state_array_t),
2689 sl_str - sub_top->str_idx + 1);
2690 if (sub_top->path == NULL)
2691 return REG_ESPACE;
2692 }
2693 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2694 in the current context? */
e3a87852 2695 err = check_arrival (mctx, sub_top->path, sub_top->node,
0ce7f49c 2696 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
6291ee3c
UD
2697 if (err == REG_NOMATCH)
2698 continue;
2699 if (BE (err != REG_NOERROR, 0))
2700 return err;
2701 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2702 if (BE (sub_last == NULL, 0))
2703 return REG_ESPACE;
e3a87852 2704 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
6291ee3c
UD
2705 bkref_str_idx);
2706 if (err == REG_NOMATCH)
2707 continue;
2708 }
2709 }
2710 return REG_NOERROR;
2711}
2712
2713/* Helper functions for get_subexp(). */
2714
2715/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2716 If it can arrive, register the sub expression expressed with SUB_TOP
2717 and SUB_LAST. */
2718
2719static reg_errcode_t
e3a87852 2720get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
6291ee3c 2721 re_match_context_t *mctx;
fe9434bb 2722 const re_sub_match_top_t *sub_top;
6291ee3c
UD
2723 re_sub_match_last_t *sub_last;
2724 int bkref_node, bkref_str;
2725{
2726 reg_errcode_t err;
2727 int to_idx;
2728 /* Can the subexpression arrive the back reference? */
e3a87852 2729 err = check_arrival (mctx, &sub_last->path, sub_last->node,
0ce7f49c 2730 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
6291ee3c
UD
2731 if (err != REG_NOERROR)
2732 return err;
2733 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2734 sub_last->str_idx);
2735 if (BE (err != REG_NOERROR, 0))
2736 return err;
2737 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
c1baba0f 2738 return clean_state_log_if_needed (mctx, to_idx);
6291ee3c
UD
2739}
2740
2741/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2742 Search '(' if FL_OPEN, or search ')' otherwise.
2743 TODO: This function isn't efficient...
2744 Because there might be more than one nodes whose types are
2745 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2746 nodes.
2747 E.g. RE: (a){2} */
2748
2749static int
0ce7f49c 2750find_subexp_node (dfa, nodes, subexp_idx, type)
fe9434bb
UD
2751 const re_dfa_t *dfa;
2752 const re_node_set *nodes;
0ce7f49c 2753 int subexp_idx, type;
6291ee3c
UD
2754{
2755 int cls_idx;
2756 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2757 {
2758 int cls_node = nodes->elems[cls_idx];
fe9434bb 2759 const re_token_t *node = dfa->nodes + cls_node;
0ce7f49c 2760 if (node->type == type
6291ee3c
UD
2761 && node->opr.idx == subexp_idx)
2762 return cls_node;
2763 }
2764 return -1;
2765}
2766
2767/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2768 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2769 heavily reused.
2770 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2771
2772static reg_errcode_t
e3a87852 2773check_arrival (mctx, path, top_node, top_str, last_node, last_str,
0ce7f49c 2774 type)
6291ee3c
UD
2775 re_match_context_t *mctx;
2776 state_array_t *path;
0ce7f49c 2777 int top_node, top_str, last_node, last_str, type;
6291ee3c 2778{
e3a87852 2779 re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
2780 reg_errcode_t err;
2781 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2782 re_dfastate_t *cur_state = NULL;
2783 re_node_set *cur_nodes, next_nodes;
2784 re_dfastate_t **backup_state_log;
2785 unsigned int context;
2786
2787 subexp_num = dfa->nodes[top_node].opr.idx;
2788 /* Extend the buffer if we need. */
951d6408 2789 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
6291ee3c
UD
2790 {
2791 re_dfastate_t **new_array;
2792 int old_alloc = path->alloc;
2793 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2794 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2795 if (new_array == NULL)
951d6408
UD
2796 {
2797 path->alloc = old_alloc;
2798 return REG_ESPACE;
2799 }
6291ee3c
UD
2800 path->array = new_array;
2801 memset (new_array + old_alloc, '\0',
2802 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2803 }
2804
2805 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2806
2807 /* Temporary modify MCTX. */
2808 backup_state_log = mctx->state_log;
56b168be 2809 backup_cur_idx = mctx->input.cur_idx;
6291ee3c 2810 mctx->state_log = path->array;
56b168be 2811 mctx->input.cur_idx = str_idx;
6291ee3c
UD
2812
2813 /* Setup initial node set. */
56b168be 2814 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
6291ee3c
UD
2815 if (str_idx == top_str)
2816 {
2817 err = re_node_set_init_1 (&next_nodes, top_node);
2818 if (BE (err != REG_NOERROR, 0))
2819 return err;
0ce7f49c 2820 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
6291ee3c
UD
2821 if (BE (err != REG_NOERROR, 0))
2822 {
2823 re_node_set_free (&next_nodes);
2824 return err;
2825 }
2826 }
2827 else
2828 {
2829 cur_state = mctx->state_log[str_idx];
2830 if (cur_state && cur_state->has_backref)
2831 {
2832 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2833 if (BE ( err != REG_NOERROR, 0))
2834 return err;
2835 }
2836 else
2837 re_node_set_init_empty (&next_nodes);
2838 }
2839 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2840 {
2841 if (next_nodes.nelem)
2842 {
e3a87852 2843 err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
0ce7f49c 2844 subexp_num, type);
6291ee3c
UD
2845 if (BE ( err != REG_NOERROR, 0))
2846 {
2847 re_node_set_free (&next_nodes);
2848 return err;
2849 }
2850 }
2851 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2852 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2853 {
2854 re_node_set_free (&next_nodes);
2855 return err;
2856 }
2857 mctx->state_log[str_idx] = cur_state;
2858 }
2859
2860 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2861 {
2862 re_node_set_empty (&next_nodes);
2863 if (mctx->state_log[str_idx + 1])
2864 {
2865 err = re_node_set_merge (&next_nodes,
2866 &mctx->state_log[str_idx + 1]->nodes);
2867 if (BE (err != REG_NOERROR, 0))
2868 {
2869 re_node_set_free (&next_nodes);
2870 return err;
2871 }
2872 }
2873 if (cur_state)
2874 {
e3a87852 2875 err = check_arrival_add_next_nodes (mctx, str_idx,
46bf9de7 2876 &cur_state->nodes, &next_nodes);
6291ee3c
UD
2877 if (BE (err != REG_NOERROR, 0))
2878 {
2879 re_node_set_free (&next_nodes);
2880 return err;
2881 }
2882 }
2883 ++str_idx;
2884 if (next_nodes.nelem)
2885 {
0ce7f49c 2886 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
6291ee3c
UD
2887 if (BE (err != REG_NOERROR, 0))
2888 {
2889 re_node_set_free (&next_nodes);
2890 return err;
2891 }
e3a87852 2892 err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
0ce7f49c 2893 subexp_num, type);
6291ee3c
UD
2894 if (BE ( err != REG_NOERROR, 0))
2895 {
2896 re_node_set_free (&next_nodes);
2897 return err;
2898 }
2899 }
56b168be 2900 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
6291ee3c
UD
2901 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2902 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2903 {
2904 re_node_set_free (&next_nodes);
2905 return err;
2906 }
2907 mctx->state_log[str_idx] = cur_state;
2908 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2909 }
2910 re_node_set_free (&next_nodes);
2911 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2912 : &mctx->state_log[last_str]->nodes);
2913 path->next_idx = str_idx;
2914
2915 /* Fix MCTX. */
2916 mctx->state_log = backup_state_log;
56b168be 2917 mctx->input.cur_idx = backup_cur_idx;
6291ee3c 2918
6291ee3c 2919 /* Then check the current node set has the node LAST_NODE. */
c0d5034e
UD
2920 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2921 return REG_NOERROR;
2922
2923 return REG_NOMATCH;
6291ee3c
UD
2924}
2925
2926/* Helper functions for check_arrival. */
2927
2928/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2929 to NEXT_NODES.
2930 TODO: This function is similar to the functions transit_state*(),
2931 however this function has many additional works.
2932 Can't we unify them? */
2933
2934static reg_errcode_t
e3a87852 2935check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
6291ee3c
UD
2936 re_match_context_t *mctx;
2937 int str_idx;
2938 re_node_set *cur_nodes, *next_nodes;
2939{
e3a87852 2940 re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
2941 int cur_idx;
2942 reg_errcode_t err;
2943 re_node_set union_set;
2944 re_node_set_init_empty (&union_set);
2945 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2946 {
2947 int naccepted = 0;
2948 int cur_node = cur_nodes->elems[cur_idx];
2949 re_token_type_t type = dfa->nodes[cur_node].type;
46bf9de7 2950 if (IS_EPSILON_NODE (type))
6291ee3c
UD
2951 continue;
2952#ifdef RE_ENABLE_I18N
2953 /* If the node may accept `multi byte'. */
2954 if (ACCEPT_MB_NODE (type))
2955 {
56b168be 2956 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
6291ee3c
UD
2957 str_idx);
2958 if (naccepted > 1)
2959 {
2960 re_dfastate_t *dest_state;
2961 int next_node = dfa->nexts[cur_node];
2962 int next_idx = str_idx + naccepted;
2963 dest_state = mctx->state_log[next_idx];
2964 re_node_set_empty (&union_set);
2965 if (dest_state)
2966 {
2967 err = re_node_set_merge (&union_set, &dest_state->nodes);
2968 if (BE (err != REG_NOERROR, 0))
2969 {
2970 re_node_set_free (&union_set);
2971 return err;
2972 }
6291ee3c 2973 }
44777771
UD
2974 err = re_node_set_insert (&union_set, next_node);
2975 if (BE (err < 0, 0))
6291ee3c 2976 {
44777771
UD
2977 re_node_set_free (&union_set);
2978 return REG_ESPACE;
6291ee3c
UD
2979 }
2980 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2981 &union_set);
2982 if (BE (mctx->state_log[next_idx] == NULL
2983 && err != REG_NOERROR, 0))
2984 {
2985 re_node_set_free (&union_set);
2986 return err;
2987 }
2988 }
2989 }
2990#endif /* RE_ENABLE_I18N */
2991 if (naccepted
e3a87852 2992 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
6291ee3c
UD
2993 {
2994 err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
2995 if (BE (err < 0, 0))
2996 {
2997 re_node_set_free (&union_set);
2998 return REG_ESPACE;
2999 }
3000 }
3001 }
3002 re_node_set_free (&union_set);
3003 return REG_NOERROR;
3004}
3005
3006/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3007 CUR_NODES, however exclude the nodes which are:
3008 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3009 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3010*/
3011
3012static reg_errcode_t
0ce7f49c 3013check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
6291ee3c
UD
3014 re_dfa_t *dfa;
3015 re_node_set *cur_nodes;
0ce7f49c 3016 int ex_subexp, type;
6291ee3c
UD
3017{
3018 reg_errcode_t err;
3019 int idx, outside_node;
3020 re_node_set new_nodes;
3021#ifdef DEBUG
3022 assert (cur_nodes->nelem);
3023#endif
3024 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3025 if (BE (err != REG_NOERROR, 0))
3026 return err;
3027 /* Create a new node set NEW_NODES with the nodes which are epsilon
3028 closures of the node in CUR_NODES. */
3029
3030 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3031 {
3032 int cur_node = cur_nodes->elems[idx];
3033 re_node_set *eclosure = dfa->eclosures + cur_node;
0ce7f49c 3034 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
6291ee3c
UD
3035 if (outside_node == -1)
3036 {
3037 /* There are no problematic nodes, just merge them. */
3038 err = re_node_set_merge (&new_nodes, eclosure);
3039 if (BE (err != REG_NOERROR, 0))
3040 {
3041 re_node_set_free (&new_nodes);
3042 return err;
3043 }
3044 }
3045 else
3046 {
3047 /* There are problematic nodes, re-calculate incrementally. */
3048 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
0ce7f49c 3049 ex_subexp, type);
6291ee3c
UD
3050 if (BE (err != REG_NOERROR, 0))
3051 {
3052 re_node_set_free (&new_nodes);
3053 return err;
3054 }
3055 }
3056 }
3057 re_node_set_free (cur_nodes);
3058 *cur_nodes = new_nodes;
3059 return REG_NOERROR;
3060}
3061
3062/* Helper function for check_arrival_expand_ecl.
3063 Check incrementally the epsilon closure of TARGET, and if it isn't
3064 problematic append it to DST_NODES. */
3065
3066static reg_errcode_t
0ce7f49c 3067check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
6291ee3c 3068 re_dfa_t *dfa;
0ce7f49c 3069 int target, ex_subexp, type;
6291ee3c
UD
3070 re_node_set *dst_nodes;
3071{
0ce7f49c 3072 int cur_node;
6291ee3c
UD
3073 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3074 {
3075 int err;
6291ee3c 3076
0ce7f49c 3077 if (dfa->nodes[cur_node].type == type
6291ee3c
UD
3078 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3079 {
0ce7f49c 3080 if (type == OP_CLOSE_SUBEXP)
6291ee3c
UD
3081 {
3082 err = re_node_set_insert (dst_nodes, cur_node);
3083 if (BE (err == -1, 0))
3084 return REG_ESPACE;
3085 }
3086 break;
3087 }
3088 err = re_node_set_insert (dst_nodes, cur_node);
3089 if (BE (err == -1, 0))
3090 return REG_ESPACE;
3091 if (dfa->edests[cur_node].nelem == 0)
3092 break;
3093 if (dfa->edests[cur_node].nelem == 2)
3094 {
3095 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3096 dfa->edests[cur_node].elems[1],
0ce7f49c 3097 ex_subexp, type);
6291ee3c
UD
3098 if (BE (err != REG_NOERROR, 0))
3099 return err;
3100 }
3101 cur_node = dfa->edests[cur_node].elems[0];
3102 }
3103 return REG_NOERROR;
3104}
3105
3106
3107/* For all the back references in the current state, calculate the
3108 destination of the back references by the appropriate entry
3109 in MCTX->BKREF_ENTS. */
3110
3111static reg_errcode_t
e3a87852 3112expand_bkref_cache (mctx, cur_nodes, cur_str, last_str, subexp_num,
0ce7f49c 3113 type)
6291ee3c 3114 re_match_context_t *mctx;
0ce7f49c 3115 int cur_str, last_str, subexp_num, type;
6291ee3c
UD
3116 re_node_set *cur_nodes;
3117{
e3a87852 3118 re_dfa_t *const dfa = mctx->dfa;
6291ee3c 3119 reg_errcode_t err;
6291ee3c
UD
3120 int cache_idx, cache_idx_start;
3121 /* The current state. */
3122
3123 cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3124 for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3125 {
3126 int to_idx, next_node;
3127 struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3128 if (ent->str_idx > cur_str)
3129 break;
3130 /* Is this entry ENT is appropriate? */
3131 if (!re_node_set_contains (cur_nodes, ent->node))
3132 continue; /* No. */
3133
3134 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3135 /* Calculate the destination of the back reference, and append it
3136 to MCTX->STATE_LOG. */
3137 if (to_idx == cur_str)
3138 {
3139 /* The backreference did epsilon transit, we must re-check all the
3140 node in the current state. */
3141 re_node_set new_dests;
3142 reg_errcode_t err2, err3;
3143 next_node = dfa->edests[ent->node].elems[0];
3144 if (re_node_set_contains (cur_nodes, next_node))
3145 continue;
3146 err = re_node_set_init_1 (&new_dests, next_node);
0ce7f49c 3147 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
6291ee3c
UD
3148 err3 = re_node_set_merge (cur_nodes, &new_dests);
3149 re_node_set_free (&new_dests);
3150 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3151 || err3 != REG_NOERROR, 0))
3152 {
3153 err = (err != REG_NOERROR ? err
3154 : (err2 != REG_NOERROR ? err2 : err3));
3155 return err;
3156 }
3157 /* TODO: It is still inefficient... */
3158 cache_idx = cache_idx_start - 1;
3159 continue;
3160 }
3161 else
3162 {
3163 re_node_set union_set;
3164 next_node = dfa->nexts[ent->node];
3165 if (mctx->state_log[to_idx])
3166 {
3167 int ret;
3168 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3169 next_node))
3170 continue;
3171 err = re_node_set_init_copy (&union_set,
3172 &mctx->state_log[to_idx]->nodes);
3173 ret = re_node_set_insert (&union_set, next_node);
3174 if (BE (err != REG_NOERROR || ret < 0, 0))
3175 {
3176 re_node_set_free (&union_set);
3177 err = err != REG_NOERROR ? err : REG_ESPACE;
3178 return err;
3179 }
3180 }
3181 else
3182 {
3183 err = re_node_set_init_1 (&union_set, next_node);
3184 if (BE (err != REG_NOERROR, 0))
3185 return err;
3186 }
3187 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3188 re_node_set_free (&union_set);
3189 if (BE (mctx->state_log[to_idx] == NULL
3190 && err != REG_NOERROR, 0))
3191 return err;
3192 }
3193 }
3194 return REG_NOERROR;
3195}
3196
3197/* Build transition table for the state.
3198 Return the new table if succeeded, otherwise return NULL. */
3199
3200static re_dfastate_t **
56b168be
UD
3201build_trtable (dfa, state)
3202 re_dfa_t *dfa;
c13c99fa 3203 re_dfastate_t *state;
6291ee3c
UD
3204{
3205 reg_errcode_t err;
3ce12656
UD
3206 int i, j, ch;
3207 unsigned int elem, mask;
6291ee3c
UD
3208 int dests_node_malloced = 0, dest_states_malloced = 0;
3209 int ndests; /* Number of the destination states from `state'. */
3210 re_dfastate_t **trtable;
3211 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3212 re_node_set follows, *dests_node;
3213 bitset *dests_ch;
3214 bitset acceptable;
3b0bdc72
UD
3215
3216 /* We build DFA states which corresponds to the destination nodes
3217 from `state'. `dests_node[i]' represents the nodes which i-th
3218 destination state contains, and `dests_ch[i]' represents the
3219 characters which i-th destination state accepts. */
05dab910
RM
3220#ifdef _LIBC
3221 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3222 dests_node = (re_node_set *)
3223 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3224 else
3225#endif
3226 {
3227 dests_node = (re_node_set *)
3228 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3229 if (BE (dests_node == NULL, 0))
3230 return NULL;
3231 dests_node_malloced = 1;
3232 }
3233 dests_ch = (bitset *) (dests_node + SBC_MAX);
3b0bdc72
UD
3234
3235 /* Initialize transiton table. */
c13c99fa 3236 state->word_trtable = 0;
3b0bdc72
UD
3237
3238 /* At first, group all nodes belonging to `state' into several
3239 destinations. */
56b168be 3240 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
bc15410e 3241 if (BE (ndests <= 0, 0))
3b0bdc72 3242 {
05dab910
RM
3243 if (dests_node_malloced)
3244 free (dests_node);
a9388965 3245 /* Return NULL in case of an error, trtable otherwise. */
05dab910 3246 if (ndests == 0)
c13c99fa 3247 {
3ce12656
UD
3248 state->trtable = (re_dfastate_t **)
3249 calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3250 return state->trtable;
c13c99fa 3251 }
05dab910 3252 return NULL;
3b0bdc72
UD
3253 }
3254
a9388965 3255 err = re_node_set_alloc (&follows, ndests + 1);
05dab910
RM
3256 if (BE (err != REG_NOERROR, 0))
3257 goto out_free;
3258
3259#ifdef _LIBC
3260 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3261 + ndests * 3 * sizeof (re_dfastate_t *)))
3262 dest_states = (re_dfastate_t **)
3263 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3264 else
3265#endif
3266 {
3267 dest_states = (re_dfastate_t **)
3268 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3269 if (BE (dest_states == NULL, 0))
3270 {
3271out_free:
3272 if (dest_states_malloced)
3273 free (dest_states);
3274 re_node_set_free (&follows);
3275 for (i = 0; i < ndests; ++i)
3276 re_node_set_free (dests_node + i);
05dab910
RM
3277 if (dests_node_malloced)
3278 free (dests_node);
3279 return NULL;
3280 }
3281 dest_states_malloced = 1;
3282 }
3283 dest_states_word = dest_states + ndests;
3284 dest_states_nl = dest_states_word + ndests;
3285 bitset_empty (acceptable);
a9388965 3286
3b0bdc72
UD
3287 /* Then build the states for all destinations. */
3288 for (i = 0; i < ndests; ++i)
3289 {
3290 int next_node;
3291 re_node_set_empty (&follows);
3292 /* Merge the follows of this destination states. */
3293 for (j = 0; j < dests_node[i].nelem; ++j)
15a7d175
UD
3294 {
3295 next_node = dfa->nexts[dests_node[i].elems[j]];
3296 if (next_node != -1)
3297 {
3298 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3299 if (BE (err != REG_NOERROR, 0))
05dab910 3300 goto out_free;
15a7d175
UD
3301 }
3302 }
a9388965 3303 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
bc15410e 3304 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
15a7d175 3305 goto out_free;
3b0bdc72 3306 /* If the new state has context constraint,
15a7d175 3307 build appropriate states for these contexts. */
3b0bdc72 3308 if (dest_states[i]->has_constraint)
15a7d175
UD
3309 {
3310 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3311 CONTEXT_WORD);
3312 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3313 goto out_free;
3ce12656
UD
3314
3315 if (dest_states[i] != dest_states_word[i]
3316 && dfa->mb_cur_max > 1)
3317 state->word_trtable = 1;
3318
15a7d175
UD
3319 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3320 CONTEXT_NEWLINE);
3321 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3322 goto out_free;
3ce12656 3323 }
3b0bdc72 3324 else
15a7d175
UD
3325 {
3326 dest_states_word[i] = dest_states[i];
3327 dest_states_nl[i] = dest_states[i];
3328 }
3b0bdc72
UD
3329 bitset_merge (acceptable, dests_ch[i]);
3330 }
3331
3ce12656
UD
3332 if (!BE (state->word_trtable, 0))
3333 {
3334 /* We don't care about whether the following character is a word
3335 character, or we are in a single-byte character set so we can
3336 discern by looking at the character code: allocate a
3337 256-entry transition table. */
3338 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3339 if (BE (trtable == NULL, 0))
3340 goto out_free;
3341
3342 /* For all characters ch...: */
3343 for (i = 0; i < BITSET_UINTS; ++i)
3344 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3345 elem;
3346 mask <<= 1, elem >>= 1, ++ch)
3347 if (BE (elem & 1, 0))
3348 {
3349 /* There must be exactly one destination which accepts
3350 character ch. See group_nodes_into_DFAstates. */
3351 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3352 ;
6b6557e8 3353
3ce12656 3354 /* j-th destination accepts the word character ch. */
56b168be 3355 if (dfa->word_char[i] & mask)
3ce12656
UD
3356 trtable[ch] = dest_states_word[j];
3357 else
3358 trtable[ch] = dest_states[j];
3359 }
3360 }
3361 else
3362 {
3363 /* We care about whether the following character is a word
3364 character, and we are in a multi-byte character set: discern
3365 by looking at the character code: build two 256-entry
3366 transition tables, one starting at trtable[0] and one
3367 starting at trtable[SBC_MAX]. */
3368 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3369 2 * SBC_MAX);
3370 if (BE (trtable == NULL, 0))
3371 goto out_free;
6b6557e8 3372
3ce12656
UD
3373 /* For all characters ch...: */
3374 for (i = 0; i < BITSET_UINTS; ++i)
3375 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3376 elem;
3377 mask <<= 1, elem >>= 1, ++ch)
3378 if (BE (elem & 1, 0))
3379 {
3380 /* There must be exactly one destination which accepts
3381 character ch. See group_nodes_into_DFAstates. */
3382 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3383 ;
6b6557e8 3384
3ce12656
UD
3385 /* j-th destination accepts the word character ch. */
3386 trtable[ch] = dest_states[j];
3387 trtable[ch + SBC_MAX] = dest_states_word[j];
3388 }
3389 }
6b6557e8 3390
3b0bdc72 3391 /* new line */
c202c2c5
UD
3392 if (bitset_contain (acceptable, NEWLINE_CHAR))
3393 {
3394 /* The current state accepts newline character. */
3ce12656
UD
3395 for (j = 0; j < ndests; ++j)
3396 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
15a7d175
UD
3397 {
3398 /* k-th destination accepts newline character. */
3ce12656 3399 trtable[NEWLINE_CHAR] = dest_states_nl[j];
c13c99fa 3400 if (state->word_trtable)
3ce12656 3401 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
15a7d175
UD
3402 /* There must be only one destination which accepts
3403 newline. See group_nodes_into_DFAstates. */
3404 break;
3405 }
c202c2c5 3406 }
3b0bdc72 3407
05dab910
RM
3408 if (dest_states_malloced)
3409 free (dest_states);
3b0bdc72
UD
3410
3411 re_node_set_free (&follows);
3412 for (i = 0; i < ndests; ++i)
3413 re_node_set_free (dests_node + i);
3414
05dab910
RM
3415 if (dests_node_malloced)
3416 free (dests_node);
3b0bdc72 3417
c13c99fa 3418 state->trtable = trtable;
3b0bdc72
UD
3419 return trtable;
3420}
3421
3422/* Group all nodes belonging to STATE into several destinations.
3423 Then for all destinations, set the nodes belonging to the destination
3424 to DESTS_NODE[i] and set the characters accepted by the destination
3425 to DEST_CH[i]. This function return the number of destinations. */
3426
3427static int
56b168be
UD
3428group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3429 re_dfa_t *dfa;
3b0bdc72
UD
3430 const re_dfastate_t *state;
3431 re_node_set *dests_node;
3432 bitset *dests_ch;
3433{
a9388965 3434 reg_errcode_t err;
3b0bdc72
UD
3435 int i, j, k;
3436 int ndests; /* Number of the destinations from `state'. */
3437 bitset accepts; /* Characters a node can accept. */
3438 const re_node_set *cur_nodes = &state->nodes;
3439 bitset_empty (accepts);
3440 ndests = 0;
3441
3442 /* For all the nodes belonging to `state', */
3443 for (i = 0; i < cur_nodes->nelem; ++i)
3444 {
3b0bdc72
UD
3445 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3446 re_token_type_t type = node->type;
485d775d 3447 unsigned int constraint = node->constraint;
3b0bdc72
UD
3448
3449 /* Enumerate all single byte character this node can accept. */
3450 if (type == CHARACTER)
15a7d175 3451 bitset_set (accepts, node->opr.c);
3b0bdc72 3452 else if (type == SIMPLE_BRACKET)
15a7d175
UD
3453 {
3454 bitset_merge (accepts, node->opr.sbcset);
3455 }
3b0bdc72 3456 else if (type == OP_PERIOD)
15a7d175 3457 {
65e6becf
UD
3458#ifdef RE_ENABLE_I18N
3459 if (dfa->mb_cur_max > 1)
3460 bitset_merge (accepts, dfa->sb_char);
3461 else
0ce7f49c 3462#endif
65e6becf 3463 bitset_set_all (accepts);
56b168be 3464 if (!(dfa->syntax & RE_DOT_NEWLINE))
15a7d175 3465 bitset_clear (accepts, '\n');
56b168be 3466 if (dfa->syntax & RE_DOT_NOT_NULL)
15a7d175
UD
3467 bitset_clear (accepts, '\0');
3468 }
c0d5034e 3469#ifdef RE_ENABLE_I18N
ad7f28c2
UD
3470 else if (type == OP_UTF8_PERIOD)
3471 {
3472 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
56b168be 3473 if (!(dfa->syntax & RE_DOT_NEWLINE))
ad7f28c2 3474 bitset_clear (accepts, '\n');
56b168be 3475 if (dfa->syntax & RE_DOT_NOT_NULL)
ad7f28c2
UD
3476 bitset_clear (accepts, '\0');
3477 }
c0d5034e 3478#endif
3b0bdc72 3479 else
15a7d175 3480 continue;
3b0bdc72
UD
3481
3482 /* Check the `accepts' and sift the characters which are not
15a7d175 3483 match it the context. */
3b0bdc72 3484 if (constraint)
15a7d175 3485 {
15a7d175
UD
3486 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3487 {
3488 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3489 bitset_empty (accepts);
3490 if (accepts_newline)
3491 bitset_set (accepts, NEWLINE_CHAR);
3492 else
3493 continue;
3494 }
66b110e8
UD
3495 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3496 {
3497 bitset_empty (accepts);
3498 continue;
3499 }
c13c99fa 3500
66b110e8 3501 if (constraint & NEXT_WORD_CONSTRAINT)
65e6becf 3502 {
457beec8 3503 unsigned int any_set = 0;
1cef7b3c
UD
3504 if (type == CHARACTER && !node->word_char)
3505 {
3506 bitset_empty (accepts);
3507 continue;
3508 }
65e6becf
UD
3509#ifdef RE_ENABLE_I18N
3510 if (dfa->mb_cur_max > 1)
3511 for (j = 0; j < BITSET_UINTS; ++j)
457beec8 3512 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
65e6becf
UD
3513 else
3514#endif
3515 for (j = 0; j < BITSET_UINTS; ++j)
457beec8
UD
3516 any_set |= (accepts[j] &= dfa->word_char[j]);
3517 if (!any_set)
3518 continue;
65e6becf 3519 }
66b110e8 3520 if (constraint & NEXT_NOTWORD_CONSTRAINT)
65e6becf 3521 {
457beec8 3522 unsigned int any_set = 0;
1cef7b3c
UD
3523 if (type == CHARACTER && node->word_char)
3524 {
3525 bitset_empty (accepts);
3526 continue;
3527 }
65e6becf
UD
3528#ifdef RE_ENABLE_I18N
3529 if (dfa->mb_cur_max > 1)
3530 for (j = 0; j < BITSET_UINTS; ++j)
457beec8 3531 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
65e6becf
UD
3532 else
3533#endif
3534 for (j = 0; j < BITSET_UINTS; ++j)
457beec8
UD
3535 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3536 if (!any_set)
3537 continue;
65e6becf 3538 }
15a7d175 3539 }
3b0bdc72
UD
3540
3541 /* Then divide `accepts' into DFA states, or create a new
457beec8 3542 state. Above, we make sure that accepts is not empty. */
3b0bdc72 3543 for (j = 0; j < ndests; ++j)
15a7d175
UD
3544 {
3545 bitset intersec; /* Intersection sets, see below. */
3546 bitset remains;
3547 /* Flags, see below. */
3548 int has_intersec, not_subset, not_consumed;
3549
3550 /* Optimization, skip if this state doesn't accept the character. */
3551 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3552 continue;
3553
3554 /* Enumerate the intersection set of this state and `accepts'. */
3555 has_intersec = 0;
3556 for (k = 0; k < BITSET_UINTS; ++k)
3557 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3558 /* And skip if the intersection set is empty. */
3559 if (!has_intersec)
3560 continue;
3561
3562 /* Then check if this state is a subset of `accepts'. */
3563 not_subset = not_consumed = 0;
3564 for (k = 0; k < BITSET_UINTS; ++k)
3565 {
3566 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3567 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3568 }
3569
3570 /* If this state isn't a subset of `accepts', create a
3571 new group state, which has the `remains'. */
3572 if (not_subset)
3573 {
3574 bitset_copy (dests_ch[ndests], remains);
3575 bitset_copy (dests_ch[j], intersec);
3576 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3577 if (BE (err != REG_NOERROR, 0))
3578 goto error_return;
3579 ++ndests;
3580 }
3581
3582 /* Put the position in the current group. */
3583 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3584 if (BE (err < 0, 0))
3585 goto error_return;
3586
3587 /* If all characters are consumed, go to next node. */
3588 if (!not_consumed)
3589 break;
3590 }
3b0bdc72
UD
3591 /* Some characters remain, create a new group. */
3592 if (j == ndests)
15a7d175
UD
3593 {
3594 bitset_copy (dests_ch[ndests], accepts);
3595 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3596 if (BE (err != REG_NOERROR, 0))
3597 goto error_return;
3598 ++ndests;
3599 bitset_empty (accepts);
3600 }
3b0bdc72
UD
3601 }
3602 return ndests;
1b2c2628
UD
3603 error_return:
3604 for (j = 0; j < ndests; ++j)
3605 re_node_set_free (dests_node + j);
3606 return -1;
3b0bdc72
UD
3607}
3608
434d3784
UD
3609#ifdef RE_ENABLE_I18N
3610/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3611 Return the number of the bytes the node accepts.
3612 STR_IDX is the current index of the input string.
3613
3614 This function handles the nodes which can accept one character, or
3615 one collating element like '.', '[a-z]', opposite to the other nodes
3616 can only accept one byte. */
3b0bdc72
UD
3617
3618static int
56b168be
UD
3619check_node_accept_bytes (dfa, node_idx, input, str_idx)
3620 re_dfa_t *dfa;
3b0bdc72
UD
3621 int node_idx, str_idx;
3622 const re_string_t *input;
3623{
3b0bdc72 3624 const re_token_t *node = dfa->nodes + node_idx;
ad7f28c2 3625 int char_len, elem_len;
434d3784 3626 int i;
ad7f28c2
UD
3627
3628 if (BE (node->type == OP_UTF8_PERIOD, 0))
3629 {
3630 unsigned char c = re_string_byte_at (input, str_idx), d;
3631 if (BE (c < 0xc2, 1))
3632 return 0;
3633
3634 if (str_idx + 2 > input->len)
3635 return 0;
3636
3637 d = re_string_byte_at (input, str_idx + 1);
3638 if (c < 0xe0)
3639 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3640 else if (c < 0xf0)
3641 {
3642 char_len = 3;
3643 if (c == 0xe0 && d < 0xa0)
3644 return 0;
3645 }
3646 else if (c < 0xf8)
3647 {
3648 char_len = 4;
3649 if (c == 0xf0 && d < 0x90)
3650 return 0;
3651 }
3652 else if (c < 0xfc)
3653 {
3654 char_len = 5;
3655 if (c == 0xf8 && d < 0x88)
3656 return 0;
3657 }
3658 else if (c < 0xfe)
3659 {
3660 char_len = 6;
3661 if (c == 0xfc && d < 0x84)
3662 return 0;
3663 }
3664 else
3665 return 0;
3666
3667 if (str_idx + char_len > input->len)
3668 return 0;
3669
3670 for (i = 1; i < char_len; ++i)
3671 {
3672 d = re_string_byte_at (input, str_idx + i);
3673 if (d < 0x80 || d > 0xbf)
3674 return 0;
3675 }
3676 return char_len;
3677 }
3678
3679 char_len = re_string_char_size_at (input, str_idx);
3b0bdc72
UD
3680 if (node->type == OP_PERIOD)
3681 {
ad7f28c2
UD
3682 if (char_len <= 1)
3683 return 0;
3684 /* FIXME: I don't think this if is needed, as both '\n'
3685 and '\0' are char_len == 1. */
434d3784 3686 /* '.' accepts any one character except the following two cases. */
56b168be 3687 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
15a7d175 3688 re_string_byte_at (input, str_idx) == '\n') ||
56b168be 3689 ((dfa->syntax & RE_DOT_NOT_NULL) &&
15a7d175
UD
3690 re_string_byte_at (input, str_idx) == '\0'))
3691 return 0;
3b0bdc72
UD
3692 return char_len;
3693 }
ad7f28c2
UD
3694
3695 elem_len = re_string_elem_size_at (input, str_idx);
6c2a04a7 3696 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
ad7f28c2
UD
3697 return 0;
3698
3699 if (node->type == COMPLEX_BRACKET)
3b0bdc72
UD
3700 {
3701 const re_charset_t *cset = node->opr.mbcset;
434d3784 3702# ifdef _LIBC
85c54a32
UD
3703 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3704 + str_idx);
5f93cd52
UD
3705 int j;
3706 uint32_t nrules;
434d3784
UD
3707# endif /* _LIBC */
3708 int match_len = 0;
3709 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
15a7d175 3710 ? re_string_wchar_at (input, str_idx) : 0);
434d3784
UD
3711
3712 /* match with multibyte character? */
3713 for (i = 0; i < cset->nmbchars; ++i)
15a7d175
UD
3714 if (wc == cset->mbchars[i])
3715 {
3716 match_len = char_len;
3717 goto check_node_accept_bytes_match;
3718 }
434d3784
UD
3719 /* match with character_class? */
3720 for (i = 0; i < cset->nchar_classes; ++i)
15a7d175
UD
3721 {
3722 wctype_t wt = cset->char_classes[i];
3723 if (__iswctype (wc, wt))
3724 {
3725 match_len = char_len;
3726 goto check_node_accept_bytes_match;
3727 }
3728 }
434d3784
UD
3729
3730# ifdef _LIBC
5f93cd52 3731 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3b0bdc72 3732 if (nrules != 0)
15a7d175
UD
3733 {
3734 unsigned int in_collseq = 0;
3735 const int32_t *table, *indirect;
3736 const unsigned char *weights, *extra;
3737 const char *collseqwc;
3738 int32_t idx;
3739 /* This #include defines a local function! */
434d3784 3740# include <locale/weight.h>
3b0bdc72 3741
15a7d175
UD
3742 /* match with collating_symbol? */
3743 if (cset->ncoll_syms)
3744 extra = (const unsigned char *)
3745 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3746 for (i = 0; i < cset->ncoll_syms; ++i)
3747 {
3748 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3749 /* Compare the length of input collating element and
3750 the length of current collating element. */
3751 if (*coll_sym != elem_len)
3752 continue;
3753 /* Compare each bytes. */
3754 for (j = 0; j < *coll_sym; j++)
3755 if (pin[j] != coll_sym[1 + j])
3756 break;
3757 if (j == *coll_sym)
3758 {
3759 /* Match if every bytes is equal. */
3760 match_len = j;
3761 goto check_node_accept_bytes_match;
3762 }
3763 }
3764
3765 if (cset->nranges)
3766 {
3767 if (elem_len <= char_len)
3768 {
3769 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
25337753 3770 in_collseq = __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
3771 }
3772 else
3773 in_collseq = find_collation_sequence_value (pin, elem_len);
3774 }
3775 /* match with range expression? */
3776 for (i = 0; i < cset->nranges; ++i)
3777 if (cset->range_starts[i] <= in_collseq
3778 && in_collseq <= cset->range_ends[i])
3779 {
3780 match_len = elem_len;
3781 goto check_node_accept_bytes_match;
3782 }
3783
3784 /* match with equivalence_class? */
3785 if (cset->nequiv_classes)
3786 {
3787 const unsigned char *cp = pin;
3788 table = (const int32_t *)
3789 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3790 weights = (const unsigned char *)
3791 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3792 extra = (const unsigned char *)
3793 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3794 indirect = (const int32_t *)
3795 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3796 idx = findidx (&cp);
3797 if (idx > 0)
3798 for (i = 0; i < cset->nequiv_classes; ++i)
3799 {
3800 int32_t equiv_class_idx = cset->equiv_classes[i];
3801 size_t weight_len = weights[idx];
3802 if (weight_len == weights[equiv_class_idx])
3803 {
3804 int cnt = 0;
3805 while (cnt <= weight_len
3806 && (weights[equiv_class_idx + 1 + cnt]
3807 == weights[idx + 1 + cnt]))
3808 ++cnt;
3809 if (cnt > weight_len)
3810 {
3811 match_len = elem_len;
3812 goto check_node_accept_bytes_match;
3813 }
3814 }
3815 }
3816 }
3817 }
434d3784
UD
3818 else
3819# endif /* _LIBC */
15a7d175
UD
3820 {
3821 /* match with range expression? */
92b27c74 3822#if __GNUC__ >= 2
15a7d175 3823 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
92b27c74 3824#else
15a7d175
UD
3825 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3826 cmp_buf[2] = wc;
92b27c74 3827#endif
15a7d175
UD
3828 for (i = 0; i < cset->nranges; ++i)
3829 {
3830 cmp_buf[0] = cset->range_starts[i];
3831 cmp_buf[4] = cset->range_ends[i];
3832 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3833 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3834 {
3835 match_len = char_len;
3836 goto check_node_accept_bytes_match;
3837 }
3838 }
3839 }
434d3784
UD
3840 check_node_accept_bytes_match:
3841 if (!cset->non_match)
15a7d175 3842 return match_len;
434d3784 3843 else
15a7d175
UD
3844 {
3845 if (match_len > 0)
3846 return 0;
3847 else
3848 return (elem_len > char_len) ? elem_len : char_len;
3849 }
3b0bdc72
UD
3850 }
3851 return 0;
3852}
3853
434d3784 3854# ifdef _LIBC
3b0bdc72
UD
3855static unsigned int
3856find_collation_sequence_value (mbs, mbs_len)
c202c2c5 3857 const unsigned char *mbs;
3b0bdc72
UD
3858 size_t mbs_len;
3859{
3860 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3861 if (nrules == 0)
3862 {
3863 if (mbs_len == 1)
15a7d175
UD
3864 {
3865 /* No valid character. Match it as a single byte character. */
3866 const unsigned char *collseq = (const unsigned char *)
3867 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3868 return collseq[mbs[0]];
3869 }
3b0bdc72
UD
3870 return UINT_MAX;
3871 }
3872 else
3873 {
3874 int32_t idx;
3875 const unsigned char *extra = (const unsigned char *)
15a7d175 3876 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6c2a04a7
UD
3877 int32_t extrasize = (const unsigned char *)
3878 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3b0bdc72 3879
6c2a04a7 3880 for (idx = 0; idx < extrasize;)
15a7d175
UD
3881 {
3882 int mbs_cnt, found = 0;
3883 int32_t elem_mbs_len;
3884 /* Skip the name of collating element name. */
3885 idx = idx + extra[idx] + 1;
3886 elem_mbs_len = extra[idx++];
3887 if (mbs_len == elem_mbs_len)
3888 {
3889 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3890 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3891 break;
3892 if (mbs_cnt == elem_mbs_len)
3893 /* Found the entry. */
3894 found = 1;
3895 }
3896 /* Skip the byte sequence of the collating element. */
3897 idx += elem_mbs_len;
3898 /* Adjust for the alignment. */
3899 idx = (idx + 3) & ~3;
3900 /* Skip the collation sequence value. */
3901 idx += sizeof (uint32_t);
3902 /* Skip the wide char sequence of the collating element. */
3903 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3904 /* If we found the entry, return the sequence value. */
3905 if (found)
3906 return *(uint32_t *) (extra + idx);
3907 /* Skip the collation sequence value. */
3908 idx += sizeof (uint32_t);
3909 }
6c2a04a7 3910 return UINT_MAX;
3b0bdc72
UD
3911 }
3912}
434d3784
UD
3913# endif /* _LIBC */
3914#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3915
3916/* Check whether the node accepts the byte which is IDX-th
3917 byte of the INPUT. */
3918
3919static int
e3a87852 3920check_node_accept (mctx, node, idx)
612546c6 3921 const re_match_context_t *mctx;
e3a87852 3922 const re_token_t *node;
612546c6 3923 int idx;
3b0bdc72 3924{
e3a87852 3925 re_dfa_t *const dfa = mctx->dfa;
3b0bdc72 3926 unsigned char ch;
485d775d 3927 if (node->constraint)
3b0bdc72
UD
3928 {
3929 /* The node has constraints. Check whether the current context
15a7d175 3930 satisfies the constraints. */
56b168be
UD
3931 unsigned int context = re_string_context_at (&mctx->input, idx,
3932 mctx->eflags);
3b0bdc72 3933 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
15a7d175 3934 return 0;
3b0bdc72 3935 }
56b168be 3936 ch = re_string_byte_at (&mctx->input, idx);
ad7f28c2
UD
3937 switch (node->type)
3938 {
3939 case CHARACTER:
3940 return node->opr.c == ch;
3941 case SIMPLE_BRACKET:
3942 return bitset_contain (node->opr.sbcset, ch);
c0d5034e 3943#ifdef RE_ENABLE_I18N
ad7f28c2
UD
3944 case OP_UTF8_PERIOD:
3945 if (ch >= 0x80)
3946 return 0;
3947 /* FALLTHROUGH */
c0d5034e 3948#endif
ad7f28c2 3949 case OP_PERIOD:
56b168be
UD
3950 return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
3951 || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
ad7f28c2
UD
3952 default:
3953 return 0;
3954 }
3b0bdc72 3955}
612546c6
UD
3956
3957/* Extend the buffers, if the buffers have run out. */
3958
3959static reg_errcode_t
3960extend_buffers (mctx)
3961 re_match_context_t *mctx;
3962{
3963 reg_errcode_t ret;
56b168be 3964 re_string_t *pstr = &mctx->input;
612546c6
UD
3965
3966 /* Double the lengthes of the buffers. */
3967 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3968 if (BE (ret != REG_NOERROR, 0))
3969 return ret;
3970
3971 if (mctx->state_log != NULL)
3972 {
3973 /* And double the length of state_log. */
951d6408
UD
3974 /* XXX We have no indication of the size of this buffer. If this
3975 allocation fail we have no indication that the state_log array
3976 does not have the right size. */
3977 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3978 pstr->bufs_len + 1);
1b2c2628 3979 if (BE (new_array == NULL, 0))
15a7d175 3980 return REG_ESPACE;
1b2c2628 3981 mctx->state_log = new_array;
612546c6
UD
3982 }
3983
3984 /* Then reconstruct the buffers. */
3985 if (pstr->icase)
3986 {
3987#ifdef RE_ENABLE_I18N
3c0fb574 3988 if (pstr->mb_cur_max > 1)
bb3f4825
UD
3989 {
3990 ret = build_wcs_upper_buffer (pstr);
3991 if (BE (ret != REG_NOERROR, 0))
3992 return ret;
3993 }
612546c6
UD
3994 else
3995#endif /* RE_ENABLE_I18N */
15a7d175 3996 build_upper_buffer (pstr);
612546c6
UD
3997 }
3998 else
3999 {
4000#ifdef RE_ENABLE_I18N
3c0fb574 4001 if (pstr->mb_cur_max > 1)
15a7d175 4002 build_wcs_buffer (pstr);
612546c6
UD
4003 else
4004#endif /* RE_ENABLE_I18N */
15a7d175
UD
4005 {
4006 if (pstr->trans != NULL)
4007 re_string_translate_buffer (pstr);
15a7d175 4008 }
612546c6
UD
4009 }
4010 return REG_NOERROR;
4011}
4012
3b0bdc72
UD
4013\f
4014/* Functions for matching context. */
4015
6291ee3c
UD
4016/* Initialize MCTX. */
4017
a9388965 4018static reg_errcode_t
56b168be 4019match_ctx_init (mctx, eflags, n)
3b0bdc72 4020 re_match_context_t *mctx;
612546c6 4021 int eflags, n;
3b0bdc72
UD
4022{
4023 mctx->eflags = eflags;
612546c6 4024 mctx->match_last = -1;
3b0bdc72 4025 if (n > 0)
a9388965
UD
4026 {
4027 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
6291ee3c
UD
4028 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4029 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
15a7d175 4030 return REG_ESPACE;
a9388965 4031 }
56b168be
UD
4032 /* Already zero-ed by the caller.
4033 else
4034 mctx->bkref_ents = NULL;
4035 mctx->nbkref_ents = 0;
4036 mctx->nsub_tops = 0; */
3b0bdc72 4037 mctx->abkref_ents = n;
6291ee3c 4038 mctx->max_mb_elem_len = 1;
6291ee3c 4039 mctx->asub_tops = n;
a9388965 4040 return REG_NOERROR;
3b0bdc72
UD
4041}
4042
6291ee3c
UD
4043/* Clean the entries which depend on the current input in MCTX.
4044 This function must be invoked when the matcher changes the start index
4045 of the input, or changes the input string. */
4046
4047static void
4048match_ctx_clean (mctx)
4049 re_match_context_t *mctx;
4050{
4051 match_ctx_free_subtops (mctx);
4052 mctx->nsub_tops = 0;
4053 mctx->nbkref_ents = 0;
4054}
4055
4056/* Free all the memory associated with MCTX. */
4057
3b0bdc72
UD
4058static void
4059match_ctx_free (mctx)
4060 re_match_context_t *mctx;
4061{
6291ee3c
UD
4062 match_ctx_free_subtops (mctx);
4063 re_free (mctx->sub_tops);
3b0bdc72
UD
4064 re_free (mctx->bkref_ents);
4065}
4066
6291ee3c
UD
4067/* Free all the memory associated with MCTX->SUB_TOPS. */
4068
4069static void
4070match_ctx_free_subtops (mctx)
4071 re_match_context_t *mctx;
4072{
4073 int st_idx;
4074 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4075 {
4076 int sl_idx;
4077 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4078 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4079 {
4080 re_sub_match_last_t *last = top->lasts[sl_idx];
4081 re_free (last->path.array);
4082 re_free (last);
4083 }
4084 re_free (top->lasts);
4085 if (top->path)
4086 {
4087 re_free (top->path->array);
4088 re_free (top->path);
4089 }
4090 free (top);
4091 }
4092}
4093
4094/* Add a new backreference entry to MCTX.
4095 Note that we assume that caller never call this function with duplicate
4096 entry, and call with STR_IDX which isn't smaller than any existing entry.
4097*/
3b0bdc72 4098
a9388965 4099static reg_errcode_t
0742e48e 4100match_ctx_add_entry (mctx, node, str_idx, from, to)
6291ee3c
UD
4101 re_match_context_t *mctx;
4102 int node, str_idx, from, to;
3b0bdc72
UD
4103{
4104 if (mctx->nbkref_ents >= mctx->abkref_ents)
4105 {
1b2c2628
UD
4106 struct re_backref_cache_entry* new_entry;
4107 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
15a7d175 4108 mctx->abkref_ents * 2);
1b2c2628 4109 if (BE (new_entry == NULL, 0))
15a7d175
UD
4110 {
4111 re_free (mctx->bkref_ents);
4112 return REG_ESPACE;
4113 }
1b2c2628 4114 mctx->bkref_ents = new_entry;
3b0bdc72 4115 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
15a7d175 4116 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3b0bdc72
UD
4117 mctx->abkref_ents *= 2;
4118 }
4119 mctx->bkref_ents[mctx->nbkref_ents].node = node;
0742e48e
UD
4120 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4121 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4122 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4123 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
4124 if (mctx->max_mb_elem_len < to - from)
4125 mctx->max_mb_elem_len = to - from;
a9388965 4126 return REG_NOERROR;
3b0bdc72 4127}
0742e48e 4128
6291ee3c
UD
4129/* Search for the first entry which has the same str_idx.
4130 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4131
4132static int
4133search_cur_bkref_entry (mctx, str_idx)
4134 re_match_context_t *mctx;
4135 int str_idx;
4136{
4137 int left, right, mid;
4138 right = mctx->nbkref_ents;
4139 for (left = 0; left < right;)
4140 {
4141 mid = (left + right) / 2;
4142 if (mctx->bkref_ents[mid].str_idx < str_idx)
4143 left = mid + 1;
4144 else
4145 right = mid;
4146 }
4147 return left;
4148}
4149
0742e48e
UD
4150static void
4151match_ctx_clear_flag (mctx)
4152 re_match_context_t *mctx;
4153{
4154 int i;
4155 for (i = 0; i < mctx->nbkref_ents; ++i)
951d6408 4156 mctx->bkref_ents[i].flag = 0;
0742e48e
UD
4157}
4158
6291ee3c
UD
4159/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4160 at STR_IDX. */
4161
4162static reg_errcode_t
4163match_ctx_add_subtop (mctx, node, str_idx)
4164 re_match_context_t *mctx;
4165 int node, str_idx;
4166{
4167#ifdef DEBUG
4168 assert (mctx->sub_tops != NULL);
4169 assert (mctx->asub_tops > 0);
4170#endif
951d6408 4171 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
6291ee3c 4172 {
951d6408
UD
4173 int new_asub_tops = mctx->asub_tops * 2;
4174 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4175 re_sub_match_top_t *,
4176 new_asub_tops);
6291ee3c
UD
4177 if (BE (new_array == NULL, 0))
4178 return REG_ESPACE;
4179 mctx->sub_tops = new_array;
951d6408 4180 mctx->asub_tops = new_asub_tops;
6291ee3c
UD
4181 }
4182 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
951d6408 4183 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
6291ee3c
UD
4184 return REG_ESPACE;
4185 mctx->sub_tops[mctx->nsub_tops]->node = node;
4186 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4187 return REG_NOERROR;
4188}
4189
4190/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4191 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4192
4193static re_sub_match_last_t *
4194match_ctx_add_sublast (subtop, node, str_idx)
4195 re_sub_match_top_t *subtop;
4196 int node, str_idx;
4197{
4198 re_sub_match_last_t *new_entry;
951d6408 4199 if (BE (subtop->nlasts == subtop->alasts, 0))
6291ee3c 4200 {
951d6408
UD
4201 int new_alasts = 2 * subtop->alasts + 1;
4202 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4203 re_sub_match_last_t *,
4204 new_alasts);
6291ee3c
UD
4205 if (BE (new_array == NULL, 0))
4206 return NULL;
4207 subtop->lasts = new_array;
951d6408 4208 subtop->alasts = new_alasts;
6291ee3c
UD
4209 }
4210 new_entry = calloc (1, sizeof (re_sub_match_last_t));
951d6408
UD
4211 if (BE (new_entry != NULL, 1))
4212 {
4213 subtop->lasts[subtop->nlasts] = new_entry;
4214 new_entry->node = node;
4215 new_entry->str_idx = str_idx;
4216 ++subtop->nlasts;
4217 }
6291ee3c
UD
4218 return new_entry;
4219}
4220
0742e48e
UD
4221static void
4222sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
15a7d175 4223 check_subexp)
0742e48e
UD
4224 re_sift_context_t *sctx;
4225 re_dfastate_t **sifted_sts, **limited_sts;
4226 int last_node, last_str_idx, check_subexp;
4227{
4228 sctx->sifted_states = sifted_sts;
4229 sctx->limited_states = limited_sts;
4230 sctx->last_node = last_node;
4231 sctx->last_str_idx = last_str_idx;
4232 sctx->check_subexp = check_subexp;
485d775d
UD
4233 sctx->cur_bkref = -1;
4234 sctx->cls_subexp_idx = -1;
0742e48e
UD
4235 re_node_set_init_empty (&sctx->limits);
4236}