]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regexec.c
tst-gnuglob64: New test for glob64 based on tst-gnuglob
[thirdparty/glibc.git] / posix / regexec.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
bfff8b1b 2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
3b0bdc72 19
e054f494
RA
20#include <stdint.h>
21
a9388965 22static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
b41bd5bc
FW
23 int n);
24static void match_ctx_clean (re_match_context_t *mctx);
25static void match_ctx_free (re_match_context_t *cache);
a9388965 26static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
b41bd5bc
FW
27 int str_idx, int from, int to);
28static int search_cur_bkref_entry (const re_match_context_t *mctx,
29 int str_idx);
6291ee3c 30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
b41bd5bc 31 int str_idx);
6291ee3c 32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
b41bd5bc 33 int node, int str_idx);
0742e48e 34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
15a7d175 35 re_dfastate_t **limited_sts, int last_node,
b41bd5bc 36 int last_str_idx);
a9388965 37static reg_errcode_t re_search_internal (const regex_t *preg,
15a7d175
UD
38 const char *string, int length,
39 int start, int range, int stop,
40 size_t nmatch, regmatch_t pmatch[],
b41bd5bc 41 int eflags);
ac3d553b 42static int re_search_2_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
43 const char *string1, int length1,
44 const char *string2, int length2,
45 int start, int range, struct re_registers *regs,
b41bd5bc 46 int stop, int ret_len);
ac3d553b 47static int re_search_stub (struct re_pattern_buffer *bufp,
15a7d175
UD
48 const char *string, int length, int start,
49 int range, int stop, struct re_registers *regs,
b41bd5bc 50 int ret_len);
ac3d553b 51static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
b41bd5bc
FW
52 int nregs, int regs_allocated);
53static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
15bad1a5 54static int check_matching (re_match_context_t *mctx, int fl_longest_match,
b41bd5bc 55 int *p_match_first);
e3a87852 56static int check_halt_state_context (const re_match_context_t *mctx,
b41bd5bc 57 const re_dfastate_t *state, int idx);
76b864c8 58static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
6b6557e8 59 regmatch_t *prev_idx_match, int cur_node,
b41bd5bc 60 int cur_idx, int nmatch);
1b2c2628 61static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
5cf1ec52 62 int str_idx, int dest_node, int nregs,
15a7d175 63 regmatch_t *regs,
b41bd5bc 64 re_node_set *eps_via_nodes);
612546c6 65static reg_errcode_t set_regs (const regex_t *preg,
15a7d175
UD
66 const re_match_context_t *mctx,
67 size_t nmatch, regmatch_t *pmatch,
b41bd5bc
FW
68 int fl_backtrack);
69static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
485d775d 70
434d3784 71#ifdef RE_ENABLE_I18N
e3a87852 72static int sift_states_iter_mb (const re_match_context_t *mctx,
15a7d175 73 re_sift_context_t *sctx,
b41bd5bc 74 int node_idx, int str_idx, int max_str_idx);
434d3784 75#endif /* RE_ENABLE_I18N */
76b864c8 76static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
b41bd5bc 77 re_sift_context_t *sctx);
76b864c8 78static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
e40a38b3 79 re_sift_context_t *sctx, int str_idx,
b41bd5bc 80 re_node_set *cur_dest);
76b864c8 81static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
15a7d175
UD
82 re_sift_context_t *sctx,
83 int str_idx,
b41bd5bc 84 re_node_set *dest_nodes);
76b864c8 85static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
15a7d175 86 re_node_set *dest_nodes,
b41bd5bc 87 const re_node_set *candidates);
01ed6ceb
UD
88static int check_dst_limits (const re_match_context_t *mctx,
89 re_node_set *limits,
e3a87852 90 int dst_node, int dst_idx, int src_node,
b41bd5bc 91 int src_idx);
01ed6ceb 92static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
e40a38b3 93 int boundaries, int subexp_idx,
b41bd5bc 94 int from_node, int bkref_idx);
01ed6ceb 95static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
e40a38b3
UD
96 int limit, int subexp_idx,
97 int node, int str_idx,
b41bd5bc 98 int bkref_idx);
76b864c8 99static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
15a7d175
UD
100 re_node_set *dest_nodes,
101 const re_node_set *candidates,
102 re_node_set *limits,
103 struct re_backref_cache_entry *bkref_ents,
b41bd5bc 104 int str_idx);
76b864c8 105static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
15a7d175 106 re_sift_context_t *sctx,
b41bd5bc
FW
107 int str_idx,
108 const re_node_set *candidates);
76b864c8
UD
109static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
110 re_dfastate_t **dst,
b41bd5bc 111 re_dfastate_t **src, int num);
4c595adb 112static re_dfastate_t *find_recover_state (reg_errcode_t *err,
b41bd5bc 113 re_match_context_t *mctx);
e3a87852 114static re_dfastate_t *transit_state (reg_errcode_t *err,
15a7d175 115 re_match_context_t *mctx,
b41bd5bc 116 re_dfastate_t *state);
4c595adb
UD
117static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
118 re_match_context_t *mctx,
b41bd5bc 119 re_dfastate_t *next_state);
e3a87852 120static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
6291ee3c 121 re_node_set *cur_nodes,
b41bd5bc 122 int str_idx);
c13c99fa 123#if 0
e3a87852
UD
124static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
125 re_match_context_t *mctx,
b41bd5bc 126 re_dfastate_t *pstate);
c13c99fa 127#endif
434d3784 128#ifdef RE_ENABLE_I18N
e3a87852 129static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
b41bd5bc 130 re_dfastate_t *pstate);
434d3784 131#endif /* RE_ENABLE_I18N */
e3a87852 132static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
b41bd5bc 133 const re_node_set *nodes);
e3a87852 134static reg_errcode_t get_subexp (re_match_context_t *mctx,
b41bd5bc 135 int bkref_node, int bkref_str_idx);
e3a87852 136static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
fe9434bb 137 const re_sub_match_top_t *sub_top,
6291ee3c 138 re_sub_match_last_t *sub_last,
b41bd5bc 139 int bkref_node, int bkref_str);
fe9434bb 140static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
b41bd5bc 141 int subexp_idx, int type);
e3a87852 142static reg_errcode_t check_arrival (re_match_context_t *mctx,
6291ee3c
UD
143 state_array_t *path, int top_node,
144 int top_str, int last_node, int last_str,
b41bd5bc 145 int type);
e3a87852 146static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
6291ee3c
UD
147 int str_idx,
148 re_node_set *cur_nodes,
b41bd5bc 149 re_node_set *next_nodes);
6efbd82c 150static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
6291ee3c 151 re_node_set *cur_nodes,
b41bd5bc 152 int ex_subexp, int type);
6efbd82c 153static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
6291ee3c
UD
154 re_node_set *dst_nodes,
155 int target, int ex_subexp,
b41bd5bc 156 int type);
e3a87852 157static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
6291ee3c 158 re_node_set *cur_nodes, int cur_str,
b41bd5bc
FW
159 int subexp_num, int type);
160static int build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
434d3784 161#ifdef RE_ENABLE_I18N
c42b4152 162static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
b41bd5bc 163 const re_string_t *input, int idx);
434d3784 164# ifdef _LIBC
c202c2c5 165static unsigned int find_collation_sequence_value (const unsigned char *mbs,
b41bd5bc 166 size_t name_len);
434d3784
UD
167# endif /* _LIBC */
168#endif /* RE_ENABLE_I18N */
01ed6ceb 169static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
15a7d175
UD
170 const re_dfastate_t *state,
171 re_node_set *states_node,
b41bd5bc 172 bitset_t *states_ch);
e3a87852 173static int check_node_accept (const re_match_context_t *mctx,
b41bd5bc
FW
174 const re_token_t *node, int idx);
175static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
3b0bdc72
UD
176\f
177/* Entry point for POSIX code. */
178
179/* regexec searches for a given pattern, specified by PREG, in the
180 string STRING.
181
182 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
183 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
184 least NMATCH elements, and we set them to the offsets of the
185 corresponding matched substrings.
186
187 EFLAGS specifies `execution flags' which affect matching: if
188 REG_NOTBOL is set, then ^ does not match at the beginning of the
189 string; if REG_NOTEOL is set, then $ does not match at the end.
190
191 We return 0 if we find a match and REG_NOMATCH if not. */
192
193int
9dd346ff
JM
194regexec (const regex_t *__restrict preg, const char *__restrict string,
195 size_t nmatch, regmatch_t pmatch[], int eflags)
3b0bdc72 196{
a9388965 197 reg_errcode_t err;
6fefb4e0 198 int start, length;
76b864c8 199 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3f2fb223
UD
200
201 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
202 return REG_BADPAT;
203
6fefb4e0
UD
204 if (eflags & REG_STARTEND)
205 {
206 start = pmatch[0].rm_so;
207 length = pmatch[0].rm_eo;
208 }
209 else
210 {
211 start = 0;
212 length = strlen (string);
213 }
7b918993
UD
214
215 __libc_lock_lock (dfa->lock);
3b0bdc72 216 if (preg->no_sub)
6fefb4e0
UD
217 err = re_search_internal (preg, string, length, start, length - start,
218 length, 0, NULL, eflags);
3b0bdc72 219 else
6fefb4e0
UD
220 err = re_search_internal (preg, string, length, start, length - start,
221 length, nmatch, pmatch, eflags);
7b918993 222 __libc_lock_unlock (dfa->lock);
a9388965 223 return err != REG_NOERROR;
3b0bdc72 224}
3f2fb223 225
3b0bdc72 226#ifdef _LIBC
b68f8620
L
227libc_hidden_def (__regexec)
228
3f2fb223
UD
229# include <shlib-compat.h>
230versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
231
232# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
233__typeof__ (__regexec) __compat_regexec;
234
235int
78678039 236attribute_compat_text_section
3f2fb223
UD
237__compat_regexec (const regex_t *__restrict preg,
238 const char *__restrict string, size_t nmatch,
239 regmatch_t pmatch[], int eflags)
240{
241 return regexec (preg, string, nmatch, pmatch,
242 eflags & (REG_NOTBOL | REG_NOTEOL));
243}
244compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
245# endif
3b0bdc72
UD
246#endif
247
248/* Entry points for GNU code. */
249
ac3d553b
UD
250/* re_match, re_search, re_match_2, re_search_2
251
252 The former two functions operate on STRING with length LENGTH,
253 while the later two operate on concatenation of STRING1 and STRING2
254 with lengths LENGTH1 and LENGTH2, respectively.
255
256 re_match() matches the compiled pattern in BUFP against the string,
257 starting at index START.
258
259 re_search() first tries matching at index START, then it tries to match
260 starting from index START + 1, and so on. The last start position tried
261 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
262 way as re_match().)
263
264 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
265 the first STOP characters of the concatenation of the strings should be
266 concerned.
267
268 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
269 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
270 computed relative to the concatenation, not relative to the individual
271 strings.)
272
273 On success, re_match* functions return the length of the match, re_search*
274 return the position of the start of the match. Return value -1 means no
275 match was found and -2 indicates an internal error. */
3b0bdc72
UD
276
277int
9dd346ff
JM
278re_match (struct re_pattern_buffer *bufp, const char *string, int length,
279 int start, struct re_registers *regs)
3b0bdc72 280{
ac3d553b 281 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
3b0bdc72
UD
282}
283#ifdef _LIBC
284weak_alias (__re_match, re_match)
285#endif
286
ac3d553b 287int
9dd346ff
JM
288re_search (struct re_pattern_buffer *bufp, const char *string, int length,
289 int start, int range, struct re_registers *regs)
ac3d553b
UD
290{
291 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
292}
293#ifdef _LIBC
294weak_alias (__re_search, re_search)
295#endif
3b0bdc72
UD
296
297int
9dd346ff
JM
298re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
299 const char *string2, int length2, int start,
300 struct re_registers *regs, int stop)
3b0bdc72 301{
ac3d553b 302 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 303 start, 0, regs, stop, 1);
3b0bdc72
UD
304}
305#ifdef _LIBC
306weak_alias (__re_match_2, re_match_2)
307#endif
308
3b0bdc72 309int
9dd346ff
JM
310re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
311 const char *string2, int length2, int start, int range,
312 struct re_registers *regs, int stop)
ac3d553b
UD
313{
314 return re_search_2_stub (bufp, string1, length1, string2, length2,
15a7d175 315 start, range, regs, stop, 0);
ac3d553b
UD
316}
317#ifdef _LIBC
318weak_alias (__re_search_2, re_search_2)
319#endif
320
321static int
85231522
JM
322re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
323 int length1, const char *string2, int length2, int start,
324 int range, struct re_registers *regs,
325 int stop, int ret_len)
ac3d553b
UD
326{
327 const char *str;
328 int rval;
329 int len = length1 + length2;
d044d844 330 char *s = NULL;
ac3d553b 331
42a2c9b5 332 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
ac3d553b
UD
333 return -2;
334
335 /* Concatenate the strings. */
336 if (length2 > 0)
337 if (length1 > 0)
338 {
d044d844 339 s = re_malloc (char, len);
15a7d175
UD
340
341 if (BE (s == NULL, 0))
342 return -2;
c42b4152
UD
343#ifdef _LIBC
344 memcpy (__mempcpy (s, string1, length1), string2, length2);
345#else
15a7d175
UD
346 memcpy (s, string1, length1);
347 memcpy (s + length1, string2, length2);
c42b4152 348#endif
15a7d175 349 str = s;
ac3d553b
UD
350 }
351 else
352 str = string2;
353 else
354 str = string1;
355
d044d844
PE
356 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
357 re_free (s);
ac3d553b
UD
358 return rval;
359}
360
361/* The parameters have the same meaning as those of re_search.
362 Additional parameters:
363 If RET_LEN is nonzero the length of the match is returned (re_match style);
364 otherwise the position of the match is returned. */
365
366static int
9dd346ff
JM
367re_search_stub (struct re_pattern_buffer *bufp, const char *string, int length,
368 int start, int range, int stop, struct re_registers *regs,
369 int ret_len)
3b0bdc72 370{
a9388965 371 reg_errcode_t result;
3b0bdc72 372 regmatch_t *pmatch;
ac3d553b
UD
373 int nregs, rval;
374 int eflags = 0;
76b864c8 375 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
ac3d553b
UD
376
377 /* Check for out-of-range. */
a877402c 378 if (BE (start < 0 || start > length, 0))
ac3d553b
UD
379 return -1;
380 if (BE (start + range > length, 0))
381 range = length - start;
a877402c
UD
382 else if (BE (start + range < 0, 0))
383 range = -start;
3b0bdc72 384
7b918993
UD
385 __libc_lock_lock (dfa->lock);
386
3b0bdc72
UD
387 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
388 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
389
ac3d553b
UD
390 /* Compile fastmap if we haven't yet. */
391 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
392 re_compile_fastmap (bufp);
393
394 if (BE (bufp->no_sub, 0))
395 regs = NULL;
3b0bdc72
UD
396
397 /* We need at least 1 register. */
ac3d553b
UD
398 if (regs == NULL)
399 nregs = 1;
400 else if (BE (bufp->regs_allocated == REGS_FIXED &&
15a7d175 401 regs->num_regs < bufp->re_nsub + 1, 0))
ac3d553b
UD
402 {
403 nregs = regs->num_regs;
404 if (BE (nregs < 1, 0))
15a7d175
UD
405 {
406 /* Nothing can be copied to regs. */
407 regs = NULL;
408 nregs = 1;
409 }
ac3d553b
UD
410 }
411 else
412 nregs = bufp->re_nsub + 1;
3b0bdc72 413 pmatch = re_malloc (regmatch_t, nregs);
bc15410e 414 if (BE (pmatch == NULL, 0))
7b918993
UD
415 {
416 rval = -2;
417 goto out;
418 }
3b0bdc72 419
ac3d553b 420 result = re_search_internal (bufp, string, length, start, range, stop,
15a7d175 421 nregs, pmatch, eflags);
3b0bdc72 422
ac3d553b
UD
423 rval = 0;
424
425 /* I hope we needn't fill ther regs with -1's when no match was found. */
426 if (result != REG_NOERROR)
427 rval = -1;
428 else if (regs != NULL)
3b0bdc72 429 {
ac3d553b
UD
430 /* If caller wants register contents data back, copy them. */
431 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
15a7d175 432 bufp->regs_allocated);
ac3d553b 433 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
15a7d175 434 rval = -2;
3b0bdc72
UD
435 }
436
ac3d553b 437 if (BE (rval == 0, 1))
3b0bdc72 438 {
ac3d553b 439 if (ret_len)
15a7d175
UD
440 {
441 assert (pmatch[0].rm_so == start);
442 rval = pmatch[0].rm_eo - start;
443 }
ac3d553b 444 else
15a7d175 445 rval = pmatch[0].rm_so;
3b0bdc72 446 }
3b0bdc72 447 re_free (pmatch);
7b918993
UD
448 out:
449 __libc_lock_unlock (dfa->lock);
3b0bdc72
UD
450 return rval;
451}
3b0bdc72 452
ac3d553b 453static unsigned
9dd346ff
JM
454re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
455 int regs_allocated)
3b0bdc72 456{
ac3d553b
UD
457 int rval = REGS_REALLOCATE;
458 int i;
459 int need_regs = nregs + 1;
460 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
461 uses. */
462
463 /* Have the register data arrays been allocated? */
464 if (regs_allocated == REGS_UNALLOCATED)
44777771 465 { /* No. So allocate them with malloc. */
a96c63ed 466 regs->start = re_malloc (regoff_t, need_regs);
74bc9f14 467 if (BE (regs->start == NULL, 0))
15a7d175 468 return REGS_UNALLOCATED;
74bc9f14
PE
469 regs->end = re_malloc (regoff_t, need_regs);
470 if (BE (regs->end == NULL, 0))
471 {
472 re_free (regs->start);
473 return REGS_UNALLOCATED;
474 }
ac3d553b
UD
475 regs->num_regs = need_regs;
476 }
477 else if (regs_allocated == REGS_REALLOCATE)
478 { /* Yes. If we need more elements than were already
15a7d175
UD
479 allocated, reallocate them. If we need fewer, just
480 leave it alone. */
951d6408 481 if (BE (need_regs > regs->num_regs, 0))
15a7d175 482 {
44777771 483 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
74bc9f14
PE
484 regoff_t *new_end;
485 if (BE (new_start == NULL, 0))
a96c63ed 486 return REGS_UNALLOCATED;
74bc9f14
PE
487 new_end = re_realloc (regs->end, regoff_t, need_regs);
488 if (BE (new_end == NULL, 0))
489 {
490 re_free (new_start);
491 return REGS_UNALLOCATED;
492 }
44777771
UD
493 regs->start = new_start;
494 regs->end = new_end;
15a7d175
UD
495 regs->num_regs = need_regs;
496 }
ac3d553b
UD
497 }
498 else
499 {
500 assert (regs_allocated == REGS_FIXED);
501 /* This function may not be called with REGS_FIXED and nregs too big. */
502 assert (regs->num_regs >= nregs);
503 rval = REGS_FIXED;
504 }
505
506 /* Copy the regs. */
507 for (i = 0; i < nregs; ++i)
508 {
509 regs->start[i] = pmatch[i].rm_so;
510 regs->end[i] = pmatch[i].rm_eo;
511 }
512 for ( ; i < regs->num_regs; ++i)
513 regs->start[i] = regs->end[i] = -1;
514
515 return rval;
3b0bdc72 516}
3b0bdc72
UD
517
518/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
519 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
520 this memory for recording register information. STARTS and ENDS
521 must be allocated using the malloc library routine, and must each
522 be at least NUM_REGS * sizeof (regoff_t) bytes long.
523
524 If NUM_REGS == 0, then subsequent matches should allocate their own
525 register data.
526
527 Unless this function is called, the first search or match using
528 PATTERN_BUFFER will allocate its own register data, without
529 freeing the old data. */
530
531void
9dd346ff
JM
532re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
533 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3b0bdc72
UD
534{
535 if (num_regs)
536 {
537 bufp->regs_allocated = REGS_REALLOCATE;
538 regs->num_regs = num_regs;
539 regs->start = starts;
540 regs->end = ends;
541 }
542 else
543 {
544 bufp->regs_allocated = REGS_UNALLOCATED;
545 regs->num_regs = 0;
546 regs->start = regs->end = (regoff_t *) 0;
547 }
548}
549#ifdef _LIBC
550weak_alias (__re_set_registers, re_set_registers)
551#endif
552\f
553/* Entry points compatible with 4.2 BSD regex library. We don't define
554 them unless specifically requested. */
555
556#if defined _REGEX_RE_COMP || defined _LIBC
557int
558# ifdef _LIBC
559weak_function
560# endif
80d9be81 561re_exec (const char *s)
3b0bdc72
UD
562{
563 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
564}
565#endif /* _REGEX_RE_COMP */
566\f
3b0bdc72
UD
567/* Internal entry point. */
568
569/* Searches for a compiled pattern PREG in the string STRING, whose
570 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
571 mingings with regexec. START, and RANGE have the same meanings
572 with re_search.
a9388965
UD
573 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
574 otherwise return the error code.
3b0bdc72
UD
575 Note: We assume front end functions already check ranges.
576 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
577
a9388965 578static reg_errcode_t
b41bd5bc 579__attribute_warn_unused_result__
85231522
JM
580re_search_internal (const regex_t *preg, const char *string, int length,
581 int start, int range, int stop, size_t nmatch,
582 regmatch_t pmatch[], int eflags)
3b0bdc72 583{
a9388965 584 reg_errcode_t err;
76b864c8 585 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
612546c6 586 int left_lim, right_lim, incr;
bb677c95 587 int fl_longest_match, match_first, match_kind, match_last = -1;
963d8d78 588 int extra_nmatch;
bb677c95 589 int sb, ch;
e3a87852
UD
590#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
591 re_match_context_t mctx = { .dfa = dfa };
592#else
3b0bdc72 593 re_match_context_t mctx;
e3a87852 594#endif
bb677c95
UD
595 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
596 && range && !preg->can_be_null) ? preg->fastmap : NULL;
997470b3 597 RE_TRANSLATE_TYPE t = preg->translate;
3b0bdc72 598
e3a87852
UD
599#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
600 memset (&mctx, '\0', sizeof (re_match_context_t));
601 mctx.dfa = dfa;
602#endif
603
963d8d78
UD
604 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
605 nmatch -= extra_nmatch;
606
3b0bdc72 607 /* Check if the DFA haven't been compiled. */
bc15410e 608 if (BE (preg->used == 0 || dfa->init_state == NULL
15a7d175
UD
609 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
610 || dfa->init_state_begbuf == NULL, 0))
a9388965 611 return REG_NOMATCH;
3b0bdc72 612
c70f81dd
UD
613#ifdef DEBUG
614 /* We assume front-end functions already check them. */
615 assert (start + range >= 0 && start + range <= length);
616#endif
617
618 /* If initial states with non-begbuf contexts have no elements,
619 the regex must be anchored. If preg->newline_anchor is set,
620 we'll never use init_state_nl, so do not check it. */
621 if (dfa->init_state->nodes.nelem == 0
622 && dfa->init_state_word->nodes.nelem == 0
623 && (dfa->init_state_nl->nodes.nelem == 0
624 || !preg->newline_anchor))
625 {
626 if (start != 0 && start + range != 0)
2da42bc0 627 return REG_NOMATCH;
c70f81dd
UD
628 start = range = 0;
629 }
630
3b0bdc72 631 /* We must check the longest matching, if nmatch > 0. */
6291ee3c 632 fl_longest_match = (nmatch != 0 || dfa->nbackref);
3b0bdc72 633
56b168be 634 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
f0c7c524 635 preg->translate, preg->syntax & RE_ICASE, dfa);
612546c6 636 if (BE (err != REG_NOERROR, 0))
1b2c2628 637 goto free_return;
56b168be
UD
638 mctx.input.stop = stop;
639 mctx.input.raw_stop = stop;
640 mctx.input.newline_anchor = preg->newline_anchor;
612546c6 641
56b168be 642 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
612546c6 643 if (BE (err != REG_NOERROR, 0))
1b2c2628 644 goto free_return;
612546c6 645
3b0bdc72
UD
646 /* We will log all the DFA states through which the dfa pass,
647 if nmatch > 1, or this dfa has "multibyte node", which is a
648 back-reference or a node which can accept multibyte character or
649 multi character collating element. */
650 if (nmatch > 1 || dfa->has_mb_node)
a9388965 651 {
eadc09f2
PE
652 /* Avoid overflow. */
653 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
654 {
655 err = REG_ESPACE;
656 goto free_return;
657 }
658
56b168be 659 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
612546c6 660 if (BE (mctx.state_log == NULL, 0))
15a7d175
UD
661 {
662 err = REG_ESPACE;
663 goto free_return;
664 }
a9388965 665 }
3b0bdc72 666 else
612546c6 667 mctx.state_log = NULL;
3b0bdc72 668
612546c6 669 match_first = start;
56b168be
UD
670 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
671 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
612546c6 672
3b0bdc72 673 /* Check incrementally whether of not the input string match. */
612546c6
UD
674 incr = (range < 0) ? -1 : 1;
675 left_lim = (range < 0) ? start + range : start;
676 right_lim = (range < 0) ? start : start + range;
3c0fb574 677 sb = dfa->mb_cur_max == 1;
bb677c95
UD
678 match_kind =
679 (fastmap
680 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
681 | (range >= 0 ? 2 : 0)
682 | (t != NULL ? 1 : 0))
683 : 8);
684
685 for (;; match_first += incr)
3b0bdc72 686 {
bb677c95
UD
687 err = REG_NOMATCH;
688 if (match_first < left_lim || right_lim < match_first)
689 goto free_return;
690
691 /* Advance as rapidly as possible through the string, until we
692 find a plausible place to start matching. This may be done
693 with varying efficiency, so there are various possibilities:
694 only the most common of them are specialized, in order to
695 save on code size. We use a switch statement for speed. */
696 switch (match_kind)
1b2c2628 697 {
bb677c95
UD
698 case 8:
699 /* No fastmap. */
700 break;
701
702 case 7:
703 /* Fastmap with single-byte translation, match forward. */
704 while (BE (match_first < right_lim, 1)
705 && !fastmap[t[(unsigned char) string[match_first]]])
706 ++match_first;
707 goto forward_match_found_start_or_reached_end;
708
709 case 6:
710 /* Fastmap without translation, match forward. */
711 while (BE (match_first < right_lim, 1)
712 && !fastmap[(unsigned char) string[match_first]])
713 ++match_first;
714
715 forward_match_found_start_or_reached_end:
716 if (BE (match_first == right_lim, 0))
1b2c2628 717 {
bb677c95
UD
718 ch = match_first >= length
719 ? 0 : (unsigned char) string[match_first];
720 if (!fastmap[t ? t[ch] : ch])
721 goto free_return;
1b2c2628 722 }
bb677c95
UD
723 break;
724
725 case 4:
726 case 5:
727 /* Fastmap without multi-byte translation, match backwards. */
728 while (match_first >= left_lim)
1b2c2628 729 {
bb677c95
UD
730 ch = match_first >= length
731 ? 0 : (unsigned char) string[match_first];
732 if (fastmap[t ? t[ch] : ch])
733 break;
734 --match_first;
735 }
736 if (match_first < left_lim)
737 goto free_return;
738 break;
1b2c2628 739
bb677c95
UD
740 default:
741 /* In this case, we can't determine easily the current byte,
742 since it might be a component byte of a multibyte
743 character. Then we use the constructed buffer instead. */
744 for (;;)
745 {
746 /* If MATCH_FIRST is out of the valid range, reconstruct the
747 buffers. */
748 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
749 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
1b2c2628 750 {
bb677c95
UD
751 err = re_string_reconstruct (&mctx.input, match_first,
752 eflags);
753 if (BE (err != REG_NOERROR, 0))
754 goto free_return;
755
756 offset = match_first - mctx.input.raw_mbs_idx;
1b2c2628 757 }
bb677c95
UD
758 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
759 Note that MATCH_FIRST must not be smaller than 0. */
760 ch = (match_first >= length
761 ? 0 : re_string_byte_at (&mctx.input, offset));
762 if (fastmap[ch])
1b2c2628 763 break;
bb677c95
UD
764 match_first += incr;
765 if (match_first < left_lim || match_first > right_lim)
2da42bc0
UD
766 {
767 err = REG_NOMATCH;
768 goto free_return;
769 }
1b2c2628 770 }
bb677c95 771 break;
1b2c2628 772 }
612546c6 773
1b2c2628 774 /* Reconstruct the buffers so that the matcher can assume that
fe9434bb 775 the matching starts from the beginning of the buffer. */
56b168be 776 err = re_string_reconstruct (&mctx.input, match_first, eflags);
1b2c2628
UD
777 if (BE (err != REG_NOERROR, 0))
778 goto free_return;
bb677c95 779
3b0bdc72 780#ifdef RE_ENABLE_I18N
bb677c95
UD
781 /* Don't consider this char as a possible match start if it part,
782 yet isn't the head, of a multibyte character. */
783 if (!sb && !re_string_first_byte (&mctx.input, 0))
784 continue;
3b0bdc72 785#endif
bb677c95
UD
786
787 /* It seems to be appropriate one, then use the matcher. */
788 /* We assume that the matching starts from 0. */
789 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
790 match_last = check_matching (&mctx, fl_longest_match,
791 range >= 0 ? &match_first : NULL);
792 if (match_last != -1)
1b2c2628 793 {
bb677c95 794 if (BE (match_last == -2, 0))
1b2c2628 795 {
bb677c95
UD
796 err = REG_ESPACE;
797 goto free_return;
798 }
799 else
800 {
801 mctx.match_last = match_last;
802 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
1b2c2628 803 {
bb677c95
UD
804 re_dfastate_t *pstate = mctx.state_log[match_last];
805 mctx.last_node = check_halt_state_context (&mctx, pstate,
806 match_last);
1b2c2628 807 }
bb677c95
UD
808 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
809 || dfa->nbackref)
6291ee3c 810 {
bb677c95
UD
811 err = prune_impossible_nodes (&mctx);
812 if (err == REG_NOERROR)
813 break;
814 if (BE (err != REG_NOMATCH, 0))
815 goto free_return;
816 match_last = -1;
6291ee3c 817 }
bb677c95
UD
818 else
819 break; /* We found a match. */
1b2c2628
UD
820 }
821 }
bb677c95
UD
822
823 match_ctx_clean (&mctx);
3b0bdc72
UD
824 }
825
bb677c95
UD
826#ifdef DEBUG
827 assert (match_last != -1);
828 assert (err == REG_NOERROR);
829#endif
830
3b0bdc72 831 /* Set pmatch[] if we need. */
bb677c95 832 if (nmatch > 0)
3b0bdc72
UD
833 {
834 int reg_idx;
835
836 /* Initialize registers. */
97fd3a30 837 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
15a7d175 838 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
3b0bdc72
UD
839
840 /* Set the points where matching start/end. */
612546c6 841 pmatch[0].rm_so = 0;
6291ee3c 842 pmatch[0].rm_eo = mctx.match_last;
3b0bdc72
UD
843
844 if (!preg->no_sub && nmatch > 1)
15a7d175 845 {
15a7d175
UD
846 err = set_regs (preg, &mctx, nmatch, pmatch,
847 dfa->has_plural_match && dfa->nbackref > 0);
848 if (BE (err != REG_NOERROR, 0))
849 goto free_return;
850 }
612546c6
UD
851
852 /* At last, add the offset to the each registers, since we slided
97fd3a30
UD
853 the buffers so that we could assume that the matching starts
854 from 0. */
612546c6 855 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
15a7d175
UD
856 if (pmatch[reg_idx].rm_so != -1)
857 {
c0d5034e 858#ifdef RE_ENABLE_I18N
56b168be 859 if (BE (mctx.input.offsets_needed != 0, 0))
bb3f4825 860 {
01ed6ceb
UD
861 pmatch[reg_idx].rm_so =
862 (pmatch[reg_idx].rm_so == mctx.input.valid_len
863 ? mctx.input.valid_raw_len
864 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
865 pmatch[reg_idx].rm_eo =
866 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
867 ? mctx.input.valid_raw_len
868 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
bb3f4825 869 }
c0d5034e 870#else
56b168be 871 assert (mctx.input.offsets_needed == 0);
c0d5034e 872#endif
15a7d175
UD
873 pmatch[reg_idx].rm_so += match_first;
874 pmatch[reg_idx].rm_eo += match_first;
875 }
963d8d78
UD
876 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
877 {
878 pmatch[nmatch + reg_idx].rm_so = -1;
879 pmatch[nmatch + reg_idx].rm_eo = -1;
880 }
c06a6956
UD
881
882 if (dfa->subexp_map)
2da42bc0
UD
883 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
884 if (dfa->subexp_map[reg_idx] != reg_idx)
885 {
886 pmatch[reg_idx + 1].rm_so
887 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
888 pmatch[reg_idx + 1].rm_eo
889 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
890 }
3b0bdc72 891 }
bb677c95 892
1b2c2628 893 free_return:
612546c6 894 re_free (mctx.state_log);
3b0bdc72
UD
895 if (dfa->nbackref)
896 match_ctx_free (&mctx);
56b168be 897 re_string_destruct (&mctx.input);
1b2c2628 898 return err;
3b0bdc72
UD
899}
900
6291ee3c 901static reg_errcode_t
b41bd5bc 902__attribute_warn_unused_result__
9dd346ff 903prune_impossible_nodes (re_match_context_t *mctx)
6291ee3c 904{
76b864c8 905 const re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
906 int halt_node, match_last;
907 reg_errcode_t ret;
6291ee3c
UD
908 re_dfastate_t **sifted_states;
909 re_dfastate_t **lim_states = NULL;
910 re_sift_context_t sctx;
911#ifdef DEBUG
912 assert (mctx->state_log != NULL);
913#endif
914 match_last = mctx->match_last;
915 halt_node = mctx->last_node;
4cd02867
PE
916
917 /* Avoid overflow. */
918 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
919 return REG_ESPACE;
920
6291ee3c
UD
921 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
922 if (BE (sifted_states == NULL, 0))
923 {
924 ret = REG_ESPACE;
925 goto free_return;
926 }
927 if (dfa->nbackref)
928 {
929 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
930 if (BE (lim_states == NULL, 0))
931 {
932 ret = REG_ESPACE;
933 goto free_return;
934 }
935 while (1)
936 {
937 memset (lim_states, '\0',
938 sizeof (re_dfastate_t *) * (match_last + 1));
6291ee3c 939 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
d2c38eb3 940 match_last);
e3a87852 941 ret = sift_states_backward (mctx, &sctx);
6291ee3c
UD
942 re_node_set_free (&sctx.limits);
943 if (BE (ret != REG_NOERROR, 0))
944 goto free_return;
945 if (sifted_states[0] != NULL || lim_states[0] != NULL)
946 break;
947 do
948 {
949 --match_last;
950 if (match_last < 0)
951 {
952 ret = REG_NOMATCH;
953 goto free_return;
954 }
97fd3a30
UD
955 } while (mctx->state_log[match_last] == NULL
956 || !mctx->state_log[match_last]->halt);
e3a87852 957 halt_node = check_halt_state_context (mctx,
6291ee3c 958 mctx->state_log[match_last],
e3a87852 959 match_last);
6291ee3c
UD
960 }
961 ret = merge_state_array (dfa, sifted_states, lim_states,
962 match_last + 1);
963 re_free (lim_states);
964 lim_states = NULL;
965 if (BE (ret != REG_NOERROR, 0))
966 goto free_return;
967 }
968 else
969 {
d2c38eb3 970 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
e3a87852 971 ret = sift_states_backward (mctx, &sctx);
6291ee3c
UD
972 re_node_set_free (&sctx.limits);
973 if (BE (ret != REG_NOERROR, 0))
974 goto free_return;
76c7f2cd
UD
975 if (sifted_states[0] == NULL)
976 {
977 ret = REG_NOMATCH;
978 goto free_return;
979 }
6291ee3c
UD
980 }
981 re_free (mctx->state_log);
982 mctx->state_log = sifted_states;
983 sifted_states = NULL;
984 mctx->last_node = halt_node;
985 mctx->match_last = match_last;
986 ret = REG_NOERROR;
987 free_return:
988 re_free (sifted_states);
989 re_free (lim_states);
990 return ret;
991}
992
a9388965 993/* Acquire an initial state and return it.
3b0bdc72
UD
994 We must select appropriate initial state depending on the context,
995 since initial states may have constraints like "\<", "^", etc.. */
996
bb3f4825 997static inline re_dfastate_t *
b41bd5bc 998__attribute ((always_inline))
e2f55264
UD
999acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1000 int idx)
3b0bdc72 1001{
76b864c8 1002 const re_dfa_t *const dfa = mctx->dfa;
3b0bdc72
UD
1003 if (dfa->init_state->has_constraint)
1004 {
1005 unsigned int context;
56b168be 1006 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
3b0bdc72 1007 if (IS_WORD_CONTEXT (context))
15a7d175 1008 return dfa->init_state_word;
3b0bdc72 1009 else if (IS_ORDINARY_CONTEXT (context))
15a7d175 1010 return dfa->init_state;
3b0bdc72 1011 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
15a7d175 1012 return dfa->init_state_begbuf;
3b0bdc72 1013 else if (IS_NEWLINE_CONTEXT (context))
15a7d175 1014 return dfa->init_state_nl;
3b0bdc72 1015 else if (IS_BEGBUF_CONTEXT (context))
15a7d175
UD
1016 {
1017 /* It is relatively rare case, then calculate on demand. */
4c595adb
UD
1018 return re_acquire_state_context (err, dfa,
1019 dfa->init_state->entrance_nodes,
1020 context);
15a7d175 1021 }
3b0bdc72 1022 else
15a7d175
UD
1023 /* Must not happen? */
1024 return dfa->init_state;
3b0bdc72
UD
1025 }
1026 else
1027 return dfa->init_state;
1028}
1029
1030/* Check whether the regular expression match input string INPUT or not,
a9388965
UD
1031 and return the index where the matching end, return -1 if not match,
1032 or return -2 in case of an error.
612546c6 1033 FL_LONGEST_MATCH means we want the POSIX longest matching.
15bad1a5
UD
1034 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1035 next place where we may want to try matching.
612546c6
UD
1036 Note that the matcher assume that the maching starts from the current
1037 index of the buffer. */
3b0bdc72
UD
1038
1039static int
b41bd5bc 1040__attribute_warn_unused_result__
e2f55264
UD
1041check_matching (re_match_context_t *mctx, int fl_longest_match,
1042 int *p_match_first)
3b0bdc72 1043{
76b864c8 1044 const re_dfa_t *const dfa = mctx->dfa;
a9388965 1045 reg_errcode_t err;
612546c6
UD
1046 int match = 0;
1047 int match_last = -1;
56b168be 1048 int cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72 1049 re_dfastate_t *cur_state;
4c595adb
UD
1050 int at_init_state = p_match_first != NULL;
1051 int next_start_idx = cur_str_idx;
3b0bdc72 1052
4c595adb 1053 err = REG_NOERROR;
e3a87852 1054 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
4c595adb 1055 /* An initial state must not be NULL (invalid). */
bc15410e 1056 if (BE (cur_state == NULL, 0))
4c595adb
UD
1057 {
1058 assert (err == REG_ESPACE);
1059 return -2;
1060 }
0742e48e 1061
4c595adb 1062 if (mctx->state_log != NULL)
6291ee3c 1063 {
4c595adb 1064 mctx->state_log[cur_str_idx] = cur_state;
6291ee3c 1065
4c595adb
UD
1066 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1067 later. E.g. Processing back references. */
1068 if (BE (dfa->nbackref, 0))
bb3f4825 1069 {
4c595adb
UD
1070 at_init_state = 0;
1071 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
bb3f4825
UD
1072 if (BE (err != REG_NOERROR, 0))
1073 return err;
4c595adb
UD
1074
1075 if (cur_state->has_backref)
1076 {
1077 err = transit_state_bkref (mctx, &cur_state->nodes);
1078 if (BE (err != REG_NOERROR, 0))
2da42bc0 1079 return err;
4c595adb 1080 }
bb3f4825 1081 }
0742e48e
UD
1082 }
1083
3b0bdc72 1084 /* If the RE accepts NULL string. */
bb3f4825 1085 if (BE (cur_state->halt, 0))
3b0bdc72
UD
1086 {
1087 if (!cur_state->has_constraint
e3a87852 1088 || check_halt_state_context (mctx, cur_state, cur_str_idx))
15a7d175
UD
1089 {
1090 if (!fl_longest_match)
1091 return cur_str_idx;
1092 else
1093 {
1094 match_last = cur_str_idx;
1095 match = 1;
1096 }
1097 }
3b0bdc72
UD
1098 }
1099
56b168be 1100 while (!re_string_eoi (&mctx->input))
3b0bdc72 1101 {
15bad1a5 1102 re_dfastate_t *old_state = cur_state;
bb677c95
UD
1103 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1104
8887a920
UD
1105 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1106 && mctx->input.bufs_len < mctx->input.len)
2da42bc0
UD
1107 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1108 && mctx->input.valid_len < mctx->input.len))
1109 {
a445af0b 1110 err = extend_buffers (mctx, next_char_idx + 1);
2da42bc0 1111 if (BE (err != REG_NOERROR, 0))
bb677c95
UD
1112 {
1113 assert (err == REG_ESPACE);
1114 return -2;
1115 }
2da42bc0 1116 }
bb677c95 1117
e3a87852 1118 cur_state = transit_state (&err, mctx, cur_state);
4c595adb
UD
1119 if (mctx->state_log != NULL)
1120 cur_state = merge_state_with_log (&err, mctx, cur_state);
15bad1a5 1121
4c595adb 1122 if (cur_state == NULL)
15a7d175 1123 {
4c595adb
UD
1124 /* Reached the invalid state or an error. Try to recover a valid
1125 state using the state log, if available and if we have not
1126 already found a valid (even if not the longest) match. */
15a7d175
UD
1127 if (BE (err != REG_NOERROR, 0))
1128 return -2;
4c595adb
UD
1129
1130 if (mctx->state_log == NULL
1131 || (match && !fl_longest_match)
1132 || (cur_state = find_recover_state (&err, mctx)) == NULL)
15a7d175 1133 break;
4c595adb
UD
1134 }
1135
bb677c95 1136 if (BE (at_init_state, 0))
4c595adb
UD
1137 {
1138 if (old_state == cur_state)
bb677c95 1139 next_start_idx = next_char_idx;
c13c99fa 1140 else
4c595adb 1141 at_init_state = 0;
15a7d175 1142 }
3b0bdc72 1143
4c595adb 1144 if (cur_state->halt)
15a7d175 1145 {
4c595adb 1146 /* Reached a halt state.
15a7d175
UD
1147 Check the halt state can satisfy the current context. */
1148 if (!cur_state->has_constraint
e3a87852 1149 || check_halt_state_context (mctx, cur_state,
56b168be 1150 re_string_cur_idx (&mctx->input)))
15a7d175
UD
1151 {
1152 /* We found an appropriate halt state. */
56b168be 1153 match_last = re_string_cur_idx (&mctx->input);
15a7d175 1154 match = 1;
bb677c95
UD
1155
1156 /* We found a match, do not modify match_first below. */
1157 p_match_first = NULL;
15a7d175
UD
1158 if (!fl_longest_match)
1159 break;
1160 }
1161 }
bb677c95 1162 }
15bad1a5 1163
bb677c95 1164 if (p_match_first)
4c595adb 1165 *p_match_first += next_start_idx;
15bad1a5 1166
3b0bdc72
UD
1167 return match_last;
1168}
1169
1170/* Check NODE match the current context. */
1171
e2f55264 1172static int
e2f55264 1173check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
3b0bdc72 1174{
3b0bdc72 1175 re_token_type_t type = dfa->nodes[node].type;
485d775d
UD
1176 unsigned int constraint = dfa->nodes[node].constraint;
1177 if (type != END_OF_RE)
3b0bdc72 1178 return 0;
485d775d
UD
1179 if (!constraint)
1180 return 1;
1181 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
3b0bdc72
UD
1182 return 0;
1183 return 1;
1184}
1185
1186/* Check the halt state STATE match the current context.
1187 Return 0 if not match, if the node, STATE has, is a halt node and
1188 match the context, return the node. */
1189
1190static int
e2f55264
UD
1191check_halt_state_context (const re_match_context_t *mctx,
1192 const re_dfastate_t *state, int idx)
3b0bdc72 1193{
3b0bdc72
UD
1194 int i;
1195 unsigned int context;
1196#ifdef DEBUG
1197 assert (state->halt);
1198#endif
56b168be 1199 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
3b0bdc72 1200 for (i = 0; i < state->nodes.nelem; ++i)
e3a87852 1201 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
3b0bdc72
UD
1202 return state->nodes.elems[i];
1203 return 0;
1204}
1205
a9388965
UD
1206/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1207 corresponding to the DFA).
1208 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1209 of errors. */
3b0bdc72
UD
1210
1211static int
e2f55264
UD
1212proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1213 int *pidx, int node, re_node_set *eps_via_nodes,
1214 struct re_fail_stack_t *fs)
3b0bdc72 1215{
76b864c8 1216 const re_dfa_t *const dfa = mctx->dfa;
2d87db5b 1217 int i, err;
3b0bdc72
UD
1218 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1219 {
485d775d 1220 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
5cf1ec52
UD
1221 re_node_set *edests = &dfa->edests[node];
1222 int dest_node;
a9388965 1223 err = re_node_set_insert (eps_via_nodes, node);
bc15410e 1224 if (BE (err < 0, 0))
44777771 1225 return -2;
5cf1ec52
UD
1226 /* Pick up a valid destination, or return -1 if none is found. */
1227 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
15a7d175 1228 {
5cf1ec52 1229 int candidate = edests->elems[i];
15a7d175
UD
1230 if (!re_node_set_contains (cur_nodes, candidate))
1231 continue;
2da42bc0 1232 if (dest_node == -1)
5cf1ec52
UD
1233 dest_node = candidate;
1234
2da42bc0 1235 else
5cf1ec52
UD
1236 {
1237 /* In order to avoid infinite loop like "(a*)*", return the second
2da42bc0 1238 epsilon-transition if the first was already considered. */
5cf1ec52 1239 if (re_node_set_contains (eps_via_nodes, dest_node))
2da42bc0 1240 return candidate;
5cf1ec52
UD
1241
1242 /* Otherwise, push the second epsilon-transition on the fail stack. */
1243 else if (fs != NULL
1244 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
2da42bc0 1245 eps_via_nodes))
5cf1ec52
UD
1246 return -2;
1247
1248 /* We know we are going to exit. */
1249 break;
1250 }
1251 }
1252 return dest_node;
3b0bdc72
UD
1253 }
1254 else
1255 {
485d775d 1256 int naccepted = 0;
3b0bdc72 1257 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72 1258
434d3784 1259#ifdef RE_ENABLE_I18N
02f3550c 1260 if (dfa->nodes[node].accept_mb)
56b168be 1261 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
434d3784
UD
1262 else
1263#endif /* RE_ENABLE_I18N */
1264 if (type == OP_BACK_REF)
15a7d175 1265 {
d8f73de8 1266 int subexp_idx = dfa->nodes[node].opr.idx + 1;
15a7d175
UD
1267 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1268 if (fs != NULL)
1269 {
1270 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1271 return -1;
1272 else if (naccepted)
1273 {
56b168be 1274 char *buf = (char *) re_string_get_buffer (&mctx->input);
6291ee3c
UD
1275 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1276 naccepted) != 0)
15a7d175
UD
1277 return -1;
1278 }
1279 }
1280
1281 if (naccepted == 0)
1282 {
2d87db5b 1283 int dest_node;
15a7d175
UD
1284 err = re_node_set_insert (eps_via_nodes, node);
1285 if (BE (err < 0, 0))
1286 return -2;
1287 dest_node = dfa->edests[node].elems[0];
1288 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1289 dest_node))
1290 return dest_node;
1291 }
1292 }
3b0bdc72
UD
1293
1294 if (naccepted != 0
e3a87852 1295 || check_node_accept (mctx, dfa->nodes + node, *pidx))
15a7d175 1296 {
2d87db5b 1297 int dest_node = dfa->nexts[node];
15a7d175
UD
1298 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1299 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1300 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1301 dest_node)))
1302 return -1;
1303 re_node_set_empty (eps_via_nodes);
1304 return dest_node;
1305 }
3b0bdc72 1306 }
a3022b82
UD
1307 return -1;
1308}
1309
1310static reg_errcode_t
b41bd5bc 1311__attribute_warn_unused_result__
e2f55264
UD
1312push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1313 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
a3022b82
UD
1314{
1315 reg_errcode_t err;
1316 int num = fs->num++;
1317 if (fs->num == fs->alloc)
1318 {
1b2c2628 1319 struct re_fail_stack_ent_t *new_array;
1b2c2628 1320 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
44777771 1321 * fs->alloc * 2));
1b2c2628 1322 if (new_array == NULL)
15a7d175 1323 return REG_ESPACE;
44777771 1324 fs->alloc *= 2;
1b2c2628 1325 fs->stack = new_array;
a3022b82
UD
1326 }
1327 fs->stack[num].idx = str_idx;
5cf1ec52 1328 fs->stack[num].node = dest_node;
a3022b82 1329 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
5a299c96
UD
1330 if (fs->stack[num].regs == NULL)
1331 return REG_ESPACE;
a3022b82
UD
1332 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1333 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1334 return err;
1335}
1b2c2628 1336
a3022b82 1337static int
e2f55264
UD
1338pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1339 regmatch_t *regs, re_node_set *eps_via_nodes)
a3022b82
UD
1340{
1341 int num = --fs->num;
1342 assert (num >= 0);
44777771 1343 *pidx = fs->stack[num].idx;
a3022b82
UD
1344 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1345 re_node_set_free (eps_via_nodes);
485d775d 1346 re_free (fs->stack[num].regs);
a3022b82
UD
1347 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1348 return fs->stack[num].node;
3b0bdc72
UD
1349}
1350
1351/* Set the positions where the subexpressions are starts/ends to registers
1352 PMATCH.
1353 Note: We assume that pmatch[0] is already set, and
97fd3a30 1354 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
3b0bdc72 1355
a9388965 1356static reg_errcode_t
b41bd5bc 1357__attribute_warn_unused_result__
e2f55264
UD
1358set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1359 regmatch_t *pmatch, int fl_backtrack)
3b0bdc72 1360{
76b864c8 1361 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
963d8d78 1362 int idx, cur_node;
3b0bdc72 1363 re_node_set eps_via_nodes;
a3022b82 1364 struct re_fail_stack_t *fs;
6b6557e8
UD
1365 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1366 regmatch_t *prev_idx_match;
2d87db5b 1367 int prev_idx_match_malloced = 0;
6b6557e8 1368
3b0bdc72
UD
1369#ifdef DEBUG
1370 assert (nmatch > 1);
612546c6 1371 assert (mctx->state_log != NULL);
3b0bdc72 1372#endif
a3022b82
UD
1373 if (fl_backtrack)
1374 {
1375 fs = &fs_body;
1376 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
6b6557e8
UD
1377 if (fs->stack == NULL)
1378 return REG_ESPACE;
a3022b82
UD
1379 }
1380 else
1381 fs = NULL;
6b6557e8 1382
3b0bdc72 1383 cur_node = dfa->init_node;
3b0bdc72 1384 re_node_set_init_empty (&eps_via_nodes);
6b6557e8 1385
2d87db5b
UD
1386 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1387 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1388 else
1389 {
1390 prev_idx_match = re_malloc (regmatch_t, nmatch);
1391 if (prev_idx_match == NULL)
1392 {
1393 free_fail_stack_return (fs);
1394 return REG_ESPACE;
1395 }
1396 prev_idx_match_malloced = 1;
1397 }
963d8d78 1398 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
6b6557e8 1399
3b0bdc72
UD
1400 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1401 {
963d8d78 1402 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
6b6557e8 1403
a3022b82 1404 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
15a7d175
UD
1405 {
1406 int reg_idx;
1407 if (fs)
1408 {
1409 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1410 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1411 break;
1412 if (reg_idx == nmatch)
1413 {
1414 re_node_set_free (&eps_via_nodes);
2d87db5b
UD
1415 if (prev_idx_match_malloced)
1416 re_free (prev_idx_match);
15a7d175
UD
1417 return free_fail_stack_return (fs);
1418 }
1419 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1420 &eps_via_nodes);
1421 }
1422 else
1423 {
1424 re_node_set_free (&eps_via_nodes);
2d87db5b
UD
1425 if (prev_idx_match_malloced)
1426 re_free (prev_idx_match);
15a7d175
UD
1427 return REG_NOERROR;
1428 }
1429 }
3b0bdc72
UD
1430
1431 /* Proceed to next node. */
e3a87852 1432 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
15a7d175 1433 &eps_via_nodes, fs);
a3022b82 1434
bc15410e 1435 if (BE (cur_node < 0, 0))
15a7d175 1436 {
44777771
UD
1437 if (BE (cur_node == -2, 0))
1438 {
1439 re_node_set_free (&eps_via_nodes);
2d87db5b
UD
1440 if (prev_idx_match_malloced)
1441 re_free (prev_idx_match);
44777771
UD
1442 free_fail_stack_return (fs);
1443 return REG_ESPACE;
1444 }
15a7d175
UD
1445 if (fs)
1446 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1447 &eps_via_nodes);
1448 else
1449 {
1450 re_node_set_free (&eps_via_nodes);
2d87db5b
UD
1451 if (prev_idx_match_malloced)
1452 re_free (prev_idx_match);
15a7d175
UD
1453 return REG_NOMATCH;
1454 }
1455 }
3b0bdc72
UD
1456 }
1457 re_node_set_free (&eps_via_nodes);
2d87db5b
UD
1458 if (prev_idx_match_malloced)
1459 re_free (prev_idx_match);
485d775d
UD
1460 return free_fail_stack_return (fs);
1461}
1462
1463static reg_errcode_t
e2f55264 1464free_fail_stack_return (struct re_fail_stack_t *fs)
485d775d
UD
1465{
1466 if (fs)
1467 {
1468 int fs_idx;
1469 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
15a7d175
UD
1470 {
1471 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1472 re_free (fs->stack[fs_idx].regs);
1473 }
485d775d
UD
1474 re_free (fs->stack);
1475 }
a9388965 1476 return REG_NOERROR;
3b0bdc72
UD
1477}
1478
81c64d40 1479static void
e2f55264
UD
1480update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1481 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
81c64d40
UD
1482{
1483 int type = dfa->nodes[cur_node].type;
81c64d40
UD
1484 if (type == OP_OPEN_SUBEXP)
1485 {
6b6557e8
UD
1486 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1487
81c64d40 1488 /* We are at the first node of this sub expression. */
6b6557e8
UD
1489 if (reg_num < nmatch)
1490 {
1491 pmatch[reg_num].rm_so = cur_idx;
1492 pmatch[reg_num].rm_eo = -1;
1493 }
81c64d40
UD
1494 }
1495 else if (type == OP_CLOSE_SUBEXP)
6b6557e8
UD
1496 {
1497 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1498 if (reg_num < nmatch)
1499 {
1500 /* We are at the last node of this sub expression. */
1501 if (pmatch[reg_num].rm_so < cur_idx)
1502 {
1503 pmatch[reg_num].rm_eo = cur_idx;
1504 /* This is a non-empty match or we are not inside an optional
1505 subexpression. Accept this right away. */
1506 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1507 }
1508 else
1509 {
1510 if (dfa->nodes[cur_node].opt_subexp
1511 && prev_idx_match[reg_num].rm_so != -1)
1512 /* We transited through an empty match for an optional
1513 subexpression, like (a?)*, and this is not the subexp's
1514 first match. Copy back the old content of the registers
1515 so that matches of an inner subexpression are undone as
1516 well, like in ((a?))*. */
1517 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1518 else
1519 /* We completed a subexpression, but it may be part of
1520 an optional one, so do not update PREV_IDX_MATCH. */
1521 pmatch[reg_num].rm_eo = cur_idx;
1522 }
1523 }
1524 }
0742e48e 1525}
81c64d40 1526
0742e48e 1527/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
612546c6
UD
1528 and sift the nodes in each states according to the following rules.
1529 Updated state_log will be wrote to STATE_LOG.
3b0bdc72
UD
1530
1531 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1532 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
15a7d175
UD
1533 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1534 the LAST_NODE, we throw away the node `a'.
612546c6 1535 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
15a7d175
UD
1536 string `s' and transit to `b':
1537 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1538 away the node `a'.
1539 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
bb3f4825 1540 thrown away, we throw away the node `a'.
d0b96fc4 1541 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
15a7d175
UD
1542 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1543 node `a'.
bb3f4825 1544 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
15a7d175 1545 we throw away the node `a'. */
3b0bdc72
UD
1546
1547#define STATE_NODE_CONTAINS(state,node) \
1548 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1549
a9388965 1550static reg_errcode_t
e2f55264 1551sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
3b0bdc72 1552{
a9388965 1553 reg_errcode_t err;
0742e48e
UD
1554 int null_cnt = 0;
1555 int str_idx = sctx->last_str_idx;
1556 re_node_set cur_dest;
3b0bdc72
UD
1557
1558#ifdef DEBUG
612546c6 1559 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
3b0bdc72 1560#endif
3b0bdc72
UD
1561
1562 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1563 transit to the last_node and the last_node itself. */
0742e48e 1564 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
bc15410e 1565 if (BE (err != REG_NOERROR, 0))
a9388965 1566 return err;
e3a87852 1567 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
0742e48e 1568 if (BE (err != REG_NOERROR, 0))
1b2c2628 1569 goto free_return;
3b0bdc72
UD
1570
1571 /* Then check each states in the state_log. */
612546c6 1572 while (str_idx > 0)
3b0bdc72 1573 {
3b0bdc72 1574 /* Update counters. */
0742e48e
UD
1575 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1576 if (null_cnt > mctx->max_mb_elem_len)
15a7d175
UD
1577 {
1578 memset (sctx->sifted_states, '\0',
1579 sizeof (re_dfastate_t *) * str_idx);
1580 re_node_set_free (&cur_dest);
1581 return REG_NOERROR;
1582 }
0742e48e 1583 re_node_set_empty (&cur_dest);
3b0bdc72 1584 --str_idx;
15a7d175 1585
e40a38b3
UD
1586 if (mctx->state_log[str_idx])
1587 {
1588 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
2da42bc0 1589 if (BE (err != REG_NOERROR, 0))
e40a38b3 1590 goto free_return;
15a7d175 1591 }
3b0bdc72 1592
0742e48e 1593 /* Add all the nodes which satisfy the following conditions:
15a7d175
UD
1594 - It can epsilon transit to a node in CUR_DEST.
1595 - It is in CUR_SRC.
1596 And update state_log. */
e3a87852 1597 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
0742e48e 1598 if (BE (err != REG_NOERROR, 0))
15a7d175 1599 goto free_return;
3b0bdc72 1600 }
1b2c2628
UD
1601 err = REG_NOERROR;
1602 free_return:
0742e48e 1603 re_node_set_free (&cur_dest);
1b2c2628 1604 return err;
3b0bdc72
UD
1605}
1606
e40a38b3 1607static reg_errcode_t
b41bd5bc 1608__attribute_warn_unused_result__
e2f55264
UD
1609build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1610 int str_idx, re_node_set *cur_dest)
e40a38b3 1611{
76b864c8
UD
1612 const re_dfa_t *const dfa = mctx->dfa;
1613 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
e40a38b3
UD
1614 int i;
1615
1616 /* Then build the next sifted state.
1617 We build the next sifted state on `cur_dest', and update
1618 `sifted_states[str_idx]' with `cur_dest'.
1619 Note:
1620 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
5cf1ec52
UD
1621 `cur_src' points the node_set of the old `state_log[str_idx]'
1622 (with the epsilon nodes pre-filtered out). */
e40a38b3
UD
1623 for (i = 0; i < cur_src->nelem; i++)
1624 {
1625 int prev_node = cur_src->elems[i];
1626 int naccepted = 0;
e40a38b3
UD
1627 int ret;
1628
a334319f 1629#ifdef DEBUG
02f3550c 1630 re_token_type_t type = dfa->nodes[prev_node].type;
5cf1ec52
UD
1631 assert (!IS_EPSILON_NODE (type));
1632#endif
e40a38b3
UD
1633#ifdef RE_ENABLE_I18N
1634 /* If the node may accept `multi byte'. */
02f3550c 1635 if (dfa->nodes[prev_node].accept_mb)
e40a38b3
UD
1636 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1637 str_idx, sctx->last_str_idx);
1638#endif /* RE_ENABLE_I18N */
1639
1640 /* We don't check backreferences here.
1641 See update_cur_sifted_state(). */
1642 if (!naccepted
1643 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1644 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1645 dfa->nexts[prev_node]))
1646 naccepted = 1;
1647
1648 if (naccepted == 0)
1649 continue;
1650
1651 if (sctx->limits.nelem)
1652 {
1653 int to_idx = str_idx + naccepted;
1654 if (check_dst_limits (mctx, &sctx->limits,
1655 dfa->nexts[prev_node], to_idx,
1656 prev_node, str_idx))
1657 continue;
1658 }
1659 ret = re_node_set_insert (cur_dest, prev_node);
1660 if (BE (ret == -1, 0))
1661 return REG_ESPACE;
1662 }
1663
1664 return REG_NOERROR;
1665}
1666
3b0bdc72
UD
1667/* Helper functions. */
1668
25337753 1669static reg_errcode_t
e2f55264 1670clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
3b0bdc72
UD
1671{
1672 int top = mctx->state_log_top;
612546c6 1673
8887a920
UD
1674 if ((next_state_log_idx >= mctx->input.bufs_len
1675 && mctx->input.bufs_len < mctx->input.len)
56b168be
UD
1676 || (next_state_log_idx >= mctx->input.valid_len
1677 && mctx->input.valid_len < mctx->input.len))
612546c6
UD
1678 {
1679 reg_errcode_t err;
a445af0b 1680 err = extend_buffers (mctx, next_state_log_idx + 1);
612546c6 1681 if (BE (err != REG_NOERROR, 0))
15a7d175 1682 return err;
612546c6
UD
1683 }
1684
3b0bdc72
UD
1685 if (top < next_state_log_idx)
1686 {
612546c6 1687 memset (mctx->state_log + top + 1, '\0',
15a7d175 1688 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
3b0bdc72
UD
1689 mctx->state_log_top = next_state_log_idx;
1690 }
612546c6 1691 return REG_NOERROR;
3b0bdc72
UD
1692}
1693
1b2c2628 1694static reg_errcode_t
e2f55264
UD
1695merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1696 re_dfastate_t **src, int num)
3b0bdc72 1697{
0742e48e
UD
1698 int st_idx;
1699 reg_errcode_t err;
1700 for (st_idx = 0; st_idx < num; ++st_idx)
1701 {
1702 if (dst[st_idx] == NULL)
15a7d175 1703 dst[st_idx] = src[st_idx];
0742e48e 1704 else if (src[st_idx] != NULL)
15a7d175
UD
1705 {
1706 re_node_set merged_set;
1707 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1708 &src[st_idx]->nodes);
1709 if (BE (err != REG_NOERROR, 0))
1710 return err;
1711 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1712 re_node_set_free (&merged_set);
1713 if (BE (err != REG_NOERROR, 0))
1714 return err;
1715 }
0742e48e
UD
1716 }
1717 return REG_NOERROR;
3b0bdc72
UD
1718}
1719
0742e48e 1720static reg_errcode_t
e2f55264
UD
1721update_cur_sifted_state (const re_match_context_t *mctx,
1722 re_sift_context_t *sctx, int str_idx,
1723 re_node_set *dest_nodes)
3b0bdc72 1724{
76b864c8 1725 const re_dfa_t *const dfa = mctx->dfa;
c0c9615c 1726 reg_errcode_t err = REG_NOERROR;
0742e48e 1727 const re_node_set *candidates;
e40a38b3 1728 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
15a7d175 1729 : &mctx->state_log[str_idx]->nodes);
0742e48e 1730
e40a38b3
UD
1731 if (dest_nodes->nelem == 0)
1732 sctx->sifted_states[str_idx] = NULL;
1733 else
485d775d 1734 {
e40a38b3
UD
1735 if (candidates)
1736 {
1737 /* At first, add the nodes which can epsilon transit to a node in
1738 DEST_NODE. */
1739 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1740 if (BE (err != REG_NOERROR, 0))
1741 return err;
0742e48e 1742
e40a38b3
UD
1743 /* Then, check the limitations in the current sift_context. */
1744 if (sctx->limits.nelem)
1745 {
1746 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1747 mctx->bkref_ents, str_idx);
1748 if (BE (err != REG_NOERROR, 0))
1749 return err;
1750 }
1751 }
1752
1753 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
0742e48e 1754 if (BE (err != REG_NOERROR, 0))
15a7d175 1755 return err;
0742e48e
UD
1756 }
1757
e40a38b3 1758 if (candidates && mctx->state_log[str_idx]->has_backref)
0742e48e 1759 {
d2c38eb3 1760 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
0742e48e 1761 if (BE (err != REG_NOERROR, 0))
15a7d175 1762 return err;
0742e48e
UD
1763 }
1764 return REG_NOERROR;
3b0bdc72
UD
1765}
1766
a9388965 1767static reg_errcode_t
b41bd5bc 1768__attribute_warn_unused_result__
e2f55264
UD
1769add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1770 const re_node_set *candidates)
3b0bdc72 1771{
5cf1ec52
UD
1772 reg_errcode_t err = REG_NOERROR;
1773 int i;
0742e48e 1774
5cf1ec52 1775 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
0742e48e
UD
1776 if (BE (err != REG_NOERROR, 0))
1777 return err;
5cf1ec52
UD
1778
1779 if (!state->inveclosure.alloc)
3b0bdc72 1780 {
5cf1ec52 1781 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
0742e48e 1782 if (BE (err != REG_NOERROR, 0))
2da42bc0 1783 return REG_ESPACE;
5cf1ec52 1784 for (i = 0; i < dest_nodes->nelem; i++)
2da42bc0
UD
1785 {
1786 err = re_node_set_merge (&state->inveclosure,
1787 dfa->inveclosures + dest_nodes->elems[i]);
1788 if (BE (err != REG_NOERROR, 0))
1789 return REG_ESPACE;
1790 }
0742e48e 1791 }
5cf1ec52
UD
1792 return re_node_set_add_intersect (dest_nodes, candidates,
1793 &state->inveclosure);
0742e48e 1794}
3b0bdc72 1795
0742e48e 1796static reg_errcode_t
e2f55264
UD
1797sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1798 const re_node_set *candidates)
0742e48e
UD
1799{
1800 int ecl_idx;
1801 reg_errcode_t err;
1802 re_node_set *inv_eclosure = dfa->inveclosures + node;
1803 re_node_set except_nodes;
1804 re_node_set_init_empty (&except_nodes);
1805 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1806 {
15a7d175
UD
1807 int cur_node = inv_eclosure->elems[ecl_idx];
1808 if (cur_node == node)
1809 continue;
1810 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1811 {
1812 int edst1 = dfa->edests[cur_node].elems[0];
1813 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1814 ? dfa->edests[cur_node].elems[1] : -1);
1815 if ((!re_node_set_contains (inv_eclosure, edst1)
1816 && re_node_set_contains (dest_nodes, edst1))
1817 || (edst2 > 0
1818 && !re_node_set_contains (inv_eclosure, edst2)
1819 && re_node_set_contains (dest_nodes, edst2)))
1820 {
1821 err = re_node_set_add_intersect (&except_nodes, candidates,
1822 dfa->inveclosures + cur_node);
1823 if (BE (err != REG_NOERROR, 0))
1824 {
1825 re_node_set_free (&except_nodes);
1826 return err;
1827 }
1828 }
1829 }
0742e48e
UD
1830 }
1831 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1832 {
15a7d175
UD
1833 int cur_node = inv_eclosure->elems[ecl_idx];
1834 if (!re_node_set_contains (&except_nodes, cur_node))
1835 {
1836 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1837 re_node_set_remove_at (dest_nodes, idx);
1838 }
0742e48e
UD
1839 }
1840 re_node_set_free (&except_nodes);
1841 return REG_NOERROR;
1842}
1843
a3022b82 1844static int
e2f55264
UD
1845check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1846 int dst_node, int dst_idx, int src_node, int src_idx)
a3022b82 1847{
76b864c8 1848 const re_dfa_t *const dfa = mctx->dfa;
a3022b82
UD
1849 int lim_idx, src_pos, dst_pos;
1850
e40a38b3
UD
1851 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1852 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
a3022b82
UD
1853 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1854 {
485d775d 1855 int subexp_idx;
a3022b82
UD
1856 struct re_backref_cache_entry *ent;
1857 ent = mctx->bkref_ents + limits->elems[lim_idx];
d8f73de8 1858 subexp_idx = dfa->nodes[ent->node].opr.idx;
a3022b82 1859
e3a87852 1860 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
e40a38b3
UD
1861 subexp_idx, dst_node, dst_idx,
1862 dst_bkref_idx);
e3a87852 1863 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
e40a38b3
UD
1864 subexp_idx, src_node, src_idx,
1865 src_bkref_idx);
a3022b82
UD
1866
1867 /* In case of:
15a7d175
UD
1868 <src> <dst> ( <subexp> )
1869 ( <subexp> ) <src> <dst>
1870 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
a3022b82 1871 if (src_pos == dst_pos)
15a7d175 1872 continue; /* This is unrelated limitation. */
a3022b82 1873 else
15a7d175 1874 return 1;
a3022b82
UD
1875 }
1876 return 0;
1877}
1878
1879static int
e2f55264
UD
1880check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1881 int subexp_idx, int from_node, int bkref_idx)
a3022b82 1882{
76b864c8
UD
1883 const re_dfa_t *const dfa = mctx->dfa;
1884 const re_node_set *eclosures = dfa->eclosures + from_node;
1cef7b3c 1885 int node_idx;
d0b96fc4 1886
d0b96fc4
UD
1887 /* Else, we are on the boundary: examine the nodes on the epsilon
1888 closure. */
1cef7b3c
UD
1889 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1890 {
1891 int node = eclosures->elems[node_idx];
d0b96fc4
UD
1892 switch (dfa->nodes[node].type)
1893 {
1894 case OP_BACK_REF:
d8f73de8
UD
1895 if (bkref_idx != -1)
1896 {
1897 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1898 do
2da42bc0 1899 {
d8f73de8 1900 int dst, cpos;
d0b96fc4 1901
d8f73de8
UD
1902 if (ent->node != node)
1903 continue;
d0b96fc4 1904
2c05d33f
UD
1905 if (subexp_idx < BITSET_WORD_BITS
1906 && !(ent->eps_reachable_subexps_map
1907 & ((bitset_word_t) 1 << subexp_idx)))
d8f73de8 1908 continue;
d0b96fc4 1909
d8f73de8
UD
1910 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1911 OP_CLOSE_SUBEXP cases below. But, if the
1912 destination node is the same node as the source
1913 node, don't recurse because it would cause an
1914 infinite loop: a regex that exhibits this behavior
1915 is ()\1*\1* */
1916 dst = dfa->edests[node].elems[0];
1917 if (dst == from_node)
1918 {
1919 if (boundaries & 1)
2da42bc0 1920 return -1;
d8f73de8 1921 else /* if (boundaries & 2) */
2da42bc0 1922 return 0;
d8f73de8 1923 }
7db61208 1924
d8f73de8
UD
1925 cpos =
1926 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1927 dst, bkref_idx);
1928 if (cpos == -1 /* && (boundaries & 1) */)
1929 return -1;
1930 if (cpos == 0 && (boundaries & 2))
1931 return 0;
1932
2c05d33f
UD
1933 if (subexp_idx < BITSET_WORD_BITS)
1934 ent->eps_reachable_subexps_map
1935 &= ~((bitset_word_t) 1 << subexp_idx);
2da42bc0 1936 }
d8f73de8
UD
1937 while (ent++->more);
1938 }
1939 break;
f0c7c524 1940
d0b96fc4 1941 case OP_OPEN_SUBEXP:
e40a38b3 1942 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
d0b96fc4
UD
1943 return -1;
1944 break;
f0c7c524 1945
d0b96fc4 1946 case OP_CLOSE_SUBEXP:
e40a38b3 1947 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
d0b96fc4
UD
1948 return 0;
1949 break;
1950
1951 default:
15a7d175
UD
1952 break;
1953 }
a3022b82 1954 }
d0b96fc4 1955
e40a38b3
UD
1956 return (boundaries & 2) ? 1 : 0;
1957}
1958
1959static int
e2f55264
UD
1960check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1961 int subexp_idx, int from_node, int str_idx,
1962 int bkref_idx)
e40a38b3
UD
1963{
1964 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1965 int boundaries;
1966
1967 /* If we are outside the range of the subexpression, return -1 or 1. */
1968 if (str_idx < lim->subexp_from)
1969 return -1;
1970
1971 if (lim->subexp_to < str_idx)
d0b96fc4 1972 return 1;
e40a38b3
UD
1973
1974 /* If we are within the subexpression, return 0. */
1975 boundaries = (str_idx == lim->subexp_from);
1976 boundaries |= (str_idx == lim->subexp_to) << 1;
1977 if (boundaries == 0)
d0b96fc4 1978 return 0;
e40a38b3
UD
1979
1980 /* Else, examine epsilon closure. */
1981 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1982 from_node, bkref_idx);
a3022b82
UD
1983}
1984
0742e48e
UD
1985/* Check the limitations of sub expressions LIMITS, and remove the nodes
1986 which are against limitations from DEST_NODES. */
1987
1988static reg_errcode_t
e2f55264
UD
1989check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1990 const re_node_set *candidates, re_node_set *limits,
1991 struct re_backref_cache_entry *bkref_ents, int str_idx)
0742e48e
UD
1992{
1993 reg_errcode_t err;
1994 int node_idx, lim_idx;
1995
1996 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1997 {
485d775d 1998 int subexp_idx;
0742e48e
UD
1999 struct re_backref_cache_entry *ent;
2000 ent = bkref_ents + limits->elems[lim_idx];
2001
2002 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
15a7d175 2003 continue; /* This is unrelated limitation. */
0742e48e 2004
d8f73de8 2005 subexp_idx = dfa->nodes[ent->node].opr.idx;
0742e48e 2006 if (ent->subexp_to == str_idx)
15a7d175
UD
2007 {
2008 int ops_node = -1;
2009 int cls_node = -1;
2010 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2011 {
2012 int node = dest_nodes->elems[node_idx];
46bf9de7 2013 re_token_type_t type = dfa->nodes[node].type;
15a7d175
UD
2014 if (type == OP_OPEN_SUBEXP
2015 && subexp_idx == dfa->nodes[node].opr.idx)
2016 ops_node = node;
2017 else if (type == OP_CLOSE_SUBEXP
2018 && subexp_idx == dfa->nodes[node].opr.idx)
2019 cls_node = node;
2020 }
2021
2022 /* Check the limitation of the open subexpression. */
2023 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2024 if (ops_node >= 0)
2025 {
46bf9de7
UD
2026 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2027 candidates);
15a7d175
UD
2028 if (BE (err != REG_NOERROR, 0))
2029 return err;
2030 }
46bf9de7 2031
15a7d175 2032 /* Check the limitation of the close subexpression. */
46bf9de7
UD
2033 if (cls_node >= 0)
2034 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2035 {
2036 int node = dest_nodes->elems[node_idx];
2037 if (!re_node_set_contains (dfa->inveclosures + node,
2038 cls_node)
2039 && !re_node_set_contains (dfa->eclosures + node,
2040 cls_node))
2041 {
2042 /* It is against this limitation.
2043 Remove it form the current sifted state. */
2044 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2045 candidates);
2046 if (BE (err != REG_NOERROR, 0))
2047 return err;
2048 --node_idx;
2049 }
2050 }
15a7d175 2051 }
0742e48e 2052 else /* (ent->subexp_to != str_idx) */
15a7d175
UD
2053 {
2054 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2055 {
2056 int node = dest_nodes->elems[node_idx];
46bf9de7 2057 re_token_type_t type = dfa->nodes[node].type;
15a7d175
UD
2058 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2059 {
2060 if (subexp_idx != dfa->nodes[node].opr.idx)
2061 continue;
d8f73de8
UD
2062 /* It is against this limitation.
2063 Remove it form the current sifted state. */
2064 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2065 candidates);
2066 if (BE (err != REG_NOERROR, 0))
2067 return err;
15a7d175
UD
2068 }
2069 }
2070 }
0742e48e
UD
2071 }
2072 return REG_NOERROR;
2073}
2074
0742e48e 2075static reg_errcode_t
b41bd5bc 2076__attribute_warn_unused_result__
e2f55264
UD
2077sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2078 int str_idx, const re_node_set *candidates)
0742e48e 2079{
76b864c8 2080 const re_dfa_t *const dfa = mctx->dfa;
0742e48e 2081 reg_errcode_t err;
0742e48e
UD
2082 int node_idx, node;
2083 re_sift_context_t local_sctx;
e40a38b3
UD
2084 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2085
2086 if (first_idx == -1)
2087 return REG_NOERROR;
2088
0742e48e
UD
2089 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2090
2091 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2092 {
e40a38b3 2093 int enabled_idx;
0742e48e 2094 re_token_type_t type;
e40a38b3 2095 struct re_backref_cache_entry *entry;
0742e48e
UD
2096 node = candidates->elems[node_idx];
2097 type = dfa->nodes[node].type;
0742e48e
UD
2098 /* Avoid infinite loop for the REs like "()\1+". */
2099 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
15a7d175 2100 continue;
e40a38b3
UD
2101 if (type != OP_BACK_REF)
2102 continue;
2103
2104 entry = mctx->bkref_ents + first_idx;
2105 enabled_idx = first_idx;
2106 do
15a7d175 2107 {
2d87db5b
UD
2108 int subexp_len;
2109 int to_idx;
2110 int dst_node;
2111 int ret;
e40a38b3
UD
2112 re_dfastate_t *cur_state;
2113
2114 if (entry->node != node)
2115 continue;
2116 subexp_len = entry->subexp_to - entry->subexp_from;
2117 to_idx = str_idx + subexp_len;
2118 dst_node = (subexp_len ? dfa->nexts[node]
2119 : dfa->edests[node].elems[0]);
2120
2121 if (to_idx > sctx->last_str_idx
2122 || sctx->sifted_states[to_idx] == NULL
2123 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2124 || check_dst_limits (mctx, &sctx->limits, node,
2125 str_idx, dst_node, to_idx))
2126 continue;
2127
2128 if (local_sctx.sifted_states == NULL)
15a7d175 2129 {
e40a38b3
UD
2130 local_sctx = *sctx;
2131 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2132 if (BE (err != REG_NOERROR, 0))
2133 goto free_return;
2134 }
2135 local_sctx.last_node = node;
2136 local_sctx.last_str_idx = str_idx;
2d87db5b
UD
2137 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2138 if (BE (ret < 0, 0))
e40a38b3
UD
2139 {
2140 err = REG_ESPACE;
2141 goto free_return;
15a7d175 2142 }
e40a38b3
UD
2143 cur_state = local_sctx.sifted_states[str_idx];
2144 err = sift_states_backward (mctx, &local_sctx);
2145 if (BE (err != REG_NOERROR, 0))
2146 goto free_return;
2147 if (sctx->limited_states != NULL)
2148 {
2149 err = merge_state_array (dfa, sctx->limited_states,
2150 local_sctx.sifted_states,
2151 str_idx + 1);
2152 if (BE (err != REG_NOERROR, 0))
2153 goto free_return;
2154 }
2155 local_sctx.sifted_states[str_idx] = cur_state;
2156 re_node_set_remove (&local_sctx.limits, enabled_idx);
2157
2158 /* mctx->bkref_ents may have changed, reload the pointer. */
2da42bc0 2159 entry = mctx->bkref_ents + enabled_idx;
15a7d175 2160 }
e40a38b3 2161 while (enabled_idx++, entry++->more);
0742e48e 2162 }
1b2c2628
UD
2163 err = REG_NOERROR;
2164 free_return:
0742e48e
UD
2165 if (local_sctx.sifted_states != NULL)
2166 {
0742e48e
UD
2167 re_node_set_free (&local_sctx.limits);
2168 }
2169
1b2c2628 2170 return err;
0742e48e
UD
2171}
2172
2173
2174#ifdef RE_ENABLE_I18N
2175static int
e2f55264
UD
2176sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2177 int node_idx, int str_idx, int max_str_idx)
0742e48e 2178{
c42b4152 2179 const re_dfa_t *const dfa = mctx->dfa;
0742e48e
UD
2180 int naccepted;
2181 /* Check the node can accept `multi byte'. */
56b168be 2182 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
0742e48e
UD
2183 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2184 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
15a7d175 2185 dfa->nexts[node_idx]))
0742e48e 2186 /* The node can't accept the `multi byte', or the
bb3f4825 2187 destination was already thrown away, then the node
0742e48e
UD
2188 could't accept the current input `multi byte'. */
2189 naccepted = 0;
2190 /* Otherwise, it is sure that the node could accept
2191 `naccepted' bytes input. */
2192 return naccepted;
2193}
2194#endif /* RE_ENABLE_I18N */
2195
3b0bdc72
UD
2196\f
2197/* Functions for state transition. */
2198
2199/* Return the next state to which the current state STATE will transit by
2200 accepting the current input byte, and update STATE_LOG if necessary.
2201 If STATE can accept a multibyte char/collating element/back reference
2202 update the destination of STATE_LOG. */
2203
2204static re_dfastate_t *
b41bd5bc 2205__attribute_warn_unused_result__
e2f55264
UD
2206transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2207 re_dfastate_t *state)
3b0bdc72 2208{
58845a70 2209 re_dfastate_t **trtable;
3b0bdc72
UD
2210 unsigned char ch;
2211
bb677c95
UD
2212#ifdef RE_ENABLE_I18N
2213 /* If the current state can accept multibyte. */
2214 if (BE (state->accept_mb, 0))
612546c6 2215 {
bb677c95 2216 *err = transit_state_mb (mctx, state);
612546c6 2217 if (BE (*err != REG_NOERROR, 0))
15a7d175 2218 return NULL;
612546c6 2219 }
434d3784 2220#endif /* RE_ENABLE_I18N */
3b0bdc72 2221
4c595adb 2222 /* Then decide the next state with the single byte. */
ab4b89fe
UD
2223#if 0
2224 if (0)
2225 /* don't use transition table */
2226 return transit_state_sb (err, mctx, state);
2227#endif
2228
2229 /* Use transition table */
2230 ch = re_string_fetch_byte (&mctx->input);
2231 for (;;)
4c595adb 2232 {
4c595adb 2233 trtable = state->trtable;
ab4b89fe
UD
2234 if (BE (trtable != NULL, 1))
2235 return trtable[ch];
2236
2237 trtable = state->word_trtable;
2238 if (BE (trtable != NULL, 1))
2da42bc0 2239 {
4c595adb
UD
2240 unsigned int context;
2241 context
2242 = re_string_context_at (&mctx->input,
2243 re_string_cur_idx (&mctx->input) - 1,
2244 mctx->eflags);
2245 if (IS_WORD_CONTEXT (context))
2246 return trtable[ch + SBC_MAX];
c13c99fa 2247 else
4c595adb 2248 return trtable[ch];
15a7d175 2249 }
ab4b89fe
UD
2250
2251 if (!build_trtable (mctx->dfa, state))
2252 {
2253 *err = REG_ESPACE;
2254 return NULL;
2255 }
2256
2257 /* Retry, we now have a transition table. */
3b0bdc72 2258 }
4c595adb 2259}
3b0bdc72 2260
4c595adb 2261/* Update the state_log if we need */
58845a70 2262re_dfastate_t *
e2f55264
UD
2263merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2264 re_dfastate_t *next_state)
58845a70 2265{
c42b4152 2266 const re_dfa_t *const dfa = mctx->dfa;
4c595adb
UD
2267 int cur_idx = re_string_cur_idx (&mctx->input);
2268
2269 if (cur_idx > mctx->state_log_top)
3b0bdc72 2270 {
4c595adb
UD
2271 mctx->state_log[cur_idx] = next_state;
2272 mctx->state_log_top = cur_idx;
2273 }
2274 else if (mctx->state_log[cur_idx] == 0)
2275 {
2276 mctx->state_log[cur_idx] = next_state;
2277 }
2278 else
2279 {
2280 re_dfastate_t *pstate;
2281 unsigned int context;
2282 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2283 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2da42bc0
UD
2284 the destination of a multibyte char/collating element/
2285 back reference. Then the next state is the union set of
2286 these destinations and the results of the transition table. */
4c595adb
UD
2287 pstate = mctx->state_log[cur_idx];
2288 log_nodes = pstate->entrance_nodes;
2289 if (next_state != NULL)
2da42bc0
UD
2290 {
2291 table_nodes = next_state->entrance_nodes;
2292 *err = re_node_set_init_union (&next_nodes, table_nodes,
15a7d175 2293 log_nodes);
2da42bc0 2294 if (BE (*err != REG_NOERROR, 0))
4c595adb 2295 return NULL;
2da42bc0 2296 }
4c595adb 2297 else
2da42bc0 2298 next_nodes = *log_nodes;
4c595adb
UD
2299 /* Note: We already add the nodes of the initial state,
2300 then we don't need to add them here. */
2301
2302 context = re_string_context_at (&mctx->input,
2303 re_string_cur_idx (&mctx->input) - 1,
2304 mctx->eflags);
2305 next_state = mctx->state_log[cur_idx]
2da42bc0 2306 = re_acquire_state_context (err, dfa, &next_nodes, context);
4c595adb 2307 /* We don't need to check errors here, since the return value of
2da42bc0 2308 this function is next_state and ERR is already set. */
4c595adb
UD
2309
2310 if (table_nodes != NULL)
2da42bc0 2311 re_node_set_free (&next_nodes);
6291ee3c
UD
2312 }
2313
bb3f4825 2314 if (BE (dfa->nbackref, 0) && next_state != NULL)
6291ee3c 2315 {
bb3f4825
UD
2316 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2317 later. We must check them here, since the back references in the
2318 next state might use them. */
e3a87852 2319 *err = check_subexp_matching_top (mctx, &next_state->nodes,
6291ee3c
UD
2320 cur_idx);
2321 if (BE (*err != REG_NOERROR, 0))
2322 return NULL;
6291ee3c 2323
bb3f4825
UD
2324 /* If the next state has back references. */
2325 if (next_state->has_backref)
2326 {
e3a87852 2327 *err = transit_state_bkref (mctx, &next_state->nodes);
bb3f4825
UD
2328 if (BE (*err != REG_NOERROR, 0))
2329 return NULL;
2330 next_state = mctx->state_log[cur_idx];
2331 }
3b0bdc72 2332 }
4c595adb 2333
3b0bdc72
UD
2334 return next_state;
2335}
2336
4c595adb
UD
2337/* Skip bytes in the input that correspond to part of a
2338 multi-byte match, then look in the log for a state
2339 from which to restart matching. */
2340re_dfastate_t *
e2f55264 2341find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
4c595adb 2342{
1878e9af 2343 re_dfastate_t *cur_state;
4c595adb
UD
2344 do
2345 {
2346 int max = mctx->state_log_top;
2347 int cur_str_idx = re_string_cur_idx (&mctx->input);
2348
2349 do
2350 {
2da42bc0
UD
2351 if (++cur_str_idx > max)
2352 return NULL;
2353 re_string_skip_bytes (&mctx->input, 1);
4c595adb
UD
2354 }
2355 while (mctx->state_log[cur_str_idx] == NULL);
2356
2357 cur_state = merge_state_with_log (err, mctx, NULL);
2358 }
2d87db5b 2359 while (*err == REG_NOERROR && cur_state == NULL);
4c595adb
UD
2360 return cur_state;
2361}
2362
3b0bdc72
UD
2363/* Helper functions for transit_state. */
2364
6291ee3c
UD
2365/* From the node set CUR_NODES, pick up the nodes whose types are
2366 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2367 expression. And register them to use them later for evaluating the
2368 correspoding back references. */
2369
2370static reg_errcode_t
e2f55264
UD
2371check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2372 int str_idx)
6291ee3c 2373{
76b864c8 2374 const re_dfa_t *const dfa = mctx->dfa;
6291ee3c
UD
2375 int node_idx;
2376 reg_errcode_t err;
2377
2378 /* TODO: This isn't efficient.
2379 Because there might be more than one nodes whose types are
2380 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2381 nodes.
2382 E.g. RE: (a){2} */
2383 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2384 {
2385 int node = cur_nodes->elems[node_idx];
2386 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2c05d33f
UD
2387 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2388 && (dfa->used_bkref_map
2389 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
6291ee3c
UD
2390 {
2391 err = match_ctx_add_subtop (mctx, node, str_idx);
2392 if (BE (err != REG_NOERROR, 0))
2393 return err;
2394 }
2395 }
2396 return REG_NOERROR;
2397}
2398
c13c99fa 2399#if 0
3b0bdc72
UD
2400/* Return the next state to which the current state STATE will transit by
2401 accepting the current input byte. */
2402
2403static re_dfastate_t *
e2f55264
UD
2404transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2405 re_dfastate_t *state)
3b0bdc72 2406{
c42b4152 2407 const re_dfa_t *const dfa = mctx->dfa;
3b0bdc72
UD
2408 re_node_set next_nodes;
2409 re_dfastate_t *next_state;
56b168be 2410 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72
UD
2411 unsigned int context;
2412
a9388965 2413 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
bc15410e 2414 if (BE (*err != REG_NOERROR, 0))
a9388965 2415 return NULL;
3b0bdc72
UD
2416 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2417 {
2418 int cur_node = state->nodes.elems[node_cnt];
e3a87852 2419 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
15a7d175
UD
2420 {
2421 *err = re_node_set_merge (&next_nodes,
2422 dfa->eclosures + dfa->nexts[cur_node]);
2423 if (BE (*err != REG_NOERROR, 0))
2424 {
2425 re_node_set_free (&next_nodes);
2426 return NULL;
2427 }
2428 }
3b0bdc72 2429 }
56b168be 2430 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
a9388965
UD
2431 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2432 /* We don't need to check errors here, since the return value of
2433 this function is next_state and ERR is already set. */
2434
3b0bdc72 2435 re_node_set_free (&next_nodes);
56b168be 2436 re_string_skip_bytes (&mctx->input, 1);
3b0bdc72
UD
2437 return next_state;
2438}
c13c99fa 2439#endif
3b0bdc72 2440
434d3784 2441#ifdef RE_ENABLE_I18N
a9388965 2442static reg_errcode_t
e2f55264 2443transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
3b0bdc72 2444{
c42b4152 2445 const re_dfa_t *const dfa = mctx->dfa;
a9388965 2446 reg_errcode_t err;
3b0bdc72
UD
2447 int i;
2448
2449 for (i = 0; i < pstate->nodes.nelem; ++i)
2450 {
2451 re_node_set dest_nodes, *new_nodes;
2452 int cur_node_idx = pstate->nodes.elems[i];
963d8d78 2453 int naccepted, dest_idx;
3b0bdc72
UD
2454 unsigned int context;
2455 re_dfastate_t *dest_state;
2456
963d8d78 2457 if (!dfa->nodes[cur_node_idx].accept_mb)
2da42bc0 2458 continue;
963d8d78 2459
485d775d 2460 if (dfa->nodes[cur_node_idx].constraint)
15a7d175 2461 {
56b168be
UD
2462 context = re_string_context_at (&mctx->input,
2463 re_string_cur_idx (&mctx->input),
2464 mctx->eflags);
15a7d175
UD
2465 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2466 context))
2467 continue;
2468 }
3b0bdc72 2469
ad7f28c2 2470 /* How many bytes the node can accept? */
963d8d78
UD
2471 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2472 re_string_cur_idx (&mctx->input));
3b0bdc72 2473 if (naccepted == 0)
15a7d175 2474 continue;
3b0bdc72
UD
2475
2476 /* The node can accepts `naccepted' bytes. */
56b168be 2477 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
0742e48e 2478 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
15a7d175 2479 : mctx->max_mb_elem_len);
7c1be3ec 2480 err = clean_state_log_if_needed (mctx, dest_idx);
612546c6 2481 if (BE (err != REG_NOERROR, 0))
15a7d175 2482 return err;
3b0bdc72
UD
2483#ifdef DEBUG
2484 assert (dfa->nexts[cur_node_idx] != -1);
2485#endif
963d8d78 2486 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
3b0bdc72 2487
612546c6 2488 dest_state = mctx->state_log[dest_idx];
3b0bdc72 2489 if (dest_state == NULL)
15a7d175 2490 dest_nodes = *new_nodes;
3b0bdc72 2491 else
15a7d175
UD
2492 {
2493 err = re_node_set_init_union (&dest_nodes,
2494 dest_state->entrance_nodes, new_nodes);
2495 if (BE (err != REG_NOERROR, 0))
2496 return err;
2497 }
01ed6ceb
UD
2498 context = re_string_context_at (&mctx->input, dest_idx - 1,
2499 mctx->eflags);
612546c6 2500 mctx->state_log[dest_idx]
15a7d175 2501 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
3b0bdc72 2502 if (dest_state != NULL)
15a7d175 2503 re_node_set_free (&dest_nodes);
1b2c2628 2504 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
15a7d175 2505 return err;
3b0bdc72 2506 }
a9388965 2507 return REG_NOERROR;
3b0bdc72 2508}
434d3784 2509#endif /* RE_ENABLE_I18N */
3b0bdc72 2510
a9388965 2511static reg_errcode_t
e2f55264 2512transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
3b0bdc72 2513{
6efbd82c 2514 const re_dfa_t *const dfa = mctx->dfa;
a9388965 2515 reg_errcode_t err;
0742e48e 2516 int i;
56b168be 2517 int cur_str_idx = re_string_cur_idx (&mctx->input);
3b0bdc72
UD
2518
2519 for (i = 0; i < nodes->nelem; ++i)
2520 {
6291ee3c 2521 int dest_str_idx, prev_nelem, bkc_idx;
3b0bdc72
UD
2522 int node_idx = nodes->elems[i];
2523 unsigned int context;
fe9434bb 2524 const re_token_t *node = dfa->nodes + node_idx;
3b0bdc72
UD
2525 re_node_set *new_dest_nodes;
2526
2527 /* Check whether `node' is a backreference or not. */
6291ee3c 2528 if (node->type != OP_BACK_REF)
15a7d175 2529 continue;
485d775d
UD
2530
2531 if (node->constraint)
15a7d175 2532 {
56b168be
UD
2533 context = re_string_context_at (&mctx->input, cur_str_idx,
2534 mctx->eflags);
15a7d175
UD
2535 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2536 continue;
2537 }
3b0bdc72
UD
2538
2539 /* `node' is a backreference.
15a7d175 2540 Check the substring which the substring matched. */
6291ee3c 2541 bkc_idx = mctx->nbkref_ents;
e3a87852 2542 err = get_subexp (mctx, node_idx, cur_str_idx);
612546c6 2543 if (BE (err != REG_NOERROR, 0))
15a7d175 2544 goto free_return;
3b0bdc72
UD
2545
2546 /* And add the epsilon closures (which is `new_dest_nodes') of
15a7d175 2547 the backreference to appropriate state_log. */
3b0bdc72
UD
2548#ifdef DEBUG
2549 assert (dfa->nexts[node_idx] != -1);
2550#endif
6291ee3c 2551 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
15a7d175
UD
2552 {
2553 int subexp_len;
2554 re_dfastate_t *dest_state;
2555 struct re_backref_cache_entry *bkref_ent;
2556 bkref_ent = mctx->bkref_ents + bkc_idx;
2557 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2558 continue;
2559 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2560 new_dest_nodes = (subexp_len == 0
2561 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2562 : dfa->eclosures + dfa->nexts[node_idx]);
2563 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2564 - bkref_ent->subexp_from);
56b168be
UD
2565 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2566 mctx->eflags);
15a7d175
UD
2567 dest_state = mctx->state_log[dest_str_idx];
2568 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2569 : mctx->state_log[cur_str_idx]->nodes.nelem);
2570 /* Add `new_dest_node' to state_log. */
2571 if (dest_state == NULL)
2572 {
2573 mctx->state_log[dest_str_idx]
2574 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2575 context);
2576 if (BE (mctx->state_log[dest_str_idx] == NULL
2577 && err != REG_NOERROR, 0))
2578 goto free_return;
2579 }
2580 else
2581 {
2582 re_node_set dest_nodes;
2583 err = re_node_set_init_union (&dest_nodes,
2584 dest_state->entrance_nodes,
2585 new_dest_nodes);
2586 if (BE (err != REG_NOERROR, 0))
2587 {
2588 re_node_set_free (&dest_nodes);
2589 goto free_return;
2590 }
2591 mctx->state_log[dest_str_idx]
2592 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2593 re_node_set_free (&dest_nodes);
2594 if (BE (mctx->state_log[dest_str_idx] == NULL
2595 && err != REG_NOERROR, 0))
2596 goto free_return;
2597 }
2598 /* We need to check recursively if the backreference can epsilon
2599 transit. */
2600 if (subexp_len == 0
2601 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2602 {
e3a87852 2603 err = check_subexp_matching_top (mctx, new_dest_nodes,
6291ee3c
UD
2604 cur_str_idx);
2605 if (BE (err != REG_NOERROR, 0))
2606 goto free_return;
e3a87852 2607 err = transit_state_bkref (mctx, new_dest_nodes);
15a7d175
UD
2608 if (BE (err != REG_NOERROR, 0))
2609 goto free_return;
2610 }
2611 }
3b0bdc72 2612 }
1b2c2628
UD
2613 err = REG_NOERROR;
2614 free_return:
1b2c2628 2615 return err;
3b0bdc72
UD
2616}
2617
6291ee3c
UD
2618/* Enumerate all the candidates which the backreference BKREF_NODE can match
2619 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2620 Note that we might collect inappropriate candidates here.
2621 However, the cost of checking them strictly here is too high, then we
2622 delay these checking for prune_impossible_nodes(). */
3b0bdc72 2623
6291ee3c 2624static reg_errcode_t
b41bd5bc 2625__attribute_warn_unused_result__
e2f55264 2626get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
3b0bdc72 2627{
6efbd82c 2628 const re_dfa_t *const dfa = mctx->dfa;
6291ee3c 2629 int subexp_num, sub_top_idx;
56b168be 2630 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
6291ee3c
UD
2631 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2632 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
e40a38b3 2633 if (cache_idx != -1)
6291ee3c 2634 {
01ed6ceb
UD
2635 const struct re_backref_cache_entry *entry
2636 = mctx->bkref_ents + cache_idx;
e40a38b3 2637 do
2da42bc0 2638 if (entry->node == bkref_node)
e40a38b3
UD
2639 return REG_NOERROR; /* We already checked it. */
2640 while (entry++->more);
6291ee3c 2641 }
e40a38b3 2642
d8f73de8 2643 subexp_num = dfa->nodes[bkref_node].opr.idx;
6291ee3c
UD
2644
2645 /* For each sub expression */
2646 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2647 {
2648 reg_errcode_t err;
2649 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2650 re_sub_match_last_t *sub_last;
7c1be3ec 2651 int sub_last_idx, sl_str, bkref_str_off;
6291ee3c
UD
2652
2653 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2654 continue; /* It isn't related. */
2655
2656 sl_str = sub_top->str_idx;
7c1be3ec 2657 bkref_str_off = bkref_str_idx;
6291ee3c
UD
2658 /* At first, check the last node of sub expressions we already
2659 evaluated. */
2660 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2661 {
2662 int sl_str_diff;
2663 sub_last = sub_top->lasts[sub_last_idx];
2664 sl_str_diff = sub_last->str_idx - sl_str;
2665 /* The matched string by the sub expression match with the substring
2666 at the back reference? */
c1baba0f
UD
2667 if (sl_str_diff > 0)
2668 {
2669 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2670 {
2671 /* Not enough chars for a successful match. */
2672 if (bkref_str_off + sl_str_diff > mctx->input.len)
2673 break;
2674
2675 err = clean_state_log_if_needed (mctx,
2676 bkref_str_off
2677 + sl_str_diff);
2678 if (BE (err != REG_NOERROR, 0))
2679 return err;
2680 buf = (const char *) re_string_get_buffer (&mctx->input);
2681 }
2682 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
01ed6ceb
UD
2683 /* We don't need to search this sub expression any more. */
2684 break;
c1baba0f 2685 }
7c1be3ec 2686 bkref_str_off += sl_str_diff;
6291ee3c 2687 sl_str += sl_str_diff;
e3a87852 2688 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
6291ee3c 2689 bkref_str_idx);
3ee363e2 2690
7c1be3ec
UD
2691 /* Reload buf, since the preceding call might have reallocated
2692 the buffer. */
56b168be 2693 buf = (const char *) re_string_get_buffer (&mctx->input);
3ee363e2 2694
6291ee3c
UD
2695 if (err == REG_NOMATCH)
2696 continue;
2697 if (BE (err != REG_NOERROR, 0))
2698 return err;
2699 }
7c1be3ec 2700
6291ee3c
UD
2701 if (sub_last_idx < sub_top->nlasts)
2702 continue;
2703 if (sub_last_idx > 0)
2704 ++sl_str;
2705 /* Then, search for the other last nodes of the sub expression. */
2706 for (; sl_str <= bkref_str_idx; ++sl_str)
2707 {
2708 int cls_node, sl_str_off;
fe9434bb 2709 const re_node_set *nodes;
6291ee3c
UD
2710 sl_str_off = sl_str - sub_top->str_idx;
2711 /* The matched string by the sub expression match with the substring
2712 at the back reference? */
c1baba0f
UD
2713 if (sl_str_off > 0)
2714 {
2715 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2716 {
2717 /* If we are at the end of the input, we cannot match. */
2718 if (bkref_str_off >= mctx->input.len)
2719 break;
2720
a445af0b 2721 err = extend_buffers (mctx, bkref_str_off + 1);
c1baba0f
UD
2722 if (BE (err != REG_NOERROR, 0))
2723 return err;
2724
2725 buf = (const char *) re_string_get_buffer (&mctx->input);
2726 }
2727 if (buf [bkref_str_off++] != buf[sl_str - 1])
2728 break; /* We don't need to search this sub expression
2729 any more. */
2730 }
6291ee3c
UD
2731 if (mctx->state_log[sl_str] == NULL)
2732 continue;
2733 /* Does this state have a ')' of the sub expression? */
2734 nodes = &mctx->state_log[sl_str]->nodes;
01ed6ceb
UD
2735 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2736 OP_CLOSE_SUBEXP);
6291ee3c
UD
2737 if (cls_node == -1)
2738 continue; /* No. */
2739 if (sub_top->path == NULL)
2740 {
2741 sub_top->path = calloc (sizeof (state_array_t),
2742 sl_str - sub_top->str_idx + 1);
2743 if (sub_top->path == NULL)
2744 return REG_ESPACE;
2745 }
2746 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2747 in the current context? */
e3a87852 2748 err = check_arrival (mctx, sub_top->path, sub_top->node,
01ed6ceb
UD
2749 sub_top->str_idx, cls_node, sl_str,
2750 OP_CLOSE_SUBEXP);
6291ee3c
UD
2751 if (err == REG_NOMATCH)
2752 continue;
2753 if (BE (err != REG_NOERROR, 0))
2754 return err;
2755 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2756 if (BE (sub_last == NULL, 0))
2757 return REG_ESPACE;
e3a87852 2758 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
6291ee3c
UD
2759 bkref_str_idx);
2760 if (err == REG_NOMATCH)
2761 continue;
2762 }
2763 }
2764 return REG_NOERROR;
2765}
2766
2767/* Helper functions for get_subexp(). */
2768
2769/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2770 If it can arrive, register the sub expression expressed with SUB_TOP
2771 and SUB_LAST. */
2772
2773static reg_errcode_t
e2f55264
UD
2774get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2775 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
6291ee3c
UD
2776{
2777 reg_errcode_t err;
2778 int to_idx;
2779 /* Can the subexpression arrive the back reference? */
e3a87852 2780 err = check_arrival (mctx, &sub_last->path, sub_last->node,
01ed6ceb
UD
2781 sub_last->str_idx, bkref_node, bkref_str,
2782 OP_OPEN_SUBEXP);
6291ee3c
UD
2783 if (err != REG_NOERROR)
2784 return err;
2785 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2786 sub_last->str_idx);
2787 if (BE (err != REG_NOERROR, 0))
2788 return err;
2789 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
c1baba0f 2790 return clean_state_log_if_needed (mctx, to_idx);
6291ee3c
UD
2791}
2792
2793/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2794 Search '(' if FL_OPEN, or search ')' otherwise.
2795 TODO: This function isn't efficient...
2796 Because there might be more than one nodes whose types are
2797 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2798 nodes.
2799 E.g. RE: (a){2} */
2800
2801static int
e2f55264
UD
2802find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2803 int subexp_idx, int type)
6291ee3c
UD
2804{
2805 int cls_idx;
2806 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2807 {
2808 int cls_node = nodes->elems[cls_idx];
fe9434bb 2809 const re_token_t *node = dfa->nodes + cls_node;
0ce7f49c 2810 if (node->type == type
6291ee3c
UD
2811 && node->opr.idx == subexp_idx)
2812 return cls_node;
2813 }
2814 return -1;
2815}
2816
2817/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2818 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2819 heavily reused.
2820 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2821
2822static reg_errcode_t
b41bd5bc 2823__attribute_warn_unused_result__
e2f55264
UD
2824check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2825 int top_str, int last_node, int last_str, int type)
6291ee3c 2826{
6efbd82c 2827 const re_dfa_t *const dfa = mctx->dfa;
c0c9615c 2828 reg_errcode_t err = REG_NOERROR;
6291ee3c
UD
2829 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2830 re_dfastate_t *cur_state = NULL;
2831 re_node_set *cur_nodes, next_nodes;
2832 re_dfastate_t **backup_state_log;
2833 unsigned int context;
2834
2835 subexp_num = dfa->nodes[top_node].opr.idx;
2836 /* Extend the buffer if we need. */
951d6408 2837 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
6291ee3c
UD
2838 {
2839 re_dfastate_t **new_array;
2840 int old_alloc = path->alloc;
2841 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2842 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
6efbd82c 2843 if (BE (new_array == NULL, 0))
951d6408
UD
2844 {
2845 path->alloc = old_alloc;
2846 return REG_ESPACE;
2847 }
6291ee3c
UD
2848 path->array = new_array;
2849 memset (new_array + old_alloc, '\0',
2850 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2851 }
2852
513bbb25 2853 str_idx = path->next_idx ?: top_str;
6291ee3c
UD
2854
2855 /* Temporary modify MCTX. */
2856 backup_state_log = mctx->state_log;
56b168be 2857 backup_cur_idx = mctx->input.cur_idx;
6291ee3c 2858 mctx->state_log = path->array;
56b168be 2859 mctx->input.cur_idx = str_idx;
6291ee3c
UD
2860
2861 /* Setup initial node set. */
56b168be 2862 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
6291ee3c
UD
2863 if (str_idx == top_str)
2864 {
2865 err = re_node_set_init_1 (&next_nodes, top_node);
2866 if (BE (err != REG_NOERROR, 0))
2867 return err;
0ce7f49c 2868 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
6291ee3c
UD
2869 if (BE (err != REG_NOERROR, 0))
2870 {
2871 re_node_set_free (&next_nodes);
2872 return err;
2873 }
2874 }
2875 else
2876 {
2877 cur_state = mctx->state_log[str_idx];
2878 if (cur_state && cur_state->has_backref)
2879 {
2880 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
6efbd82c 2881 if (BE (err != REG_NOERROR, 0))
6291ee3c
UD
2882 return err;
2883 }
2884 else
2885 re_node_set_init_empty (&next_nodes);
2886 }
2887 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2888 {
2889 if (next_nodes.nelem)
2890 {
d2c38eb3 2891 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
0ce7f49c 2892 subexp_num, type);
6efbd82c 2893 if (BE (err != REG_NOERROR, 0))
6291ee3c
UD
2894 {
2895 re_node_set_free (&next_nodes);
2896 return err;
2897 }
2898 }
2899 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2900 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2901 {
2902 re_node_set_free (&next_nodes);
2903 return err;
2904 }
2905 mctx->state_log[str_idx] = cur_state;
2906 }
2907
2908 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2909 {
2910 re_node_set_empty (&next_nodes);
2911 if (mctx->state_log[str_idx + 1])
2912 {
2913 err = re_node_set_merge (&next_nodes,
2914 &mctx->state_log[str_idx + 1]->nodes);
2915 if (BE (err != REG_NOERROR, 0))
2916 {
2917 re_node_set_free (&next_nodes);
2918 return err;
2919 }
2920 }
2921 if (cur_state)
2922 {
e3a87852 2923 err = check_arrival_add_next_nodes (mctx, str_idx,
6efbd82c
UD
2924 &cur_state->non_eps_nodes,
2925 &next_nodes);
6291ee3c
UD
2926 if (BE (err != REG_NOERROR, 0))
2927 {
2928 re_node_set_free (&next_nodes);
2929 return err;
2930 }
2931 }
2932 ++str_idx;
2933 if (next_nodes.nelem)
2934 {
0ce7f49c 2935 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
6291ee3c
UD
2936 if (BE (err != REG_NOERROR, 0))
2937 {
2938 re_node_set_free (&next_nodes);
2939 return err;
2940 }
d2c38eb3 2941 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
0ce7f49c 2942 subexp_num, type);
6efbd82c 2943 if (BE (err != REG_NOERROR, 0))
6291ee3c
UD
2944 {
2945 re_node_set_free (&next_nodes);
2946 return err;
2947 }
2948 }
56b168be 2949 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
6291ee3c
UD
2950 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2951 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2952 {
2953 re_node_set_free (&next_nodes);
2954 return err;
2955 }
2956 mctx->state_log[str_idx] = cur_state;
2957 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2958 }
2959 re_node_set_free (&next_nodes);
2960 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2961 : &mctx->state_log[last_str]->nodes);
2962 path->next_idx = str_idx;
2963
2964 /* Fix MCTX. */
2965 mctx->state_log = backup_state_log;
56b168be 2966 mctx->input.cur_idx = backup_cur_idx;
6291ee3c 2967
6291ee3c 2968 /* Then check the current node set has the node LAST_NODE. */
c0d5034e
UD
2969 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2970 return REG_NOERROR;
2971
2972 return REG_NOMATCH;
6291ee3c
UD
2973}
2974
2975/* Helper functions for check_arrival. */
2976
2977/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2978 to NEXT_NODES.
2979 TODO: This function is similar to the functions transit_state*(),
2980 however this function has many additional works.
2981 Can't we unify them? */
2982
2983static reg_errcode_t
b41bd5bc 2984__attribute_warn_unused_result__
e2f55264
UD
2985check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2986 re_node_set *cur_nodes, re_node_set *next_nodes)
6291ee3c 2987{
c42b4152 2988 const re_dfa_t *const dfa = mctx->dfa;
d2c38eb3 2989 int result;
6291ee3c 2990 int cur_idx;
c0c9615c 2991 reg_errcode_t err = REG_NOERROR;
6291ee3c
UD
2992 re_node_set union_set;
2993 re_node_set_init_empty (&union_set);
2994 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2995 {
2996 int naccepted = 0;
2997 int cur_node = cur_nodes->elems[cur_idx];
a334319f 2998#ifdef DEBUG
02f3550c 2999 re_token_type_t type = dfa->nodes[cur_node].type;
5cf1ec52
UD
3000 assert (!IS_EPSILON_NODE (type));
3001#endif
6291ee3c
UD
3002#ifdef RE_ENABLE_I18N
3003 /* If the node may accept `multi byte'. */
02f3550c 3004 if (dfa->nodes[cur_node].accept_mb)
6291ee3c 3005 {
56b168be 3006 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
6291ee3c
UD
3007 str_idx);
3008 if (naccepted > 1)
3009 {
3010 re_dfastate_t *dest_state;
3011 int next_node = dfa->nexts[cur_node];
3012 int next_idx = str_idx + naccepted;
3013 dest_state = mctx->state_log[next_idx];
3014 re_node_set_empty (&union_set);
3015 if (dest_state)
3016 {
3017 err = re_node_set_merge (&union_set, &dest_state->nodes);
3018 if (BE (err != REG_NOERROR, 0))
3019 {
3020 re_node_set_free (&union_set);
3021 return err;
3022 }
6291ee3c 3023 }
d2c38eb3
UD
3024 result = re_node_set_insert (&union_set, next_node);
3025 if (BE (result < 0, 0))
6291ee3c 3026 {
44777771
UD
3027 re_node_set_free (&union_set);
3028 return REG_ESPACE;
6291ee3c
UD
3029 }
3030 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3031 &union_set);
3032 if (BE (mctx->state_log[next_idx] == NULL
3033 && err != REG_NOERROR, 0))
3034 {
3035 re_node_set_free (&union_set);
3036 return err;
3037 }
3038 }
3039 }
3040#endif /* RE_ENABLE_I18N */
3041 if (naccepted
e3a87852 3042 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
6291ee3c 3043 {
d2c38eb3
UD
3044 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3045 if (BE (result < 0, 0))
6291ee3c
UD
3046 {
3047 re_node_set_free (&union_set);
3048 return REG_ESPACE;
3049 }
3050 }
3051 }
3052 re_node_set_free (&union_set);
3053 return REG_NOERROR;
3054}
3055
3056/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3057 CUR_NODES, however exclude the nodes which are:
3058 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3059 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3060*/
3061
3062static reg_errcode_t
e2f55264
UD
3063check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3064 int ex_subexp, int type)
6291ee3c
UD
3065{
3066 reg_errcode_t err;
3067 int idx, outside_node;
3068 re_node_set new_nodes;
3069#ifdef DEBUG
3070 assert (cur_nodes->nelem);
3071#endif
3072 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3073 if (BE (err != REG_NOERROR, 0))
3074 return err;
3075 /* Create a new node set NEW_NODES with the nodes which are epsilon
3076 closures of the node in CUR_NODES. */
3077
3078 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3079 {
3080 int cur_node = cur_nodes->elems[idx];
6efbd82c 3081 const re_node_set *eclosure = dfa->eclosures + cur_node;
0ce7f49c 3082 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
6291ee3c
UD
3083 if (outside_node == -1)
3084 {
3085 /* There are no problematic nodes, just merge them. */
3086 err = re_node_set_merge (&new_nodes, eclosure);
3087 if (BE (err != REG_NOERROR, 0))
3088 {
3089 re_node_set_free (&new_nodes);
3090 return err;
3091 }
3092 }
3093 else
3094 {
3095 /* There are problematic nodes, re-calculate incrementally. */
3096 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
0ce7f49c 3097 ex_subexp, type);
6291ee3c
UD
3098 if (BE (err != REG_NOERROR, 0))
3099 {
3100 re_node_set_free (&new_nodes);
3101 return err;
3102 }
3103 }
3104 }
3105 re_node_set_free (cur_nodes);
3106 *cur_nodes = new_nodes;
3107 return REG_NOERROR;
3108}
3109
3110/* Helper function for check_arrival_expand_ecl.
3111 Check incrementally the epsilon closure of TARGET, and if it isn't
3112 problematic append it to DST_NODES. */
3113
3114static reg_errcode_t
b41bd5bc 3115__attribute_warn_unused_result__
e2f55264
UD
3116check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3117 int target, int ex_subexp, int type)
6291ee3c 3118{
0ce7f49c 3119 int cur_node;
6291ee3c
UD
3120 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3121 {
3122 int err;
6291ee3c 3123
0ce7f49c 3124 if (dfa->nodes[cur_node].type == type
6291ee3c
UD
3125 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3126 {
0ce7f49c 3127 if (type == OP_CLOSE_SUBEXP)
6291ee3c
UD
3128 {
3129 err = re_node_set_insert (dst_nodes, cur_node);
3130 if (BE (err == -1, 0))
3131 return REG_ESPACE;
3132 }
3133 break;
3134 }
3135 err = re_node_set_insert (dst_nodes, cur_node);
3136 if (BE (err == -1, 0))
3137 return REG_ESPACE;
3138 if (dfa->edests[cur_node].nelem == 0)
3139 break;
3140 if (dfa->edests[cur_node].nelem == 2)
3141 {
3142 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3143 dfa->edests[cur_node].elems[1],
0ce7f49c 3144 ex_subexp, type);
6291ee3c
UD
3145 if (BE (err != REG_NOERROR, 0))
3146 return err;
3147 }
3148 cur_node = dfa->edests[cur_node].elems[0];
3149 }
3150 return REG_NOERROR;
3151}
3152
3153
3154/* For all the back references in the current state, calculate the
3155 destination of the back references by the appropriate entry
3156 in MCTX->BKREF_ENTS. */
3157
3158static reg_errcode_t
b41bd5bc 3159__attribute_warn_unused_result__
e2f55264
UD
3160expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3161 int cur_str, int subexp_num, int type)
6291ee3c 3162{
76b864c8 3163 const re_dfa_t *const dfa = mctx->dfa;
6291ee3c 3164 reg_errcode_t err;
e40a38b3
UD
3165 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3166 struct re_backref_cache_entry *ent;
6291ee3c 3167
e40a38b3
UD
3168 if (cache_idx_start == -1)
3169 return REG_NOERROR;
3170
3171 restart:
3172 ent = mctx->bkref_ents + cache_idx_start;
3173 do
6291ee3c
UD
3174 {
3175 int to_idx, next_node;
e40a38b3 3176
6291ee3c
UD
3177 /* Is this entry ENT is appropriate? */
3178 if (!re_node_set_contains (cur_nodes, ent->node))
3179 continue; /* No. */
3180
3181 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3182 /* Calculate the destination of the back reference, and append it
3183 to MCTX->STATE_LOG. */
3184 if (to_idx == cur_str)
3185 {
3186 /* The backreference did epsilon transit, we must re-check all the
3187 node in the current state. */
3188 re_node_set new_dests;
3189 reg_errcode_t err2, err3;
3190 next_node = dfa->edests[ent->node].elems[0];
3191 if (re_node_set_contains (cur_nodes, next_node))
3192 continue;
3193 err = re_node_set_init_1 (&new_dests, next_node);
0ce7f49c 3194 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
6291ee3c
UD
3195 err3 = re_node_set_merge (cur_nodes, &new_dests);
3196 re_node_set_free (&new_dests);
3197 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3198 || err3 != REG_NOERROR, 0))
3199 {
3200 err = (err != REG_NOERROR ? err
3201 : (err2 != REG_NOERROR ? err2 : err3));
3202 return err;
3203 }
3204 /* TODO: It is still inefficient... */
e40a38b3 3205 goto restart;
6291ee3c
UD
3206 }
3207 else
3208 {
3209 re_node_set union_set;
3210 next_node = dfa->nexts[ent->node];
3211 if (mctx->state_log[to_idx])
3212 {
3213 int ret;
3214 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3215 next_node))
3216 continue;
3217 err = re_node_set_init_copy (&union_set,
3218 &mctx->state_log[to_idx]->nodes);
3219 ret = re_node_set_insert (&union_set, next_node);
3220 if (BE (err != REG_NOERROR || ret < 0, 0))
3221 {
3222 re_node_set_free (&union_set);
3223 err = err != REG_NOERROR ? err : REG_ESPACE;
3224 return err;
3225 }
3226 }
3227 else
3228 {
3229 err = re_node_set_init_1 (&union_set, next_node);
3230 if (BE (err != REG_NOERROR, 0))
3231 return err;
3232 }
3233 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3234 re_node_set_free (&union_set);
3235 if (BE (mctx->state_log[to_idx] == NULL
3236 && err != REG_NOERROR, 0))
3237 return err;
3238 }
3239 }
e40a38b3 3240 while (ent++->more);
6291ee3c
UD
3241 return REG_NOERROR;
3242}
3243
3244/* Build transition table for the state.
ab4b89fe 3245 Return 1 if succeeded, otherwise return NULL. */
6291ee3c 3246
ab4b89fe 3247static int
e2f55264 3248build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
6291ee3c
UD
3249{
3250 reg_errcode_t err;
ab4b89fe 3251 int i, j, ch, need_word_trtable = 0;
2c05d33f
UD
3252 bitset_word_t elem, mask;
3253 bool dests_node_malloced = false;
3254 bool dest_states_malloced = false;
6291ee3c
UD
3255 int ndests; /* Number of the destination states from `state'. */
3256 re_dfastate_t **trtable;
3257 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3258 re_node_set follows, *dests_node;
2c05d33f
UD
3259 bitset_t *dests_ch;
3260 bitset_t acceptable;
3261
3262 struct dests_alloc
3263 {
3264 re_node_set dests_node[SBC_MAX];
3265 bitset_t dests_ch[SBC_MAX];
3266 } *dests_alloc;
3b0bdc72
UD
3267
3268 /* We build DFA states which corresponds to the destination nodes
3269 from `state'. `dests_node[i]' represents the nodes which i-th
3270 destination state contains, and `dests_ch[i]' represents the
3271 characters which i-th destination state accepts. */
2c05d33f
UD
3272 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3273 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
05dab910 3274 else
05dab910 3275 {
2c05d33f
UD
3276 dests_alloc = re_malloc (struct dests_alloc, 1);
3277 if (BE (dests_alloc == NULL, 0))
ab4b89fe 3278 return 0;
2c05d33f 3279 dests_node_malloced = true;
05dab910 3280 }
2c05d33f
UD
3281 dests_node = dests_alloc->dests_node;
3282 dests_ch = dests_alloc->dests_ch;
3b0bdc72
UD
3283
3284 /* Initialize transiton table. */
ab4b89fe 3285 state->word_trtable = state->trtable = NULL;
3b0bdc72
UD
3286
3287 /* At first, group all nodes belonging to `state' into several
3288 destinations. */
56b168be 3289 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
bc15410e 3290 if (BE (ndests <= 0, 0))
3b0bdc72 3291 {
05dab910 3292 if (dests_node_malloced)
2c05d33f 3293 free (dests_alloc);
ab4b89fe 3294 /* Return 0 in case of an error, 1 otherwise. */
05dab910 3295 if (ndests == 0)
c13c99fa 3296 {
3ce12656 3297 state->trtable = (re_dfastate_t **)
ab4b89fe 3298 calloc (sizeof (re_dfastate_t *), SBC_MAX);
2543fef2
JM
3299 if (BE (state->trtable == NULL, 0))
3300 return 0;
ab4b89fe 3301 return 1;
c13c99fa 3302 }
ab4b89fe 3303 return 0;
3b0bdc72
UD
3304 }
3305
a9388965 3306 err = re_node_set_alloc (&follows, ndests + 1);
05dab910
RM
3307 if (BE (err != REG_NOERROR, 0))
3308 goto out_free;
daa84549
PE
3309
3310 /* Avoid arithmetic overflow in size calculation. */
3311 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3312 / (3 * sizeof (re_dfastate_t *)))
3313 < ndests),
3314 0))
3315 goto out_free;
05dab910 3316
2c05d33f 3317 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
05dab910
RM
3318 + ndests * 3 * sizeof (re_dfastate_t *)))
3319 dest_states = (re_dfastate_t **)
ab4b89fe 3320 alloca (ndests * 3 * sizeof (re_dfastate_t *));
05dab910 3321 else
05dab910
RM
3322 {
3323 dest_states = (re_dfastate_t **)
ab4b89fe 3324 malloc (ndests * 3 * sizeof (re_dfastate_t *));
05dab910
RM
3325 if (BE (dest_states == NULL, 0))
3326 {
3327out_free:
3328 if (dest_states_malloced)
3329 free (dest_states);
3330 re_node_set_free (&follows);
3331 for (i = 0; i < ndests; ++i)
3332 re_node_set_free (dests_node + i);
05dab910 3333 if (dests_node_malloced)
2c05d33f 3334 free (dests_alloc);
ab4b89fe 3335 return 0;
05dab910 3336 }
2c05d33f 3337 dest_states_malloced = true;
05dab910
RM
3338 }
3339 dest_states_word = dest_states + ndests;
3340 dest_states_nl = dest_states_word + ndests;
3341 bitset_empty (acceptable);
a9388965 3342
3b0bdc72
UD
3343 /* Then build the states for all destinations. */
3344 for (i = 0; i < ndests; ++i)
3345 {
3346 int next_node;
3347 re_node_set_empty (&follows);
3348 /* Merge the follows of this destination states. */
3349 for (j = 0; j < dests_node[i].nelem; ++j)
15a7d175
UD
3350 {
3351 next_node = dfa->nexts[dests_node[i].elems[j]];
3352 if (next_node != -1)
3353 {
3354 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3355 if (BE (err != REG_NOERROR, 0))
05dab910 3356 goto out_free;
15a7d175
UD
3357 }
3358 }
a9388965 3359 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
bc15410e 3360 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
15a7d175 3361 goto out_free;
3b0bdc72 3362 /* If the new state has context constraint,
15a7d175 3363 build appropriate states for these contexts. */
3b0bdc72 3364 if (dest_states[i]->has_constraint)
15a7d175
UD
3365 {
3366 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3367 CONTEXT_WORD);
3368 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3369 goto out_free;
3ce12656 3370
ab4b89fe
UD
3371 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3372 need_word_trtable = 1;
3ce12656 3373
15a7d175
UD
3374 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3375 CONTEXT_NEWLINE);
3376 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3377 goto out_free;
3ce12656 3378 }
3b0bdc72 3379 else
15a7d175
UD
3380 {
3381 dest_states_word[i] = dest_states[i];
3382 dest_states_nl[i] = dest_states[i];
3383 }
3b0bdc72
UD
3384 bitset_merge (acceptable, dests_ch[i]);
3385 }
3386
ab4b89fe 3387 if (!BE (need_word_trtable, 0))
3ce12656
UD
3388 {
3389 /* We don't care about whether the following character is a word
3390 character, or we are in a single-byte character set so we can
3391 discern by looking at the character code: allocate a
3392 256-entry transition table. */
ab4b89fe
UD
3393 trtable = state->trtable =
3394 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3ce12656
UD
3395 if (BE (trtable == NULL, 0))
3396 goto out_free;
3397
3398 /* For all characters ch...: */
2c05d33f
UD
3399 for (i = 0; i < BITSET_WORDS; ++i)
3400 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3ce12656
UD
3401 elem;
3402 mask <<= 1, elem >>= 1, ++ch)
3403 if (BE (elem & 1, 0))
3404 {
3405 /* There must be exactly one destination which accepts
3406 character ch. See group_nodes_into_DFAstates. */
3407 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3408 ;
6b6557e8 3409
3ce12656 3410 /* j-th destination accepts the word character ch. */
56b168be 3411 if (dfa->word_char[i] & mask)
3ce12656
UD
3412 trtable[ch] = dest_states_word[j];
3413 else
3414 trtable[ch] = dest_states[j];
3415 }
3416 }
3417 else
3418 {
3419 /* We care about whether the following character is a word
3420 character, and we are in a multi-byte character set: discern
3421 by looking at the character code: build two 256-entry
3422 transition tables, one starting at trtable[0] and one
3423 starting at trtable[SBC_MAX]. */
ab4b89fe
UD
3424 trtable = state->word_trtable =
3425 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3ce12656
UD
3426 if (BE (trtable == NULL, 0))
3427 goto out_free;
6b6557e8 3428
3ce12656 3429 /* For all characters ch...: */
2c05d33f
UD
3430 for (i = 0; i < BITSET_WORDS; ++i)
3431 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3ce12656
UD
3432 elem;
3433 mask <<= 1, elem >>= 1, ++ch)
3434 if (BE (elem & 1, 0))
3435 {
3436 /* There must be exactly one destination which accepts
3437 character ch. See group_nodes_into_DFAstates. */
3438 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3439 ;
6b6557e8 3440
3ce12656
UD
3441 /* j-th destination accepts the word character ch. */
3442 trtable[ch] = dest_states[j];
3443 trtable[ch + SBC_MAX] = dest_states_word[j];
3444 }
3445 }
6b6557e8 3446
3b0bdc72 3447 /* new line */
c202c2c5
UD
3448 if (bitset_contain (acceptable, NEWLINE_CHAR))
3449 {
3450 /* The current state accepts newline character. */
3ce12656
UD
3451 for (j = 0; j < ndests; ++j)
3452 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
15a7d175
UD
3453 {
3454 /* k-th destination accepts newline character. */
3ce12656 3455 trtable[NEWLINE_CHAR] = dest_states_nl[j];
ab4b89fe 3456 if (need_word_trtable)
3ce12656 3457 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
15a7d175
UD
3458 /* There must be only one destination which accepts
3459 newline. See group_nodes_into_DFAstates. */
3460 break;
3461 }
c202c2c5 3462 }
3b0bdc72 3463
05dab910
RM
3464 if (dest_states_malloced)
3465 free (dest_states);
3b0bdc72
UD
3466
3467 re_node_set_free (&follows);
3468 for (i = 0; i < ndests; ++i)
3469 re_node_set_free (dests_node + i);
3470
05dab910 3471 if (dests_node_malloced)
2c05d33f 3472 free (dests_alloc);
3b0bdc72 3473
ab4b89fe 3474 return 1;
3b0bdc72
UD
3475}
3476
3477/* Group all nodes belonging to STATE into several destinations.
3478 Then for all destinations, set the nodes belonging to the destination
3479 to DESTS_NODE[i] and set the characters accepted by the destination
3480 to DEST_CH[i]. This function return the number of destinations. */
3481
3482static int
e2f55264
UD
3483group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3484 re_node_set *dests_node, bitset_t *dests_ch)
3b0bdc72 3485{
a9388965 3486 reg_errcode_t err;
d2c38eb3 3487 int result;
3b0bdc72
UD
3488 int i, j, k;
3489 int ndests; /* Number of the destinations from `state'. */
2c05d33f 3490 bitset_t accepts; /* Characters a node can accept. */
3b0bdc72
UD
3491 const re_node_set *cur_nodes = &state->nodes;
3492 bitset_empty (accepts);
3493 ndests = 0;
3494
3495 /* For all the nodes belonging to `state', */
3496 for (i = 0; i < cur_nodes->nelem; ++i)
3497 {
3b0bdc72
UD
3498 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3499 re_token_type_t type = node->type;
485d775d 3500 unsigned int constraint = node->constraint;
3b0bdc72
UD
3501
3502 /* Enumerate all single byte character this node can accept. */
3503 if (type == CHARACTER)
15a7d175 3504 bitset_set (accepts, node->opr.c);
3b0bdc72 3505 else if (type == SIMPLE_BRACKET)
15a7d175
UD
3506 {
3507 bitset_merge (accepts, node->opr.sbcset);
3508 }
3b0bdc72 3509 else if (type == OP_PERIOD)
15a7d175 3510 {
65e6becf
UD
3511#ifdef RE_ENABLE_I18N
3512 if (dfa->mb_cur_max > 1)
3513 bitset_merge (accepts, dfa->sb_char);
3514 else
0ce7f49c 3515#endif
65e6becf 3516 bitset_set_all (accepts);
56b168be 3517 if (!(dfa->syntax & RE_DOT_NEWLINE))
15a7d175 3518 bitset_clear (accepts, '\n');
56b168be 3519 if (dfa->syntax & RE_DOT_NOT_NULL)
15a7d175
UD
3520 bitset_clear (accepts, '\0');
3521 }
c0d5034e 3522#ifdef RE_ENABLE_I18N
ad7f28c2 3523 else if (type == OP_UTF8_PERIOD)
2da42bc0 3524 {
2c05d33f 3525 memset (accepts, '\xff', sizeof (bitset_t) / 2);
56b168be 3526 if (!(dfa->syntax & RE_DOT_NEWLINE))
ad7f28c2 3527 bitset_clear (accepts, '\n');
56b168be 3528 if (dfa->syntax & RE_DOT_NOT_NULL)
ad7f28c2 3529 bitset_clear (accepts, '\0');
2da42bc0 3530 }
c0d5034e 3531#endif
3b0bdc72 3532 else
15a7d175 3533 continue;
3b0bdc72
UD
3534
3535 /* Check the `accepts' and sift the characters which are not
15a7d175 3536 match it the context. */
3b0bdc72 3537 if (constraint)
15a7d175 3538 {
15a7d175
UD
3539 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3540 {
2c05d33f 3541 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
15a7d175
UD
3542 bitset_empty (accepts);
3543 if (accepts_newline)
3544 bitset_set (accepts, NEWLINE_CHAR);
3545 else
3546 continue;
3547 }
66b110e8
UD
3548 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3549 {
3550 bitset_empty (accepts);
3551 continue;
3552 }
c13c99fa 3553
66b110e8 3554 if (constraint & NEXT_WORD_CONSTRAINT)
65e6becf 3555 {
2c05d33f 3556 bitset_word_t any_set = 0;
1cef7b3c
UD
3557 if (type == CHARACTER && !node->word_char)
3558 {
3559 bitset_empty (accepts);
3560 continue;
3561 }
65e6becf
UD
3562#ifdef RE_ENABLE_I18N
3563 if (dfa->mb_cur_max > 1)
2c05d33f 3564 for (j = 0; j < BITSET_WORDS; ++j)
457beec8 3565 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
65e6becf
UD
3566 else
3567#endif
2c05d33f 3568 for (j = 0; j < BITSET_WORDS; ++j)
457beec8
UD
3569 any_set |= (accepts[j] &= dfa->word_char[j]);
3570 if (!any_set)
3571 continue;
65e6becf 3572 }
66b110e8 3573 if (constraint & NEXT_NOTWORD_CONSTRAINT)
65e6becf 3574 {
2c05d33f 3575 bitset_word_t any_set = 0;
1cef7b3c
UD
3576 if (type == CHARACTER && node->word_char)
3577 {
3578 bitset_empty (accepts);
3579 continue;
3580 }
65e6becf
UD
3581#ifdef RE_ENABLE_I18N
3582 if (dfa->mb_cur_max > 1)
2c05d33f 3583 for (j = 0; j < BITSET_WORDS; ++j)
457beec8 3584 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
65e6becf
UD
3585 else
3586#endif
2c05d33f 3587 for (j = 0; j < BITSET_WORDS; ++j)
457beec8
UD
3588 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3589 if (!any_set)
3590 continue;
65e6becf 3591 }
15a7d175 3592 }
3b0bdc72
UD
3593
3594 /* Then divide `accepts' into DFA states, or create a new
457beec8 3595 state. Above, we make sure that accepts is not empty. */
3b0bdc72 3596 for (j = 0; j < ndests; ++j)
15a7d175 3597 {
2c05d33f
UD
3598 bitset_t intersec; /* Intersection sets, see below. */
3599 bitset_t remains;
15a7d175 3600 /* Flags, see below. */
2c05d33f 3601 bitset_word_t has_intersec, not_subset, not_consumed;
15a7d175
UD
3602
3603 /* Optimization, skip if this state doesn't accept the character. */
3604 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3605 continue;
3606
3607 /* Enumerate the intersection set of this state and `accepts'. */
3608 has_intersec = 0;
2c05d33f 3609 for (k = 0; k < BITSET_WORDS; ++k)
15a7d175
UD
3610 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3611 /* And skip if the intersection set is empty. */
3612 if (!has_intersec)
3613 continue;
3614
3615 /* Then check if this state is a subset of `accepts'. */
3616 not_subset = not_consumed = 0;
2c05d33f 3617 for (k = 0; k < BITSET_WORDS; ++k)
15a7d175
UD
3618 {
3619 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3620 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3621 }
3622
3623 /* If this state isn't a subset of `accepts', create a
3624 new group state, which has the `remains'. */
3625 if (not_subset)
3626 {
3627 bitset_copy (dests_ch[ndests], remains);
3628 bitset_copy (dests_ch[j], intersec);
3629 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3630 if (BE (err != REG_NOERROR, 0))
3631 goto error_return;
3632 ++ndests;
3633 }
3634
3635 /* Put the position in the current group. */
d2c38eb3
UD
3636 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3637 if (BE (result < 0, 0))
15a7d175
UD
3638 goto error_return;
3639
3640 /* If all characters are consumed, go to next node. */
3641 if (!not_consumed)
3642 break;
3643 }
3b0bdc72
UD
3644 /* Some characters remain, create a new group. */
3645 if (j == ndests)
15a7d175
UD
3646 {
3647 bitset_copy (dests_ch[ndests], accepts);
3648 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3649 if (BE (err != REG_NOERROR, 0))
3650 goto error_return;
3651 ++ndests;
3652 bitset_empty (accepts);
3653 }
3b0bdc72
UD
3654 }
3655 return ndests;
1b2c2628
UD
3656 error_return:
3657 for (j = 0; j < ndests; ++j)
3658 re_node_set_free (dests_node + j);
3659 return -1;
3b0bdc72
UD
3660}
3661
434d3784
UD
3662#ifdef RE_ENABLE_I18N
3663/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3664 Return the number of the bytes the node accepts.
3665 STR_IDX is the current index of the input string.
3666
3667 This function handles the nodes which can accept one character, or
3668 one collating element like '.', '[a-z]', opposite to the other nodes
3669 can only accept one byte. */
3b0bdc72 3670
8c0ab919
RM
3671# ifdef _LIBC
3672# include <locale/weight.h>
3673# endif
3674
3b0bdc72 3675static int
e2f55264
UD
3676check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3677 const re_string_t *input, int str_idx)
3b0bdc72 3678{
3b0bdc72 3679 const re_token_t *node = dfa->nodes + node_idx;
ad7f28c2 3680 int char_len, elem_len;
434d3784 3681 int i;
ad7f28c2
UD
3682
3683 if (BE (node->type == OP_UTF8_PERIOD, 0))
3684 {
3685 unsigned char c = re_string_byte_at (input, str_idx), d;
3686 if (BE (c < 0xc2, 1))
3687 return 0;
3688
3689 if (str_idx + 2 > input->len)
3690 return 0;
3691
3692 d = re_string_byte_at (input, str_idx + 1);
3693 if (c < 0xe0)
3694 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3695 else if (c < 0xf0)
3696 {
3697 char_len = 3;
3698 if (c == 0xe0 && d < 0xa0)
3699 return 0;
3700 }
3701 else if (c < 0xf8)
3702 {
3703 char_len = 4;
3704 if (c == 0xf0 && d < 0x90)
3705 return 0;
3706 }
3707 else if (c < 0xfc)
3708 {
3709 char_len = 5;
3710 if (c == 0xf8 && d < 0x88)
3711 return 0;
3712 }
3713 else if (c < 0xfe)
3714 {
3715 char_len = 6;
3716 if (c == 0xfc && d < 0x84)
3717 return 0;
3718 }
3719 else
3720 return 0;
3721
3722 if (str_idx + char_len > input->len)
3723 return 0;
3724
3725 for (i = 1; i < char_len; ++i)
3726 {
3727 d = re_string_byte_at (input, str_idx + i);
3728 if (d < 0x80 || d > 0xbf)
3729 return 0;
3730 }
3731 return char_len;
3732 }
3733
3734 char_len = re_string_char_size_at (input, str_idx);
3b0bdc72
UD
3735 if (node->type == OP_PERIOD)
3736 {
ad7f28c2 3737 if (char_len <= 1)
2da42bc0 3738 return 0;
ad7f28c2
UD
3739 /* FIXME: I don't think this if is needed, as both '\n'
3740 and '\0' are char_len == 1. */
434d3784 3741 /* '.' accepts any one character except the following two cases. */
56b168be 3742 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
15a7d175 3743 re_string_byte_at (input, str_idx) == '\n') ||
56b168be 3744 ((dfa->syntax & RE_DOT_NOT_NULL) &&
15a7d175
UD
3745 re_string_byte_at (input, str_idx) == '\0'))
3746 return 0;
3b0bdc72
UD
3747 return char_len;
3748 }
ad7f28c2
UD
3749
3750 elem_len = re_string_elem_size_at (input, str_idx);
6c2a04a7 3751 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
ad7f28c2
UD
3752 return 0;
3753
3754 if (node->type == COMPLEX_BRACKET)
3b0bdc72
UD
3755 {
3756 const re_charset_t *cset = node->opr.mbcset;
434d3784 3757# ifdef _LIBC
1c99f950
UD
3758 const unsigned char *pin
3759 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
5f93cd52
UD
3760 int j;
3761 uint32_t nrules;
434d3784
UD
3762# endif /* _LIBC */
3763 int match_len = 0;
3764 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
15a7d175 3765 ? re_string_wchar_at (input, str_idx) : 0);
434d3784
UD
3766
3767 /* match with multibyte character? */
3768 for (i = 0; i < cset->nmbchars; ++i)
15a7d175
UD
3769 if (wc == cset->mbchars[i])
3770 {
3771 match_len = char_len;
3772 goto check_node_accept_bytes_match;
3773 }
434d3784
UD
3774 /* match with character_class? */
3775 for (i = 0; i < cset->nchar_classes; ++i)
15a7d175
UD
3776 {
3777 wctype_t wt = cset->char_classes[i];
3778 if (__iswctype (wc, wt))
3779 {
3780 match_len = char_len;
3781 goto check_node_accept_bytes_match;
3782 }
3783 }
434d3784
UD
3784
3785# ifdef _LIBC
5f93cd52 3786 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3b0bdc72 3787 if (nrules != 0)
15a7d175
UD
3788 {
3789 unsigned int in_collseq = 0;
3790 const int32_t *table, *indirect;
3791 const unsigned char *weights, *extra;
3792 const char *collseqwc;
3b0bdc72 3793
15a7d175
UD
3794 /* match with collating_symbol? */
3795 if (cset->ncoll_syms)
3796 extra = (const unsigned char *)
3797 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3798 for (i = 0; i < cset->ncoll_syms; ++i)
3799 {
3800 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3801 /* Compare the length of input collating element and
3802 the length of current collating element. */
3803 if (*coll_sym != elem_len)
3804 continue;
3805 /* Compare each bytes. */
3806 for (j = 0; j < *coll_sym; j++)
3807 if (pin[j] != coll_sym[1 + j])
3808 break;
3809 if (j == *coll_sym)
3810 {
3811 /* Match if every bytes is equal. */
3812 match_len = j;
3813 goto check_node_accept_bytes_match;
3814 }
3815 }
3816
3817 if (cset->nranges)
3818 {
3819 if (elem_len <= char_len)
3820 {
3821 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
25337753 3822 in_collseq = __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
3823 }
3824 else
3825 in_collseq = find_collation_sequence_value (pin, elem_len);
3826 }
3827 /* match with range expression? */
3828 for (i = 0; i < cset->nranges; ++i)
3829 if (cset->range_starts[i] <= in_collseq
3830 && in_collseq <= cset->range_ends[i])
3831 {
3832 match_len = elem_len;
3833 goto check_node_accept_bytes_match;
3834 }
3835
3836 /* match with equivalence_class? */
3837 if (cset->nequiv_classes)
3838 {
3839 const unsigned char *cp = pin;
3840 table = (const int32_t *)
3841 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3842 weights = (const unsigned char *)
3843 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3844 extra = (const unsigned char *)
3845 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3846 indirect = (const int32_t *)
3847 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
8c0ab919 3848 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
15a7d175
UD
3849 if (idx > 0)
3850 for (i = 0; i < cset->nequiv_classes; ++i)
3851 {
3852 int32_t equiv_class_idx = cset->equiv_classes[i];
b7d1c5fa
UD
3853 size_t weight_len = weights[idx & 0xffffff];
3854 if (weight_len == weights[equiv_class_idx & 0xffffff]
3855 && (idx >> 24) == (equiv_class_idx >> 24))
15a7d175
UD
3856 {
3857 int cnt = 0;
b7d1c5fa
UD
3858
3859 idx &= 0xffffff;
3860 equiv_class_idx &= 0xffffff;
3861
15a7d175
UD
3862 while (cnt <= weight_len
3863 && (weights[equiv_class_idx + 1 + cnt]
3864 == weights[idx + 1 + cnt]))
3865 ++cnt;
3866 if (cnt > weight_len)
3867 {
3868 match_len = elem_len;
3869 goto check_node_accept_bytes_match;
3870 }
3871 }
3872 }
3873 }
3874 }
434d3784
UD
3875 else
3876# endif /* _LIBC */
15a7d175
UD
3877 {
3878 /* match with range expression? */
92b27c74 3879#if __GNUC__ >= 2
15a7d175 3880 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
92b27c74 3881#else
15a7d175
UD
3882 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3883 cmp_buf[2] = wc;
92b27c74 3884#endif
15a7d175
UD
3885 for (i = 0; i < cset->nranges; ++i)
3886 {
3887 cmp_buf[0] = cset->range_starts[i];
3888 cmp_buf[4] = cset->range_ends[i];
2f44ee08
JM
3889 if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
3890 && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
15a7d175
UD
3891 {
3892 match_len = char_len;
3893 goto check_node_accept_bytes_match;
3894 }
3895 }
3896 }
434d3784
UD
3897 check_node_accept_bytes_match:
3898 if (!cset->non_match)
15a7d175 3899 return match_len;
434d3784 3900 else
15a7d175
UD
3901 {
3902 if (match_len > 0)
3903 return 0;
3904 else
3905 return (elem_len > char_len) ? elem_len : char_len;
3906 }
3b0bdc72
UD
3907 }
3908 return 0;
3909}
3910
434d3784 3911# ifdef _LIBC
3b0bdc72 3912static unsigned int
e2f55264 3913find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3b0bdc72
UD
3914{
3915 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3916 if (nrules == 0)
3917 {
3918 if (mbs_len == 1)
15a7d175
UD
3919 {
3920 /* No valid character. Match it as a single byte character. */
3921 const unsigned char *collseq = (const unsigned char *)
3922 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3923 return collseq[mbs[0]];
3924 }
3b0bdc72
UD
3925 return UINT_MAX;
3926 }
3927 else
3928 {
3929 int32_t idx;
3930 const unsigned char *extra = (const unsigned char *)
15a7d175 3931 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6c2a04a7
UD
3932 int32_t extrasize = (const unsigned char *)
3933 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3b0bdc72 3934
6c2a04a7 3935 for (idx = 0; idx < extrasize;)
15a7d175
UD
3936 {
3937 int mbs_cnt, found = 0;
3938 int32_t elem_mbs_len;
3939 /* Skip the name of collating element name. */
3940 idx = idx + extra[idx] + 1;
3941 elem_mbs_len = extra[idx++];
3942 if (mbs_len == elem_mbs_len)
3943 {
3944 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3945 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3946 break;
3947 if (mbs_cnt == elem_mbs_len)
3948 /* Found the entry. */
3949 found = 1;
3950 }
3951 /* Skip the byte sequence of the collating element. */
3952 idx += elem_mbs_len;
3953 /* Adjust for the alignment. */
3954 idx = (idx + 3) & ~3;
3955 /* Skip the collation sequence value. */
3956 idx += sizeof (uint32_t);
3957 /* Skip the wide char sequence of the collating element. */
d84acf38 3958 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
15a7d175
UD
3959 /* If we found the entry, return the sequence value. */
3960 if (found)
3961 return *(uint32_t *) (extra + idx);
3962 /* Skip the collation sequence value. */
3963 idx += sizeof (uint32_t);
3964 }
6c2a04a7 3965 return UINT_MAX;
3b0bdc72
UD
3966 }
3967}
434d3784
UD
3968# endif /* _LIBC */
3969#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3970
3971/* Check whether the node accepts the byte which is IDX-th
3972 byte of the INPUT. */
3973
3974static int
e2f55264
UD
3975check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3976 int idx)
3b0bdc72 3977{
3b0bdc72 3978 unsigned char ch;
56b168be 3979 ch = re_string_byte_at (&mctx->input, idx);
ad7f28c2
UD
3980 switch (node->type)
3981 {
3982 case CHARACTER:
5cf1ec52 3983 if (node->opr.c != ch)
2da42bc0 3984 return 0;
5cf1ec52
UD
3985 break;
3986
ad7f28c2 3987 case SIMPLE_BRACKET:
5cf1ec52 3988 if (!bitset_contain (node->opr.sbcset, ch))
2da42bc0 3989 return 0;
5cf1ec52
UD
3990 break;
3991
c0d5034e 3992#ifdef RE_ENABLE_I18N
ad7f28c2
UD
3993 case OP_UTF8_PERIOD:
3994 if (ch >= 0x80)
2da42bc0 3995 return 0;
ad7f28c2 3996 /* FALLTHROUGH */
c0d5034e 3997#endif
ad7f28c2 3998 case OP_PERIOD:
5cf1ec52
UD
3999 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4000 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4001 return 0;
4002 break;
4003
ad7f28c2
UD
4004 default:
4005 return 0;
4006 }
5cf1ec52
UD
4007
4008 if (node->constraint)
4009 {
4010 /* The node has constraints. Check whether the current context
4011 satisfies the constraints. */
4012 unsigned int context = re_string_context_at (&mctx->input, idx,
4013 mctx->eflags);
4014 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4015 return 0;
4016 }
4017
4018 return 1;
3b0bdc72 4019}
612546c6
UD
4020
4021/* Extend the buffers, if the buffers have run out. */
4022
4023static reg_errcode_t
b41bd5bc 4024__attribute_warn_unused_result__
a445af0b 4025extend_buffers (re_match_context_t *mctx, int min_len)
612546c6
UD
4026{
4027 reg_errcode_t ret;
56b168be 4028 re_string_t *pstr = &mctx->input;
612546c6 4029
aef699dc
PE
4030 /* Avoid overflow. */
4031 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4032 return REG_ESPACE;
4033
a445af0b
AS
4034 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4035 ret = re_string_realloc_buffers (pstr,
4036 MAX (min_len,
4037 MIN (pstr->len, pstr->bufs_len * 2)));
612546c6
UD
4038 if (BE (ret != REG_NOERROR, 0))
4039 return ret;
4040
4041 if (mctx->state_log != NULL)
4042 {
4043 /* And double the length of state_log. */
951d6408
UD
4044 /* XXX We have no indication of the size of this buffer. If this
4045 allocation fail we have no indication that the state_log array
4046 does not have the right size. */
4047 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4048 pstr->bufs_len + 1);
1b2c2628 4049 if (BE (new_array == NULL, 0))
15a7d175 4050 return REG_ESPACE;
1b2c2628 4051 mctx->state_log = new_array;
612546c6
UD
4052 }
4053
4054 /* Then reconstruct the buffers. */
4055 if (pstr->icase)
4056 {
4057#ifdef RE_ENABLE_I18N
3c0fb574 4058 if (pstr->mb_cur_max > 1)
bb3f4825
UD
4059 {
4060 ret = build_wcs_upper_buffer (pstr);
4061 if (BE (ret != REG_NOERROR, 0))
4062 return ret;
4063 }
612546c6
UD
4064 else
4065#endif /* RE_ENABLE_I18N */
15a7d175 4066 build_upper_buffer (pstr);
612546c6
UD
4067 }
4068 else
4069 {
4070#ifdef RE_ENABLE_I18N
3c0fb574 4071 if (pstr->mb_cur_max > 1)
15a7d175 4072 build_wcs_buffer (pstr);
612546c6
UD
4073 else
4074#endif /* RE_ENABLE_I18N */
15a7d175
UD
4075 {
4076 if (pstr->trans != NULL)
4077 re_string_translate_buffer (pstr);
15a7d175 4078 }
612546c6
UD
4079 }
4080 return REG_NOERROR;
4081}
4082
3b0bdc72
UD
4083\f
4084/* Functions for matching context. */
4085
6291ee3c
UD
4086/* Initialize MCTX. */
4087
a9388965 4088static reg_errcode_t
b41bd5bc 4089__attribute_warn_unused_result__
e2f55264 4090match_ctx_init (re_match_context_t *mctx, int eflags, int n)
3b0bdc72
UD
4091{
4092 mctx->eflags = eflags;
612546c6 4093 mctx->match_last = -1;
3b0bdc72 4094 if (n > 0)
a9388965
UD
4095 {
4096 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
6291ee3c
UD
4097 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4098 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
15a7d175 4099 return REG_ESPACE;
a9388965 4100 }
56b168be
UD
4101 /* Already zero-ed by the caller.
4102 else
4103 mctx->bkref_ents = NULL;
4104 mctx->nbkref_ents = 0;
4105 mctx->nsub_tops = 0; */
3b0bdc72 4106 mctx->abkref_ents = n;
6291ee3c 4107 mctx->max_mb_elem_len = 1;
6291ee3c 4108 mctx->asub_tops = n;
a9388965 4109 return REG_NOERROR;
3b0bdc72
UD
4110}
4111
6291ee3c
UD
4112/* Clean the entries which depend on the current input in MCTX.
4113 This function must be invoked when the matcher changes the start index
4114 of the input, or changes the input string. */
4115
4116static void
e2f55264 4117match_ctx_clean (re_match_context_t *mctx)
6291ee3c
UD
4118{
4119 int st_idx;
4120 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4121 {
4122 int sl_idx;
4123 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4124 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4125 {
4126 re_sub_match_last_t *last = top->lasts[sl_idx];
4127 re_free (last->path.array);
4128 re_free (last);
4129 }
4130 re_free (top->lasts);
4131 if (top->path)
4132 {
4133 re_free (top->path->array);
4134 re_free (top->path);
4135 }
4136 free (top);
4137 }
cb265fec
UD
4138
4139 mctx->nsub_tops = 0;
4140 mctx->nbkref_ents = 0;
4141}
4142
4143/* Free all the memory associated with MCTX. */
4144
4145static void
e2f55264 4146match_ctx_free (re_match_context_t *mctx)
cb265fec
UD
4147{
4148 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4149 match_ctx_clean (mctx);
4150 re_free (mctx->sub_tops);
4151 re_free (mctx->bkref_ents);
6291ee3c
UD
4152}
4153
4154/* Add a new backreference entry to MCTX.
4155 Note that we assume that caller never call this function with duplicate
4156 entry, and call with STR_IDX which isn't smaller than any existing entry.
4157*/
3b0bdc72 4158
a9388965 4159static reg_errcode_t
b41bd5bc 4160__attribute_warn_unused_result__
e2f55264
UD
4161match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4162 int to)
3b0bdc72
UD
4163{
4164 if (mctx->nbkref_ents >= mctx->abkref_ents)
4165 {
1b2c2628
UD
4166 struct re_backref_cache_entry* new_entry;
4167 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
15a7d175 4168 mctx->abkref_ents * 2);
1b2c2628 4169 if (BE (new_entry == NULL, 0))
15a7d175
UD
4170 {
4171 re_free (mctx->bkref_ents);
4172 return REG_ESPACE;
4173 }
1b2c2628 4174 mctx->bkref_ents = new_entry;
3b0bdc72 4175 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
15a7d175 4176 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3b0bdc72
UD
4177 mctx->abkref_ents *= 2;
4178 }
e40a38b3
UD
4179 if (mctx->nbkref_ents > 0
4180 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4181 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4182
3b0bdc72 4183 mctx->bkref_ents[mctx->nbkref_ents].node = node;
0742e48e
UD
4184 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4185 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
e40a38b3 4186 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
7db61208
UD
4187
4188 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4189 If bit N is clear, means that this entry won't epsilon-transition to
4190 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4191 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4192 such node.
4193
4194 A backreference does not epsilon-transition unless it is empty, so set
4195 to all zeros if FROM != TO. */
4196 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4197 = (from == to ? ~0 : 0);
4198
e40a38b3 4199 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
0742e48e
UD
4200 if (mctx->max_mb_elem_len < to - from)
4201 mctx->max_mb_elem_len = to - from;
a9388965 4202 return REG_NOERROR;
3b0bdc72 4203}
0742e48e 4204
e40a38b3
UD
4205/* Search for the first entry which has the same str_idx, or -1 if none is
4206 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
6291ee3c
UD
4207
4208static int
e2f55264 4209search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
6291ee3c 4210{
e40a38b3
UD
4211 int left, right, mid, last;
4212 last = right = mctx->nbkref_ents;
6291ee3c
UD
4213 for (left = 0; left < right;)
4214 {
4215 mid = (left + right) / 2;
4216 if (mctx->bkref_ents[mid].str_idx < str_idx)
4217 left = mid + 1;
4218 else
4219 right = mid;
4220 }
e40a38b3
UD
4221 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4222 return left;
4223 else
4224 return -1;
6291ee3c
UD
4225}
4226
6291ee3c
UD
4227/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4228 at STR_IDX. */
4229
4230static reg_errcode_t
b41bd5bc 4231__attribute_warn_unused_result__
e2f55264 4232match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
6291ee3c
UD
4233{
4234#ifdef DEBUG
4235 assert (mctx->sub_tops != NULL);
4236 assert (mctx->asub_tops > 0);
4237#endif
951d6408 4238 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
6291ee3c 4239 {
951d6408
UD
4240 int new_asub_tops = mctx->asub_tops * 2;
4241 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4242 re_sub_match_top_t *,
4243 new_asub_tops);
6291ee3c
UD
4244 if (BE (new_array == NULL, 0))
4245 return REG_ESPACE;
4246 mctx->sub_tops = new_array;
951d6408 4247 mctx->asub_tops = new_asub_tops;
6291ee3c
UD
4248 }
4249 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
951d6408 4250 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
6291ee3c
UD
4251 return REG_ESPACE;
4252 mctx->sub_tops[mctx->nsub_tops]->node = node;
4253 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4254 return REG_NOERROR;
4255}
4256
4257/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4258 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4259
4260static re_sub_match_last_t *
e2f55264 4261match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
6291ee3c
UD
4262{
4263 re_sub_match_last_t *new_entry;
951d6408 4264 if (BE (subtop->nlasts == subtop->alasts, 0))
6291ee3c 4265 {
951d6408
UD
4266 int new_alasts = 2 * subtop->alasts + 1;
4267 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4268 re_sub_match_last_t *,
4269 new_alasts);
6291ee3c
UD
4270 if (BE (new_array == NULL, 0))
4271 return NULL;
4272 subtop->lasts = new_array;
951d6408 4273 subtop->alasts = new_alasts;
6291ee3c
UD
4274 }
4275 new_entry = calloc (1, sizeof (re_sub_match_last_t));
951d6408
UD
4276 if (BE (new_entry != NULL, 1))
4277 {
4278 subtop->lasts[subtop->nlasts] = new_entry;
4279 new_entry->node = node;
4280 new_entry->str_idx = str_idx;
4281 ++subtop->nlasts;
4282 }
6291ee3c
UD
4283 return new_entry;
4284}
4285
0742e48e 4286static void
e2f55264
UD
4287sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4288 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
0742e48e
UD
4289{
4290 sctx->sifted_states = sifted_sts;
4291 sctx->limited_states = limited_sts;
4292 sctx->last_node = last_node;
4293 sctx->last_str_idx = last_str_idx;
0742e48e
UD
4294 re_node_set_init_empty (&sctx->limits);
4295}