]> git.ipfire.org Git - thirdparty/git.git/blob - compat/regex/regexec.c
compat/regex: define out variables only used under RE_ENABLE_I18N
[thirdparty/git.git] / compat / regex / regexec.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 internal_function;
57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59 static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
61 internal_function;
62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
67 regmatch_t *regs,
68 re_node_set *eps_via_nodes)
69 internal_function;
70 static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75 internal_function;
76
77 #ifdef RE_ENABLE_I18N
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
81 internal_function;
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
85 internal_function;
86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
89 internal_function;
90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int str_idx,
93 re_node_set *dest_nodes)
94 internal_function;
95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
98 internal_function;
99 static int check_dst_limits (const re_match_context_t *mctx,
100 re_node_set *limits,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
106 internal_function;
107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
114 re_node_set *limits,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
120 internal_function;
121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **dst,
123 re_dfastate_t **src, int num)
124 internal_function;
125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127 static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
133 internal_function;
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137 #if 0
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
141 internal_function;
142 #endif
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
146 internal_function;
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
150 internal_function;
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
153 internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
158 internal_function;
159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 int str_idx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
169 internal_function;
170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
173 internal_function;
174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
181 internal_function;
182 static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
187 internal_function;
188 # ifdef _LIBC
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
190 size_t name_len)
191 internal_function;
192 # endif /* _LIBC */
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 const re_dfastate_t *state,
196 re_node_set *states_node,
197 bitset_t *states_ch) internal_function;
198 static int check_node_accept (const re_match_context_t *mctx,
199 const re_token_t *node, int idx)
200 internal_function;
201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202 internal_function;
203 \f
204 /* Entry point for POSIX code. */
205
206 /* regexec searches for a given pattern, specified by PREG, in the
207 string STRING.
208
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
213
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
217
218 We return 0 if we find a match and REG_NOMATCH if not. */
219
220 int
221 regexec (preg, string, nmatch, pmatch, eflags)
222 const regex_t *__restrict preg;
223 const char *__restrict string;
224 size_t nmatch;
225 regmatch_t pmatch[];
226 int eflags;
227 {
228 reg_errcode_t err;
229 int start, length;
230
231 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
232 return REG_BADPAT;
233
234 if (eflags & REG_STARTEND)
235 {
236 start = pmatch[0].rm_so;
237 length = pmatch[0].rm_eo;
238 }
239 else
240 {
241 start = 0;
242 length = strlen (string);
243 }
244
245 __libc_lock_lock (dfa->lock);
246 if (preg->no_sub)
247 err = re_search_internal (preg, string, length, start, length - start,
248 length, 0, NULL, eflags);
249 else
250 err = re_search_internal (preg, string, length, start, length - start,
251 length, nmatch, pmatch, eflags);
252 __libc_lock_unlock (dfa->lock);
253 return err != REG_NOERROR;
254 }
255
256 #ifdef _LIBC
257 # include <shlib-compat.h>
258 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
259
260 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
261 __typeof__ (__regexec) __compat_regexec;
262
263 int
264 attribute_compat_text_section
265 __compat_regexec (const regex_t *__restrict preg,
266 const char *__restrict string, size_t nmatch,
267 regmatch_t pmatch[], int eflags)
268 {
269 return regexec (preg, string, nmatch, pmatch,
270 eflags & (REG_NOTBOL | REG_NOTEOL));
271 }
272 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
273 # endif
274 #endif
275
276 /* Entry points for GNU code. */
277
278 /* re_match, re_search, re_match_2, re_search_2
279
280 The former two functions operate on STRING with length LENGTH,
281 while the later two operate on concatenation of STRING1 and STRING2
282 with lengths LENGTH1 and LENGTH2, respectively.
283
284 re_match() matches the compiled pattern in BUFP against the string,
285 starting at index START.
286
287 re_search() first tries matching at index START, then it tries to match
288 starting from index START + 1, and so on. The last start position tried
289 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
290 way as re_match().)
291
292 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
293 the first STOP characters of the concatenation of the strings should be
294 concerned.
295
296 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
297 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
298 computed relative to the concatenation, not relative to the individual
299 strings.)
300
301 On success, re_match* functions return the length of the match, re_search*
302 return the position of the start of the match. Return value -1 means no
303 match was found and -2 indicates an internal error. */
304
305 int
306 re_match (bufp, string, length, start, regs)
307 struct re_pattern_buffer *bufp;
308 const char *string;
309 int length, start;
310 struct re_registers *regs;
311 {
312 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
313 }
314 #ifdef _LIBC
315 weak_alias (__re_match, re_match)
316 #endif
317
318 int
319 re_search (bufp, string, length, start, range, regs)
320 struct re_pattern_buffer *bufp;
321 const char *string;
322 int length, start, range;
323 struct re_registers *regs;
324 {
325 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
326 }
327 #ifdef _LIBC
328 weak_alias (__re_search, re_search)
329 #endif
330
331 int
332 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
333 struct re_pattern_buffer *bufp;
334 const char *string1, *string2;
335 int length1, length2, start, stop;
336 struct re_registers *regs;
337 {
338 return re_search_2_stub (bufp, string1, length1, string2, length2,
339 start, 0, regs, stop, 1);
340 }
341 #ifdef _LIBC
342 weak_alias (__re_match_2, re_match_2)
343 #endif
344
345 int
346 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
347 struct re_pattern_buffer *bufp;
348 const char *string1, *string2;
349 int length1, length2, start, range, stop;
350 struct re_registers *regs;
351 {
352 return re_search_2_stub (bufp, string1, length1, string2, length2,
353 start, range, regs, stop, 0);
354 }
355 #ifdef _LIBC
356 weak_alias (__re_search_2, re_search_2)
357 #endif
358
359 static int
360 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
361 stop, ret_len)
362 struct re_pattern_buffer *bufp;
363 const char *string1, *string2;
364 int length1, length2, start, range, stop, ret_len;
365 struct re_registers *regs;
366 {
367 const char *str;
368 int rval;
369 int len = length1 + length2;
370 int free_str = 0;
371
372 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
373 return -2;
374
375 /* Concatenate the strings. */
376 if (length2 > 0)
377 if (length1 > 0)
378 {
379 char *s = re_malloc (char, len);
380
381 if (BE (s == NULL, 0))
382 return -2;
383 memcpy (s, string1, length1);
384 memcpy (s + length1, string2, length2);
385 str = s;
386 free_str = 1;
387 }
388 else
389 str = string2;
390 else
391 str = string1;
392
393 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
394 if (free_str)
395 re_free ((char *) str);
396 return rval;
397 }
398
399 /* The parameters have the same meaning as those of re_search.
400 Additional parameters:
401 If RET_LEN is nonzero the length of the match is returned (re_match style);
402 otherwise the position of the match is returned. */
403
404 static int
405 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
406 struct re_pattern_buffer *bufp;
407 const char *string;
408 int length, start, range, stop, ret_len;
409 struct re_registers *regs;
410 {
411 reg_errcode_t result;
412 regmatch_t *pmatch;
413 int nregs, rval;
414 int eflags = 0;
415
416 /* Check for out-of-range. */
417 if (BE (start < 0 || start > length, 0))
418 return -1;
419 if (BE (start + range > length, 0))
420 range = length - start;
421 else if (BE (start + range < 0, 0))
422 range = -start;
423
424 __libc_lock_lock (dfa->lock);
425
426 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
427 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
428
429 /* Compile fastmap if we haven't yet. */
430 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
431 re_compile_fastmap (bufp);
432
433 if (BE (bufp->no_sub, 0))
434 regs = NULL;
435
436 /* We need at least 1 register. */
437 if (regs == NULL)
438 nregs = 1;
439 else if (BE (bufp->regs_allocated == REGS_FIXED &&
440 regs->num_regs < bufp->re_nsub + 1, 0))
441 {
442 nregs = regs->num_regs;
443 if (BE (nregs < 1, 0))
444 {
445 /* Nothing can be copied to regs. */
446 regs = NULL;
447 nregs = 1;
448 }
449 }
450 else
451 nregs = bufp->re_nsub + 1;
452 pmatch = re_malloc (regmatch_t, nregs);
453 if (BE (pmatch == NULL, 0))
454 {
455 rval = -2;
456 goto out;
457 }
458
459 result = re_search_internal (bufp, string, length, start, range, stop,
460 nregs, pmatch, eflags);
461
462 rval = 0;
463
464 /* I hope we needn't fill ther regs with -1's when no match was found. */
465 if (result != REG_NOERROR)
466 rval = -1;
467 else if (regs != NULL)
468 {
469 /* If caller wants register contents data back, copy them. */
470 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
471 bufp->regs_allocated);
472 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
473 rval = -2;
474 }
475
476 if (BE (rval == 0, 1))
477 {
478 if (ret_len)
479 {
480 assert (pmatch[0].rm_so == start);
481 rval = pmatch[0].rm_eo - start;
482 }
483 else
484 rval = pmatch[0].rm_so;
485 }
486 re_free (pmatch);
487 out:
488 __libc_lock_unlock (dfa->lock);
489 return rval;
490 }
491
492 static unsigned
493 re_copy_regs (regs, pmatch, nregs, regs_allocated)
494 struct re_registers *regs;
495 regmatch_t *pmatch;
496 int nregs, regs_allocated;
497 {
498 int rval = REGS_REALLOCATE;
499 int i;
500 int need_regs = nregs + 1;
501 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
502 uses. */
503
504 /* Have the register data arrays been allocated? */
505 if (regs_allocated == REGS_UNALLOCATED)
506 { /* No. So allocate them with malloc. */
507 regs->start = re_malloc (regoff_t, need_regs);
508 if (BE (regs->start == NULL, 0))
509 return REGS_UNALLOCATED;
510 regs->end = re_malloc (regoff_t, need_regs);
511 if (BE (regs->end == NULL, 0))
512 {
513 re_free (regs->start);
514 return REGS_UNALLOCATED;
515 }
516 regs->num_regs = need_regs;
517 }
518 else if (regs_allocated == REGS_REALLOCATE)
519 { /* Yes. If we need more elements than were already
520 allocated, reallocate them. If we need fewer, just
521 leave it alone. */
522 if (BE (need_regs > regs->num_regs, 0))
523 {
524 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
525 regoff_t *new_end;
526 if (BE (new_start == NULL, 0))
527 return REGS_UNALLOCATED;
528 new_end = re_realloc (regs->end, regoff_t, need_regs);
529 if (BE (new_end == NULL, 0))
530 {
531 re_free (new_start);
532 return REGS_UNALLOCATED;
533 }
534 regs->start = new_start;
535 regs->end = new_end;
536 regs->num_regs = need_regs;
537 }
538 }
539 else
540 {
541 assert (regs_allocated == REGS_FIXED);
542 /* This function may not be called with REGS_FIXED and nregs too big. */
543 assert (regs->num_regs >= nregs);
544 rval = REGS_FIXED;
545 }
546
547 /* Copy the regs. */
548 for (i = 0; i < nregs; ++i)
549 {
550 regs->start[i] = pmatch[i].rm_so;
551 regs->end[i] = pmatch[i].rm_eo;
552 }
553 for ( ; i < regs->num_regs; ++i)
554 regs->start[i] = regs->end[i] = -1;
555
556 return rval;
557 }
558
559 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
560 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
561 this memory for recording register information. STARTS and ENDS
562 must be allocated using the malloc library routine, and must each
563 be at least NUM_REGS * sizeof (regoff_t) bytes long.
564
565 If NUM_REGS == 0, then subsequent matches should allocate their own
566 register data.
567
568 Unless this function is called, the first search or match using
569 PATTERN_BUFFER will allocate its own register data, without
570 freeing the old data. */
571
572 void
573 re_set_registers (bufp, regs, num_regs, starts, ends)
574 struct re_pattern_buffer *bufp;
575 struct re_registers *regs;
576 unsigned num_regs;
577 regoff_t *starts, *ends;
578 {
579 if (num_regs)
580 {
581 bufp->regs_allocated = REGS_REALLOCATE;
582 regs->num_regs = num_regs;
583 regs->start = starts;
584 regs->end = ends;
585 }
586 else
587 {
588 bufp->regs_allocated = REGS_UNALLOCATED;
589 regs->num_regs = 0;
590 regs->start = regs->end = (regoff_t *) 0;
591 }
592 }
593 #ifdef _LIBC
594 weak_alias (__re_set_registers, re_set_registers)
595 #endif
596 \f
597 /* Entry points compatible with 4.2 BSD regex library. We don't define
598 them unless specifically requested. */
599
600 #if defined _REGEX_RE_COMP || defined _LIBC
601 int
602 # ifdef _LIBC
603 weak_function
604 # endif
605 re_exec (s)
606 const char *s;
607 {
608 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
609 }
610 #endif /* _REGEX_RE_COMP */
611 \f
612 /* Internal entry point. */
613
614 /* Searches for a compiled pattern PREG in the string STRING, whose
615 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
616 mingings with regexec. START, and RANGE have the same meanings
617 with re_search.
618 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
619 otherwise return the error code.
620 Note: We assume front end functions already check ranges.
621 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
622
623 static reg_errcode_t
624 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
625 eflags)
626 const regex_t *preg;
627 const char *string;
628 int length, start, range, stop, eflags;
629 size_t nmatch;
630 regmatch_t pmatch[];
631 {
632 reg_errcode_t err;
633 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
634 int left_lim, right_lim, incr;
635 int fl_longest_match, match_first, match_kind, match_last = -1;
636 int extra_nmatch;
637 int sb, ch;
638 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
639 re_match_context_t mctx = { .dfa = dfa };
640 #else
641 re_match_context_t mctx;
642 #endif
643 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
644 && range && !preg->can_be_null) ? preg->fastmap : NULL;
645 RE_TRANSLATE_TYPE t = preg->translate;
646
647 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
648 memset (&mctx, '\0', sizeof (re_match_context_t));
649 mctx.dfa = dfa;
650 #endif
651
652 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
653 nmatch -= extra_nmatch;
654
655 /* Check if the DFA haven't been compiled. */
656 if (BE (preg->used == 0 || dfa->init_state == NULL
657 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
658 || dfa->init_state_begbuf == NULL, 0))
659 return REG_NOMATCH;
660
661 #ifdef DEBUG
662 /* We assume front-end functions already check them. */
663 assert (start + range >= 0 && start + range <= length);
664 #endif
665
666 /* If initial states with non-begbuf contexts have no elements,
667 the regex must be anchored. If preg->newline_anchor is set,
668 we'll never use init_state_nl, so do not check it. */
669 if (dfa->init_state->nodes.nelem == 0
670 && dfa->init_state_word->nodes.nelem == 0
671 && (dfa->init_state_nl->nodes.nelem == 0
672 || !preg->newline_anchor))
673 {
674 if (start != 0 && start + range != 0)
675 return REG_NOMATCH;
676 start = range = 0;
677 }
678
679 /* We must check the longest matching, if nmatch > 0. */
680 fl_longest_match = (nmatch != 0 || dfa->nbackref);
681
682 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
683 preg->translate, preg->syntax & RE_ICASE, dfa);
684 if (BE (err != REG_NOERROR, 0))
685 goto free_return;
686 mctx.input.stop = stop;
687 mctx.input.raw_stop = stop;
688 mctx.input.newline_anchor = preg->newline_anchor;
689
690 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
691 if (BE (err != REG_NOERROR, 0))
692 goto free_return;
693
694 /* We will log all the DFA states through which the dfa pass,
695 if nmatch > 1, or this dfa has "multibyte node", which is a
696 back-reference or a node which can accept multibyte character or
697 multi character collating element. */
698 if (nmatch > 1 || dfa->has_mb_node)
699 {
700 /* Avoid overflow. */
701 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
702 {
703 err = REG_ESPACE;
704 goto free_return;
705 }
706
707 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
708 if (BE (mctx.state_log == NULL, 0))
709 {
710 err = REG_ESPACE;
711 goto free_return;
712 }
713 }
714 else
715 mctx.state_log = NULL;
716
717 match_first = start;
718 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
719 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
720
721 /* Check incrementally whether of not the input string match. */
722 incr = (range < 0) ? -1 : 1;
723 left_lim = (range < 0) ? start + range : start;
724 right_lim = (range < 0) ? start : start + range;
725 sb = dfa->mb_cur_max == 1;
726 match_kind =
727 (fastmap
728 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
729 | (range >= 0 ? 2 : 0)
730 | (t != NULL ? 1 : 0))
731 : 8);
732
733 for (;; match_first += incr)
734 {
735 err = REG_NOMATCH;
736 if (match_first < left_lim || right_lim < match_first)
737 goto free_return;
738
739 /* Advance as rapidly as possible through the string, until we
740 find a plausible place to start matching. This may be done
741 with varying efficiency, so there are various possibilities:
742 only the most common of them are specialized, in order to
743 save on code size. We use a switch statement for speed. */
744 switch (match_kind)
745 {
746 case 8:
747 /* No fastmap. */
748 break;
749
750 case 7:
751 /* Fastmap with single-byte translation, match forward. */
752 while (BE (match_first < right_lim, 1)
753 && !fastmap[t[(unsigned char) string[match_first]]])
754 ++match_first;
755 goto forward_match_found_start_or_reached_end;
756
757 case 6:
758 /* Fastmap without translation, match forward. */
759 while (BE (match_first < right_lim, 1)
760 && !fastmap[(unsigned char) string[match_first]])
761 ++match_first;
762
763 forward_match_found_start_or_reached_end:
764 if (BE (match_first == right_lim, 0))
765 {
766 ch = match_first >= length
767 ? 0 : (unsigned char) string[match_first];
768 if (!fastmap[t ? t[ch] : ch])
769 goto free_return;
770 }
771 break;
772
773 case 4:
774 case 5:
775 /* Fastmap without multi-byte translation, match backwards. */
776 while (match_first >= left_lim)
777 {
778 ch = match_first >= length
779 ? 0 : (unsigned char) string[match_first];
780 if (fastmap[t ? t[ch] : ch])
781 break;
782 --match_first;
783 }
784 if (match_first < left_lim)
785 goto free_return;
786 break;
787
788 default:
789 /* In this case, we can't determine easily the current byte,
790 since it might be a component byte of a multibyte
791 character. Then we use the constructed buffer instead. */
792 for (;;)
793 {
794 /* If MATCH_FIRST is out of the valid range, reconstruct the
795 buffers. */
796 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
797 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
798 {
799 err = re_string_reconstruct (&mctx.input, match_first,
800 eflags);
801 if (BE (err != REG_NOERROR, 0))
802 goto free_return;
803
804 offset = match_first - mctx.input.raw_mbs_idx;
805 }
806 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
807 Note that MATCH_FIRST must not be smaller than 0. */
808 ch = (match_first >= length
809 ? 0 : re_string_byte_at (&mctx.input, offset));
810 if (fastmap[ch])
811 break;
812 match_first += incr;
813 if (match_first < left_lim || match_first > right_lim)
814 {
815 err = REG_NOMATCH;
816 goto free_return;
817 }
818 }
819 break;
820 }
821
822 /* Reconstruct the buffers so that the matcher can assume that
823 the matching starts from the beginning of the buffer. */
824 err = re_string_reconstruct (&mctx.input, match_first, eflags);
825 if (BE (err != REG_NOERROR, 0))
826 goto free_return;
827
828 #ifdef RE_ENABLE_I18N
829 /* Don't consider this char as a possible match start if it part,
830 yet isn't the head, of a multibyte character. */
831 if (!sb && !re_string_first_byte (&mctx.input, 0))
832 continue;
833 #endif
834
835 /* It seems to be appropriate one, then use the matcher. */
836 /* We assume that the matching starts from 0. */
837 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
838 match_last = check_matching (&mctx, fl_longest_match,
839 range >= 0 ? &match_first : NULL);
840 if (match_last != -1)
841 {
842 if (BE (match_last == -2, 0))
843 {
844 err = REG_ESPACE;
845 goto free_return;
846 }
847 else
848 {
849 mctx.match_last = match_last;
850 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
851 {
852 re_dfastate_t *pstate = mctx.state_log[match_last];
853 mctx.last_node = check_halt_state_context (&mctx, pstate,
854 match_last);
855 }
856 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
857 || dfa->nbackref)
858 {
859 err = prune_impossible_nodes (&mctx);
860 if (err == REG_NOERROR)
861 break;
862 if (BE (err != REG_NOMATCH, 0))
863 goto free_return;
864 match_last = -1;
865 }
866 else
867 break; /* We found a match. */
868 }
869 }
870
871 match_ctx_clean (&mctx);
872 }
873
874 #ifdef DEBUG
875 assert (match_last != -1);
876 assert (err == REG_NOERROR);
877 #endif
878
879 /* Set pmatch[] if we need. */
880 if (nmatch > 0)
881 {
882 int reg_idx;
883
884 /* Initialize registers. */
885 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
886 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
887
888 /* Set the points where matching start/end. */
889 pmatch[0].rm_so = 0;
890 pmatch[0].rm_eo = mctx.match_last;
891
892 if (!preg->no_sub && nmatch > 1)
893 {
894 err = set_regs (preg, &mctx, nmatch, pmatch,
895 dfa->has_plural_match && dfa->nbackref > 0);
896 if (BE (err != REG_NOERROR, 0))
897 goto free_return;
898 }
899
900 /* At last, add the offset to the each registers, since we slided
901 the buffers so that we could assume that the matching starts
902 from 0. */
903 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
904 if (pmatch[reg_idx].rm_so != -1)
905 {
906 #ifdef RE_ENABLE_I18N
907 if (BE (mctx.input.offsets_needed != 0, 0))
908 {
909 pmatch[reg_idx].rm_so =
910 (pmatch[reg_idx].rm_so == mctx.input.valid_len
911 ? mctx.input.valid_raw_len
912 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
913 pmatch[reg_idx].rm_eo =
914 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
915 ? mctx.input.valid_raw_len
916 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
917 }
918 #else
919 assert (mctx.input.offsets_needed == 0);
920 #endif
921 pmatch[reg_idx].rm_so += match_first;
922 pmatch[reg_idx].rm_eo += match_first;
923 }
924 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
925 {
926 pmatch[nmatch + reg_idx].rm_so = -1;
927 pmatch[nmatch + reg_idx].rm_eo = -1;
928 }
929
930 if (dfa->subexp_map)
931 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
932 if (dfa->subexp_map[reg_idx] != reg_idx)
933 {
934 pmatch[reg_idx + 1].rm_so
935 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
936 pmatch[reg_idx + 1].rm_eo
937 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
938 }
939 }
940
941 free_return:
942 re_free (mctx.state_log);
943 if (dfa->nbackref)
944 match_ctx_free (&mctx);
945 re_string_destruct (&mctx.input);
946 return err;
947 }
948
949 static reg_errcode_t
950 prune_impossible_nodes (mctx)
951 re_match_context_t *mctx;
952 {
953 const re_dfa_t *const dfa = mctx->dfa;
954 int halt_node, match_last;
955 reg_errcode_t ret;
956 re_dfastate_t **sifted_states;
957 re_dfastate_t **lim_states = NULL;
958 re_sift_context_t sctx;
959 #ifdef DEBUG
960 assert (mctx->state_log != NULL);
961 #endif
962 match_last = mctx->match_last;
963 halt_node = mctx->last_node;
964
965 /* Avoid overflow. */
966 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
967 return REG_ESPACE;
968
969 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
970 if (BE (sifted_states == NULL, 0))
971 {
972 ret = REG_ESPACE;
973 goto free_return;
974 }
975 if (dfa->nbackref)
976 {
977 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
978 if (BE (lim_states == NULL, 0))
979 {
980 ret = REG_ESPACE;
981 goto free_return;
982 }
983 while (1)
984 {
985 memset (lim_states, '\0',
986 sizeof (re_dfastate_t *) * (match_last + 1));
987 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
988 match_last);
989 ret = sift_states_backward (mctx, &sctx);
990 re_node_set_free (&sctx.limits);
991 if (BE (ret != REG_NOERROR, 0))
992 goto free_return;
993 if (sifted_states[0] != NULL || lim_states[0] != NULL)
994 break;
995 do
996 {
997 --match_last;
998 if (match_last < 0)
999 {
1000 ret = REG_NOMATCH;
1001 goto free_return;
1002 }
1003 } while (mctx->state_log[match_last] == NULL
1004 || !mctx->state_log[match_last]->halt);
1005 halt_node = check_halt_state_context (mctx,
1006 mctx->state_log[match_last],
1007 match_last);
1008 }
1009 ret = merge_state_array (dfa, sifted_states, lim_states,
1010 match_last + 1);
1011 re_free (lim_states);
1012 lim_states = NULL;
1013 if (BE (ret != REG_NOERROR, 0))
1014 goto free_return;
1015 }
1016 else
1017 {
1018 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1019 ret = sift_states_backward (mctx, &sctx);
1020 re_node_set_free (&sctx.limits);
1021 if (BE (ret != REG_NOERROR, 0))
1022 goto free_return;
1023 if (sifted_states[0] == NULL)
1024 {
1025 ret = REG_NOMATCH;
1026 goto free_return;
1027 }
1028 }
1029 re_free (mctx->state_log);
1030 mctx->state_log = sifted_states;
1031 sifted_states = NULL;
1032 mctx->last_node = halt_node;
1033 mctx->match_last = match_last;
1034 ret = REG_NOERROR;
1035 free_return:
1036 re_free (sifted_states);
1037 re_free (lim_states);
1038 return ret;
1039 }
1040
1041 /* Acquire an initial state and return it.
1042 We must select appropriate initial state depending on the context,
1043 since initial states may have constraints like "\<", "^", etc.. */
1044
1045 static inline re_dfastate_t *
1046 __attribute ((always_inline)) internal_function
1047 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1048 int idx)
1049 {
1050 const re_dfa_t *const dfa = mctx->dfa;
1051 if (dfa->init_state->has_constraint)
1052 {
1053 unsigned int context;
1054 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1055 if (IS_WORD_CONTEXT (context))
1056 return dfa->init_state_word;
1057 else if (IS_ORDINARY_CONTEXT (context))
1058 return dfa->init_state;
1059 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1060 return dfa->init_state_begbuf;
1061 else if (IS_NEWLINE_CONTEXT (context))
1062 return dfa->init_state_nl;
1063 else if (IS_BEGBUF_CONTEXT (context))
1064 {
1065 /* It is relatively rare case, then calculate on demand. */
1066 return re_acquire_state_context (err, dfa,
1067 dfa->init_state->entrance_nodes,
1068 context);
1069 }
1070 else
1071 /* Must not happen? */
1072 return dfa->init_state;
1073 }
1074 else
1075 return dfa->init_state;
1076 }
1077
1078 /* Check whether the regular expression match input string INPUT or not,
1079 and return the index where the matching end, return -1 if not match,
1080 or return -2 in case of an error.
1081 FL_LONGEST_MATCH means we want the POSIX longest matching.
1082 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1083 next place where we may want to try matching.
1084 Note that the matcher assume that the maching starts from the current
1085 index of the buffer. */
1086
1087 static int
1088 internal_function
1089 check_matching (re_match_context_t *mctx, int fl_longest_match,
1090 int *p_match_first)
1091 {
1092 const re_dfa_t *const dfa = mctx->dfa;
1093 reg_errcode_t err;
1094 int match = 0;
1095 int match_last = -1;
1096 int cur_str_idx = re_string_cur_idx (&mctx->input);
1097 re_dfastate_t *cur_state;
1098 int at_init_state = p_match_first != NULL;
1099 int next_start_idx = cur_str_idx;
1100
1101 err = REG_NOERROR;
1102 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1103 /* An initial state must not be NULL (invalid). */
1104 if (BE (cur_state == NULL, 0))
1105 {
1106 assert (err == REG_ESPACE);
1107 return -2;
1108 }
1109
1110 if (mctx->state_log != NULL)
1111 {
1112 mctx->state_log[cur_str_idx] = cur_state;
1113
1114 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1115 later. E.g. Processing back references. */
1116 if (BE (dfa->nbackref, 0))
1117 {
1118 at_init_state = 0;
1119 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1120 if (BE (err != REG_NOERROR, 0))
1121 return err;
1122
1123 if (cur_state->has_backref)
1124 {
1125 err = transit_state_bkref (mctx, &cur_state->nodes);
1126 if (BE (err != REG_NOERROR, 0))
1127 return err;
1128 }
1129 }
1130 }
1131
1132 /* If the RE accepts NULL string. */
1133 if (BE (cur_state->halt, 0))
1134 {
1135 if (!cur_state->has_constraint
1136 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1137 {
1138 if (!fl_longest_match)
1139 return cur_str_idx;
1140 else
1141 {
1142 match_last = cur_str_idx;
1143 match = 1;
1144 }
1145 }
1146 }
1147
1148 while (!re_string_eoi (&mctx->input))
1149 {
1150 re_dfastate_t *old_state = cur_state;
1151 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1152
1153 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1154 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1155 && mctx->input.valid_len < mctx->input.len))
1156 {
1157 err = extend_buffers (mctx);
1158 if (BE (err != REG_NOERROR, 0))
1159 {
1160 assert (err == REG_ESPACE);
1161 return -2;
1162 }
1163 }
1164
1165 cur_state = transit_state (&err, mctx, cur_state);
1166 if (mctx->state_log != NULL)
1167 cur_state = merge_state_with_log (&err, mctx, cur_state);
1168
1169 if (cur_state == NULL)
1170 {
1171 /* Reached the invalid state or an error. Try to recover a valid
1172 state using the state log, if available and if we have not
1173 already found a valid (even if not the longest) match. */
1174 if (BE (err != REG_NOERROR, 0))
1175 return -2;
1176
1177 if (mctx->state_log == NULL
1178 || (match && !fl_longest_match)
1179 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1180 break;
1181 }
1182
1183 if (BE (at_init_state, 0))
1184 {
1185 if (old_state == cur_state)
1186 next_start_idx = next_char_idx;
1187 else
1188 at_init_state = 0;
1189 }
1190
1191 if (cur_state->halt)
1192 {
1193 /* Reached a halt state.
1194 Check the halt state can satisfy the current context. */
1195 if (!cur_state->has_constraint
1196 || check_halt_state_context (mctx, cur_state,
1197 re_string_cur_idx (&mctx->input)))
1198 {
1199 /* We found an appropriate halt state. */
1200 match_last = re_string_cur_idx (&mctx->input);
1201 match = 1;
1202
1203 /* We found a match, do not modify match_first below. */
1204 p_match_first = NULL;
1205 if (!fl_longest_match)
1206 break;
1207 }
1208 }
1209 }
1210
1211 if (p_match_first)
1212 *p_match_first += next_start_idx;
1213
1214 return match_last;
1215 }
1216
1217 /* Check NODE match the current context. */
1218
1219 static int
1220 internal_function
1221 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1222 {
1223 re_token_type_t type = dfa->nodes[node].type;
1224 unsigned int constraint = dfa->nodes[node].constraint;
1225 if (type != END_OF_RE)
1226 return 0;
1227 if (!constraint)
1228 return 1;
1229 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1230 return 0;
1231 return 1;
1232 }
1233
1234 /* Check the halt state STATE match the current context.
1235 Return 0 if not match, if the node, STATE has, is a halt node and
1236 match the context, return the node. */
1237
1238 static int
1239 internal_function
1240 check_halt_state_context (const re_match_context_t *mctx,
1241 const re_dfastate_t *state, int idx)
1242 {
1243 int i;
1244 unsigned int context;
1245 #ifdef DEBUG
1246 assert (state->halt);
1247 #endif
1248 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1249 for (i = 0; i < state->nodes.nelem; ++i)
1250 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1251 return state->nodes.elems[i];
1252 return 0;
1253 }
1254
1255 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1256 corresponding to the DFA).
1257 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1258 of errors. */
1259
1260 static int
1261 internal_function
1262 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1263 int *pidx, int node, re_node_set *eps_via_nodes,
1264 struct re_fail_stack_t *fs)
1265 {
1266 const re_dfa_t *const dfa = mctx->dfa;
1267 int i, err;
1268 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1269 {
1270 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1271 re_node_set *edests = &dfa->edests[node];
1272 int dest_node;
1273 err = re_node_set_insert (eps_via_nodes, node);
1274 if (BE (err < 0, 0))
1275 return -2;
1276 /* Pick up a valid destination, or return -1 if none is found. */
1277 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1278 {
1279 int candidate = edests->elems[i];
1280 if (!re_node_set_contains (cur_nodes, candidate))
1281 continue;
1282 if (dest_node == -1)
1283 dest_node = candidate;
1284
1285 else
1286 {
1287 /* In order to avoid infinite loop like "(a*)*", return the second
1288 epsilon-transition if the first was already considered. */
1289 if (re_node_set_contains (eps_via_nodes, dest_node))
1290 return candidate;
1291
1292 /* Otherwise, push the second epsilon-transition on the fail stack. */
1293 else if (fs != NULL
1294 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1295 eps_via_nodes))
1296 return -2;
1297
1298 /* We know we are going to exit. */
1299 break;
1300 }
1301 }
1302 return dest_node;
1303 }
1304 else
1305 {
1306 int naccepted = 0;
1307 re_token_type_t type = dfa->nodes[node].type;
1308
1309 #ifdef RE_ENABLE_I18N
1310 if (dfa->nodes[node].accept_mb)
1311 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1312 else
1313 #endif /* RE_ENABLE_I18N */
1314 if (type == OP_BACK_REF)
1315 {
1316 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1317 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1318 if (fs != NULL)
1319 {
1320 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1321 return -1;
1322 else if (naccepted)
1323 {
1324 char *buf = (char *) re_string_get_buffer (&mctx->input);
1325 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1326 naccepted) != 0)
1327 return -1;
1328 }
1329 }
1330
1331 if (naccepted == 0)
1332 {
1333 int dest_node;
1334 err = re_node_set_insert (eps_via_nodes, node);
1335 if (BE (err < 0, 0))
1336 return -2;
1337 dest_node = dfa->edests[node].elems[0];
1338 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1339 dest_node))
1340 return dest_node;
1341 }
1342 }
1343
1344 if (naccepted != 0
1345 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1346 {
1347 int dest_node = dfa->nexts[node];
1348 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1349 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1350 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1351 dest_node)))
1352 return -1;
1353 re_node_set_empty (eps_via_nodes);
1354 return dest_node;
1355 }
1356 }
1357 return -1;
1358 }
1359
1360 static reg_errcode_t
1361 internal_function
1362 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1363 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1364 {
1365 reg_errcode_t err;
1366 int num = fs->num++;
1367 if (fs->num == fs->alloc)
1368 {
1369 struct re_fail_stack_ent_t *new_array;
1370 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1371 * fs->alloc * 2));
1372 if (new_array == NULL)
1373 return REG_ESPACE;
1374 fs->alloc *= 2;
1375 fs->stack = new_array;
1376 }
1377 fs->stack[num].idx = str_idx;
1378 fs->stack[num].node = dest_node;
1379 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1380 if (fs->stack[num].regs == NULL)
1381 return REG_ESPACE;
1382 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1383 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1384 return err;
1385 }
1386
1387 static int
1388 internal_function
1389 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1390 regmatch_t *regs, re_node_set *eps_via_nodes)
1391 {
1392 int num = --fs->num;
1393 assert (num >= 0);
1394 *pidx = fs->stack[num].idx;
1395 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1396 re_node_set_free (eps_via_nodes);
1397 re_free (fs->stack[num].regs);
1398 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1399 return fs->stack[num].node;
1400 }
1401
1402 /* Set the positions where the subexpressions are starts/ends to registers
1403 PMATCH.
1404 Note: We assume that pmatch[0] is already set, and
1405 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1406
1407 static reg_errcode_t
1408 internal_function
1409 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1410 regmatch_t *pmatch, int fl_backtrack)
1411 {
1412 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1413 int idx, cur_node;
1414 re_node_set eps_via_nodes;
1415 struct re_fail_stack_t *fs;
1416 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1417 regmatch_t *prev_idx_match;
1418 int prev_idx_match_malloced = 0;
1419
1420 #ifdef DEBUG
1421 assert (nmatch > 1);
1422 assert (mctx->state_log != NULL);
1423 #endif
1424 if (fl_backtrack)
1425 {
1426 fs = &fs_body;
1427 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1428 if (fs->stack == NULL)
1429 return REG_ESPACE;
1430 }
1431 else
1432 fs = NULL;
1433
1434 cur_node = dfa->init_node;
1435 re_node_set_init_empty (&eps_via_nodes);
1436
1437 #ifdef HAVE_ALLOCA
1438 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1439 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1440 else
1441 #endif
1442 {
1443 prev_idx_match = re_malloc (regmatch_t, nmatch);
1444 if (prev_idx_match == NULL)
1445 {
1446 free_fail_stack_return (fs);
1447 return REG_ESPACE;
1448 }
1449 prev_idx_match_malloced = 1;
1450 }
1451 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1452
1453 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1454 {
1455 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1456
1457 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1458 {
1459 int reg_idx;
1460 if (fs)
1461 {
1462 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1463 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1464 break;
1465 if (reg_idx == nmatch)
1466 {
1467 re_node_set_free (&eps_via_nodes);
1468 if (prev_idx_match_malloced)
1469 re_free (prev_idx_match);
1470 return free_fail_stack_return (fs);
1471 }
1472 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1473 &eps_via_nodes);
1474 }
1475 else
1476 {
1477 re_node_set_free (&eps_via_nodes);
1478 if (prev_idx_match_malloced)
1479 re_free (prev_idx_match);
1480 return REG_NOERROR;
1481 }
1482 }
1483
1484 /* Proceed to next node. */
1485 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1486 &eps_via_nodes, fs);
1487
1488 if (BE (cur_node < 0, 0))
1489 {
1490 if (BE (cur_node == -2, 0))
1491 {
1492 re_node_set_free (&eps_via_nodes);
1493 if (prev_idx_match_malloced)
1494 re_free (prev_idx_match);
1495 free_fail_stack_return (fs);
1496 return REG_ESPACE;
1497 }
1498 if (fs)
1499 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1500 &eps_via_nodes);
1501 else
1502 {
1503 re_node_set_free (&eps_via_nodes);
1504 if (prev_idx_match_malloced)
1505 re_free (prev_idx_match);
1506 return REG_NOMATCH;
1507 }
1508 }
1509 }
1510 re_node_set_free (&eps_via_nodes);
1511 if (prev_idx_match_malloced)
1512 re_free (prev_idx_match);
1513 return free_fail_stack_return (fs);
1514 }
1515
1516 static reg_errcode_t
1517 internal_function
1518 free_fail_stack_return (struct re_fail_stack_t *fs)
1519 {
1520 if (fs)
1521 {
1522 int fs_idx;
1523 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1524 {
1525 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1526 re_free (fs->stack[fs_idx].regs);
1527 }
1528 re_free (fs->stack);
1529 }
1530 return REG_NOERROR;
1531 }
1532
1533 static void
1534 internal_function
1535 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1536 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1537 {
1538 int type = dfa->nodes[cur_node].type;
1539 if (type == OP_OPEN_SUBEXP)
1540 {
1541 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542
1543 /* We are at the first node of this sub expression. */
1544 if (reg_num < nmatch)
1545 {
1546 pmatch[reg_num].rm_so = cur_idx;
1547 pmatch[reg_num].rm_eo = -1;
1548 }
1549 }
1550 else if (type == OP_CLOSE_SUBEXP)
1551 {
1552 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1553 if (reg_num < nmatch)
1554 {
1555 /* We are at the last node of this sub expression. */
1556 if (pmatch[reg_num].rm_so < cur_idx)
1557 {
1558 pmatch[reg_num].rm_eo = cur_idx;
1559 /* This is a non-empty match or we are not inside an optional
1560 subexpression. Accept this right away. */
1561 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1562 }
1563 else
1564 {
1565 if (dfa->nodes[cur_node].opt_subexp
1566 && prev_idx_match[reg_num].rm_so != -1)
1567 /* We transited through an empty match for an optional
1568 subexpression, like (a?)*, and this is not the subexp's
1569 first match. Copy back the old content of the registers
1570 so that matches of an inner subexpression are undone as
1571 well, like in ((a?))*. */
1572 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1573 else
1574 /* We completed a subexpression, but it may be part of
1575 an optional one, so do not update PREV_IDX_MATCH. */
1576 pmatch[reg_num].rm_eo = cur_idx;
1577 }
1578 }
1579 }
1580 }
1581
1582 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1583 and sift the nodes in each states according to the following rules.
1584 Updated state_log will be wrote to STATE_LOG.
1585
1586 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1587 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1588 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1589 the LAST_NODE, we throw away the node `a'.
1590 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1591 string `s' and transit to `b':
1592 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1593 away the node `a'.
1594 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1595 thrown away, we throw away the node `a'.
1596 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1597 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1598 node `a'.
1599 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1600 we throw away the node `a'. */
1601
1602 #define STATE_NODE_CONTAINS(state,node) \
1603 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1604
1605 static reg_errcode_t
1606 internal_function
1607 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1608 {
1609 reg_errcode_t err;
1610 int null_cnt = 0;
1611 int str_idx = sctx->last_str_idx;
1612 re_node_set cur_dest;
1613
1614 #ifdef DEBUG
1615 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1616 #endif
1617
1618 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1619 transit to the last_node and the last_node itself. */
1620 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1621 if (BE (err != REG_NOERROR, 0))
1622 return err;
1623 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1624 if (BE (err != REG_NOERROR, 0))
1625 goto free_return;
1626
1627 /* Then check each states in the state_log. */
1628 while (str_idx > 0)
1629 {
1630 /* Update counters. */
1631 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1632 if (null_cnt > mctx->max_mb_elem_len)
1633 {
1634 memset (sctx->sifted_states, '\0',
1635 sizeof (re_dfastate_t *) * str_idx);
1636 re_node_set_free (&cur_dest);
1637 return REG_NOERROR;
1638 }
1639 re_node_set_empty (&cur_dest);
1640 --str_idx;
1641
1642 if (mctx->state_log[str_idx])
1643 {
1644 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1645 if (BE (err != REG_NOERROR, 0))
1646 goto free_return;
1647 }
1648
1649 /* Add all the nodes which satisfy the following conditions:
1650 - It can epsilon transit to a node in CUR_DEST.
1651 - It is in CUR_SRC.
1652 And update state_log. */
1653 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1654 if (BE (err != REG_NOERROR, 0))
1655 goto free_return;
1656 }
1657 err = REG_NOERROR;
1658 free_return:
1659 re_node_set_free (&cur_dest);
1660 return err;
1661 }
1662
1663 static reg_errcode_t
1664 internal_function
1665 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1666 int str_idx, re_node_set *cur_dest)
1667 {
1668 const re_dfa_t *const dfa = mctx->dfa;
1669 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1670 int i;
1671
1672 /* Then build the next sifted state.
1673 We build the next sifted state on `cur_dest', and update
1674 `sifted_states[str_idx]' with `cur_dest'.
1675 Note:
1676 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1677 `cur_src' points the node_set of the old `state_log[str_idx]'
1678 (with the epsilon nodes pre-filtered out). */
1679 for (i = 0; i < cur_src->nelem; i++)
1680 {
1681 int prev_node = cur_src->elems[i];
1682 int naccepted = 0;
1683 int ret;
1684
1685 #ifdef DEBUG
1686 re_token_type_t type = dfa->nodes[prev_node].type;
1687 assert (!IS_EPSILON_NODE (type));
1688 #endif
1689 #ifdef RE_ENABLE_I18N
1690 /* If the node may accept `multi byte'. */
1691 if (dfa->nodes[prev_node].accept_mb)
1692 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1693 str_idx, sctx->last_str_idx);
1694 #endif /* RE_ENABLE_I18N */
1695
1696 /* We don't check backreferences here.
1697 See update_cur_sifted_state(). */
1698 if (!naccepted
1699 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1700 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1701 dfa->nexts[prev_node]))
1702 naccepted = 1;
1703
1704 if (naccepted == 0)
1705 continue;
1706
1707 if (sctx->limits.nelem)
1708 {
1709 int to_idx = str_idx + naccepted;
1710 if (check_dst_limits (mctx, &sctx->limits,
1711 dfa->nexts[prev_node], to_idx,
1712 prev_node, str_idx))
1713 continue;
1714 }
1715 ret = re_node_set_insert (cur_dest, prev_node);
1716 if (BE (ret == -1, 0))
1717 return REG_ESPACE;
1718 }
1719
1720 return REG_NOERROR;
1721 }
1722
1723 /* Helper functions. */
1724
1725 static reg_errcode_t
1726 internal_function
1727 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1728 {
1729 int top = mctx->state_log_top;
1730
1731 if (next_state_log_idx >= mctx->input.bufs_len
1732 || (next_state_log_idx >= mctx->input.valid_len
1733 && mctx->input.valid_len < mctx->input.len))
1734 {
1735 reg_errcode_t err;
1736 err = extend_buffers (mctx);
1737 if (BE (err != REG_NOERROR, 0))
1738 return err;
1739 }
1740
1741 if (top < next_state_log_idx)
1742 {
1743 memset (mctx->state_log + top + 1, '\0',
1744 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1745 mctx->state_log_top = next_state_log_idx;
1746 }
1747 return REG_NOERROR;
1748 }
1749
1750 static reg_errcode_t
1751 internal_function
1752 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1753 re_dfastate_t **src, int num)
1754 {
1755 int st_idx;
1756 reg_errcode_t err;
1757 for (st_idx = 0; st_idx < num; ++st_idx)
1758 {
1759 if (dst[st_idx] == NULL)
1760 dst[st_idx] = src[st_idx];
1761 else if (src[st_idx] != NULL)
1762 {
1763 re_node_set merged_set;
1764 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1765 &src[st_idx]->nodes);
1766 if (BE (err != REG_NOERROR, 0))
1767 return err;
1768 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1769 re_node_set_free (&merged_set);
1770 if (BE (err != REG_NOERROR, 0))
1771 return err;
1772 }
1773 }
1774 return REG_NOERROR;
1775 }
1776
1777 static reg_errcode_t
1778 internal_function
1779 update_cur_sifted_state (const re_match_context_t *mctx,
1780 re_sift_context_t *sctx, int str_idx,
1781 re_node_set *dest_nodes)
1782 {
1783 const re_dfa_t *const dfa = mctx->dfa;
1784 reg_errcode_t err = REG_NOERROR;
1785 const re_node_set *candidates;
1786 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1787 : &mctx->state_log[str_idx]->nodes);
1788
1789 if (dest_nodes->nelem == 0)
1790 sctx->sifted_states[str_idx] = NULL;
1791 else
1792 {
1793 if (candidates)
1794 {
1795 /* At first, add the nodes which can epsilon transit to a node in
1796 DEST_NODE. */
1797 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1798 if (BE (err != REG_NOERROR, 0))
1799 return err;
1800
1801 /* Then, check the limitations in the current sift_context. */
1802 if (sctx->limits.nelem)
1803 {
1804 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1805 mctx->bkref_ents, str_idx);
1806 if (BE (err != REG_NOERROR, 0))
1807 return err;
1808 }
1809 }
1810
1811 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1812 if (BE (err != REG_NOERROR, 0))
1813 return err;
1814 }
1815
1816 if (candidates && mctx->state_log[str_idx]->has_backref)
1817 {
1818 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1819 if (BE (err != REG_NOERROR, 0))
1820 return err;
1821 }
1822 return REG_NOERROR;
1823 }
1824
1825 static reg_errcode_t
1826 internal_function
1827 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1828 const re_node_set *candidates)
1829 {
1830 reg_errcode_t err = REG_NOERROR;
1831 int i;
1832
1833 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1834 if (BE (err != REG_NOERROR, 0))
1835 return err;
1836
1837 if (!state->inveclosure.alloc)
1838 {
1839 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1840 if (BE (err != REG_NOERROR, 0))
1841 return REG_ESPACE;
1842 for (i = 0; i < dest_nodes->nelem; i++)
1843 {
1844 err = re_node_set_merge (&state->inveclosure,
1845 dfa->inveclosures + dest_nodes->elems[i]);
1846 if (BE (err != REG_NOERROR, 0))
1847 return REG_ESPACE;
1848 }
1849 }
1850 return re_node_set_add_intersect (dest_nodes, candidates,
1851 &state->inveclosure);
1852 }
1853
1854 static reg_errcode_t
1855 internal_function
1856 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1857 const re_node_set *candidates)
1858 {
1859 int ecl_idx;
1860 reg_errcode_t err;
1861 re_node_set *inv_eclosure = dfa->inveclosures + node;
1862 re_node_set except_nodes;
1863 re_node_set_init_empty (&except_nodes);
1864 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1865 {
1866 int cur_node = inv_eclosure->elems[ecl_idx];
1867 if (cur_node == node)
1868 continue;
1869 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1870 {
1871 int edst1 = dfa->edests[cur_node].elems[0];
1872 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1873 ? dfa->edests[cur_node].elems[1] : -1);
1874 if ((!re_node_set_contains (inv_eclosure, edst1)
1875 && re_node_set_contains (dest_nodes, edst1))
1876 || (edst2 > 0
1877 && !re_node_set_contains (inv_eclosure, edst2)
1878 && re_node_set_contains (dest_nodes, edst2)))
1879 {
1880 err = re_node_set_add_intersect (&except_nodes, candidates,
1881 dfa->inveclosures + cur_node);
1882 if (BE (err != REG_NOERROR, 0))
1883 {
1884 re_node_set_free (&except_nodes);
1885 return err;
1886 }
1887 }
1888 }
1889 }
1890 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1891 {
1892 int cur_node = inv_eclosure->elems[ecl_idx];
1893 if (!re_node_set_contains (&except_nodes, cur_node))
1894 {
1895 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1896 re_node_set_remove_at (dest_nodes, idx);
1897 }
1898 }
1899 re_node_set_free (&except_nodes);
1900 return REG_NOERROR;
1901 }
1902
1903 static int
1904 internal_function
1905 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1906 int dst_node, int dst_idx, int src_node, int src_idx)
1907 {
1908 const re_dfa_t *const dfa = mctx->dfa;
1909 int lim_idx, src_pos, dst_pos;
1910
1911 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1912 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1913 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1914 {
1915 int subexp_idx;
1916 struct re_backref_cache_entry *ent;
1917 ent = mctx->bkref_ents + limits->elems[lim_idx];
1918 subexp_idx = dfa->nodes[ent->node].opr.idx;
1919
1920 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1921 subexp_idx, dst_node, dst_idx,
1922 dst_bkref_idx);
1923 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1924 subexp_idx, src_node, src_idx,
1925 src_bkref_idx);
1926
1927 /* In case of:
1928 <src> <dst> ( <subexp> )
1929 ( <subexp> ) <src> <dst>
1930 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1931 if (src_pos == dst_pos)
1932 continue; /* This is unrelated limitation. */
1933 else
1934 return 1;
1935 }
1936 return 0;
1937 }
1938
1939 static int
1940 internal_function
1941 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1942 int subexp_idx, int from_node, int bkref_idx)
1943 {
1944 const re_dfa_t *const dfa = mctx->dfa;
1945 const re_node_set *eclosures = dfa->eclosures + from_node;
1946 int node_idx;
1947
1948 /* Else, we are on the boundary: examine the nodes on the epsilon
1949 closure. */
1950 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1951 {
1952 int node = eclosures->elems[node_idx];
1953 switch (dfa->nodes[node].type)
1954 {
1955 case OP_BACK_REF:
1956 if (bkref_idx != -1)
1957 {
1958 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1959 do
1960 {
1961 int dst, cpos;
1962
1963 if (ent->node != node)
1964 continue;
1965
1966 if (subexp_idx < BITSET_WORD_BITS
1967 && !(ent->eps_reachable_subexps_map
1968 & ((bitset_word_t) 1 << subexp_idx)))
1969 continue;
1970
1971 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1972 OP_CLOSE_SUBEXP cases below. But, if the
1973 destination node is the same node as the source
1974 node, don't recurse because it would cause an
1975 infinite loop: a regex that exhibits this behavior
1976 is ()\1*\1* */
1977 dst = dfa->edests[node].elems[0];
1978 if (dst == from_node)
1979 {
1980 if (boundaries & 1)
1981 return -1;
1982 else /* if (boundaries & 2) */
1983 return 0;
1984 }
1985
1986 cpos =
1987 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988 dst, bkref_idx);
1989 if (cpos == -1 /* && (boundaries & 1) */)
1990 return -1;
1991 if (cpos == 0 && (boundaries & 2))
1992 return 0;
1993
1994 if (subexp_idx < BITSET_WORD_BITS)
1995 ent->eps_reachable_subexps_map
1996 &= ~((bitset_word_t) 1 << subexp_idx);
1997 }
1998 while (ent++->more);
1999 }
2000 break;
2001
2002 case OP_OPEN_SUBEXP:
2003 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2004 return -1;
2005 break;
2006
2007 case OP_CLOSE_SUBEXP:
2008 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2009 return 0;
2010 break;
2011
2012 default:
2013 break;
2014 }
2015 }
2016
2017 return (boundaries & 2) ? 1 : 0;
2018 }
2019
2020 static int
2021 internal_function
2022 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2023 int subexp_idx, int from_node, int str_idx,
2024 int bkref_idx)
2025 {
2026 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2027 int boundaries;
2028
2029 /* If we are outside the range of the subexpression, return -1 or 1. */
2030 if (str_idx < lim->subexp_from)
2031 return -1;
2032
2033 if (lim->subexp_to < str_idx)
2034 return 1;
2035
2036 /* If we are within the subexpression, return 0. */
2037 boundaries = (str_idx == lim->subexp_from);
2038 boundaries |= (str_idx == lim->subexp_to) << 1;
2039 if (boundaries == 0)
2040 return 0;
2041
2042 /* Else, examine epsilon closure. */
2043 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2044 from_node, bkref_idx);
2045 }
2046
2047 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2048 which are against limitations from DEST_NODES. */
2049
2050 static reg_errcode_t
2051 internal_function
2052 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2053 const re_node_set *candidates, re_node_set *limits,
2054 struct re_backref_cache_entry *bkref_ents, int str_idx)
2055 {
2056 reg_errcode_t err;
2057 int node_idx, lim_idx;
2058
2059 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2060 {
2061 int subexp_idx;
2062 struct re_backref_cache_entry *ent;
2063 ent = bkref_ents + limits->elems[lim_idx];
2064
2065 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2066 continue; /* This is unrelated limitation. */
2067
2068 subexp_idx = dfa->nodes[ent->node].opr.idx;
2069 if (ent->subexp_to == str_idx)
2070 {
2071 int ops_node = -1;
2072 int cls_node = -1;
2073 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074 {
2075 int node = dest_nodes->elems[node_idx];
2076 re_token_type_t type = dfa->nodes[node].type;
2077 if (type == OP_OPEN_SUBEXP
2078 && subexp_idx == dfa->nodes[node].opr.idx)
2079 ops_node = node;
2080 else if (type == OP_CLOSE_SUBEXP
2081 && subexp_idx == dfa->nodes[node].opr.idx)
2082 cls_node = node;
2083 }
2084
2085 /* Check the limitation of the open subexpression. */
2086 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2087 if (ops_node >= 0)
2088 {
2089 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2090 candidates);
2091 if (BE (err != REG_NOERROR, 0))
2092 return err;
2093 }
2094
2095 /* Check the limitation of the close subexpression. */
2096 if (cls_node >= 0)
2097 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2098 {
2099 int node = dest_nodes->elems[node_idx];
2100 if (!re_node_set_contains (dfa->inveclosures + node,
2101 cls_node)
2102 && !re_node_set_contains (dfa->eclosures + node,
2103 cls_node))
2104 {
2105 /* It is against this limitation.
2106 Remove it form the current sifted state. */
2107 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2108 candidates);
2109 if (BE (err != REG_NOERROR, 0))
2110 return err;
2111 --node_idx;
2112 }
2113 }
2114 }
2115 else /* (ent->subexp_to != str_idx) */
2116 {
2117 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2118 {
2119 int node = dest_nodes->elems[node_idx];
2120 re_token_type_t type = dfa->nodes[node].type;
2121 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2122 {
2123 if (subexp_idx != dfa->nodes[node].opr.idx)
2124 continue;
2125 /* It is against this limitation.
2126 Remove it form the current sifted state. */
2127 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2128 candidates);
2129 if (BE (err != REG_NOERROR, 0))
2130 return err;
2131 }
2132 }
2133 }
2134 }
2135 return REG_NOERROR;
2136 }
2137
2138 static reg_errcode_t
2139 internal_function
2140 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2141 int str_idx, const re_node_set *candidates)
2142 {
2143 const re_dfa_t *const dfa = mctx->dfa;
2144 reg_errcode_t err;
2145 int node_idx, node;
2146 re_sift_context_t local_sctx;
2147 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2148
2149 if (first_idx == -1)
2150 return REG_NOERROR;
2151
2152 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2153
2154 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2155 {
2156 int enabled_idx;
2157 re_token_type_t type;
2158 struct re_backref_cache_entry *entry;
2159 node = candidates->elems[node_idx];
2160 type = dfa->nodes[node].type;
2161 /* Avoid infinite loop for the REs like "()\1+". */
2162 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2163 continue;
2164 if (type != OP_BACK_REF)
2165 continue;
2166
2167 entry = mctx->bkref_ents + first_idx;
2168 enabled_idx = first_idx;
2169 do
2170 {
2171 int subexp_len;
2172 int to_idx;
2173 int dst_node;
2174 int ret;
2175 re_dfastate_t *cur_state;
2176
2177 if (entry->node != node)
2178 continue;
2179 subexp_len = entry->subexp_to - entry->subexp_from;
2180 to_idx = str_idx + subexp_len;
2181 dst_node = (subexp_len ? dfa->nexts[node]
2182 : dfa->edests[node].elems[0]);
2183
2184 if (to_idx > sctx->last_str_idx
2185 || sctx->sifted_states[to_idx] == NULL
2186 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2187 || check_dst_limits (mctx, &sctx->limits, node,
2188 str_idx, dst_node, to_idx))
2189 continue;
2190
2191 if (local_sctx.sifted_states == NULL)
2192 {
2193 local_sctx = *sctx;
2194 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2195 if (BE (err != REG_NOERROR, 0))
2196 goto free_return;
2197 }
2198 local_sctx.last_node = node;
2199 local_sctx.last_str_idx = str_idx;
2200 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2201 if (BE (ret < 0, 0))
2202 {
2203 err = REG_ESPACE;
2204 goto free_return;
2205 }
2206 cur_state = local_sctx.sifted_states[str_idx];
2207 err = sift_states_backward (mctx, &local_sctx);
2208 if (BE (err != REG_NOERROR, 0))
2209 goto free_return;
2210 if (sctx->limited_states != NULL)
2211 {
2212 err = merge_state_array (dfa, sctx->limited_states,
2213 local_sctx.sifted_states,
2214 str_idx + 1);
2215 if (BE (err != REG_NOERROR, 0))
2216 goto free_return;
2217 }
2218 local_sctx.sifted_states[str_idx] = cur_state;
2219 re_node_set_remove (&local_sctx.limits, enabled_idx);
2220
2221 /* mctx->bkref_ents may have changed, reload the pointer. */
2222 entry = mctx->bkref_ents + enabled_idx;
2223 }
2224 while (enabled_idx++, entry++->more);
2225 }
2226 err = REG_NOERROR;
2227 free_return:
2228 if (local_sctx.sifted_states != NULL)
2229 {
2230 re_node_set_free (&local_sctx.limits);
2231 }
2232
2233 return err;
2234 }
2235
2236
2237 #ifdef RE_ENABLE_I18N
2238 static int
2239 internal_function
2240 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2241 int node_idx, int str_idx, int max_str_idx)
2242 {
2243 const re_dfa_t *const dfa = mctx->dfa;
2244 int naccepted;
2245 /* Check the node can accept `multi byte'. */
2246 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2247 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2248 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2249 dfa->nexts[node_idx]))
2250 /* The node can't accept the `multi byte', or the
2251 destination was already thrown away, then the node
2252 could't accept the current input `multi byte'. */
2253 naccepted = 0;
2254 /* Otherwise, it is sure that the node could accept
2255 `naccepted' bytes input. */
2256 return naccepted;
2257 }
2258 #endif /* RE_ENABLE_I18N */
2259
2260 \f
2261 /* Functions for state transition. */
2262
2263 /* Return the next state to which the current state STATE will transit by
2264 accepting the current input byte, and update STATE_LOG if necessary.
2265 If STATE can accept a multibyte char/collating element/back reference
2266 update the destination of STATE_LOG. */
2267
2268 static re_dfastate_t *
2269 internal_function
2270 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2271 re_dfastate_t *state)
2272 {
2273 re_dfastate_t **trtable;
2274 unsigned char ch;
2275
2276 #ifdef RE_ENABLE_I18N
2277 /* If the current state can accept multibyte. */
2278 if (BE (state->accept_mb, 0))
2279 {
2280 *err = transit_state_mb (mctx, state);
2281 if (BE (*err != REG_NOERROR, 0))
2282 return NULL;
2283 }
2284 #endif /* RE_ENABLE_I18N */
2285
2286 /* Then decide the next state with the single byte. */
2287 #if 0
2288 if (0)
2289 /* don't use transition table */
2290 return transit_state_sb (err, mctx, state);
2291 #endif
2292
2293 /* Use transition table */
2294 ch = re_string_fetch_byte (&mctx->input);
2295 for (;;)
2296 {
2297 trtable = state->trtable;
2298 if (BE (trtable != NULL, 1))
2299 return trtable[ch];
2300
2301 trtable = state->word_trtable;
2302 if (BE (trtable != NULL, 1))
2303 {
2304 unsigned int context;
2305 context
2306 = re_string_context_at (&mctx->input,
2307 re_string_cur_idx (&mctx->input) - 1,
2308 mctx->eflags);
2309 if (IS_WORD_CONTEXT (context))
2310 return trtable[ch + SBC_MAX];
2311 else
2312 return trtable[ch];
2313 }
2314
2315 if (!build_trtable (mctx->dfa, state))
2316 {
2317 *err = REG_ESPACE;
2318 return NULL;
2319 }
2320
2321 /* Retry, we now have a transition table. */
2322 }
2323 }
2324
2325 /* Update the state_log if we need */
2326 re_dfastate_t *
2327 internal_function
2328 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2329 re_dfastate_t *next_state)
2330 {
2331 const re_dfa_t *const dfa = mctx->dfa;
2332 int cur_idx = re_string_cur_idx (&mctx->input);
2333
2334 if (cur_idx > mctx->state_log_top)
2335 {
2336 mctx->state_log[cur_idx] = next_state;
2337 mctx->state_log_top = cur_idx;
2338 }
2339 else if (mctx->state_log[cur_idx] == 0)
2340 {
2341 mctx->state_log[cur_idx] = next_state;
2342 }
2343 else
2344 {
2345 re_dfastate_t *pstate;
2346 unsigned int context;
2347 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2348 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2349 the destination of a multibyte char/collating element/
2350 back reference. Then the next state is the union set of
2351 these destinations and the results of the transition table. */
2352 pstate = mctx->state_log[cur_idx];
2353 log_nodes = pstate->entrance_nodes;
2354 if (next_state != NULL)
2355 {
2356 table_nodes = next_state->entrance_nodes;
2357 *err = re_node_set_init_union (&next_nodes, table_nodes,
2358 log_nodes);
2359 if (BE (*err != REG_NOERROR, 0))
2360 return NULL;
2361 }
2362 else
2363 next_nodes = *log_nodes;
2364 /* Note: We already add the nodes of the initial state,
2365 then we don't need to add them here. */
2366
2367 context = re_string_context_at (&mctx->input,
2368 re_string_cur_idx (&mctx->input) - 1,
2369 mctx->eflags);
2370 next_state = mctx->state_log[cur_idx]
2371 = re_acquire_state_context (err, dfa, &next_nodes, context);
2372 /* We don't need to check errors here, since the return value of
2373 this function is next_state and ERR is already set. */
2374
2375 if (table_nodes != NULL)
2376 re_node_set_free (&next_nodes);
2377 }
2378
2379 if (BE (dfa->nbackref, 0) && next_state != NULL)
2380 {
2381 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2382 later. We must check them here, since the back references in the
2383 next state might use them. */
2384 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2385 cur_idx);
2386 if (BE (*err != REG_NOERROR, 0))
2387 return NULL;
2388
2389 /* If the next state has back references. */
2390 if (next_state->has_backref)
2391 {
2392 *err = transit_state_bkref (mctx, &next_state->nodes);
2393 if (BE (*err != REG_NOERROR, 0))
2394 return NULL;
2395 next_state = mctx->state_log[cur_idx];
2396 }
2397 }
2398
2399 return next_state;
2400 }
2401
2402 /* Skip bytes in the input that correspond to part of a
2403 multi-byte match, then look in the log for a state
2404 from which to restart matching. */
2405 re_dfastate_t *
2406 internal_function
2407 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2408 {
2409 re_dfastate_t *cur_state;
2410 do
2411 {
2412 int max = mctx->state_log_top;
2413 int cur_str_idx = re_string_cur_idx (&mctx->input);
2414
2415 do
2416 {
2417 if (++cur_str_idx > max)
2418 return NULL;
2419 re_string_skip_bytes (&mctx->input, 1);
2420 }
2421 while (mctx->state_log[cur_str_idx] == NULL);
2422
2423 cur_state = merge_state_with_log (err, mctx, NULL);
2424 }
2425 while (*err == REG_NOERROR && cur_state == NULL);
2426 return cur_state;
2427 }
2428
2429 /* Helper functions for transit_state. */
2430
2431 /* From the node set CUR_NODES, pick up the nodes whose types are
2432 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2433 expression. And register them to use them later for evaluating the
2434 correspoding back references. */
2435
2436 static reg_errcode_t
2437 internal_function
2438 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2439 int str_idx)
2440 {
2441 const re_dfa_t *const dfa = mctx->dfa;
2442 int node_idx;
2443 reg_errcode_t err;
2444
2445 /* TODO: This isn't efficient.
2446 Because there might be more than one nodes whose types are
2447 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2448 nodes.
2449 E.g. RE: (a){2} */
2450 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2451 {
2452 int node = cur_nodes->elems[node_idx];
2453 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2454 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2455 && (dfa->used_bkref_map
2456 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2457 {
2458 err = match_ctx_add_subtop (mctx, node, str_idx);
2459 if (BE (err != REG_NOERROR, 0))
2460 return err;
2461 }
2462 }
2463 return REG_NOERROR;
2464 }
2465
2466 #if 0
2467 /* Return the next state to which the current state STATE will transit by
2468 accepting the current input byte. */
2469
2470 static re_dfastate_t *
2471 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2472 re_dfastate_t *state)
2473 {
2474 const re_dfa_t *const dfa = mctx->dfa;
2475 re_node_set next_nodes;
2476 re_dfastate_t *next_state;
2477 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2478 unsigned int context;
2479
2480 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2481 if (BE (*err != REG_NOERROR, 0))
2482 return NULL;
2483 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2484 {
2485 int cur_node = state->nodes.elems[node_cnt];
2486 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2487 {
2488 *err = re_node_set_merge (&next_nodes,
2489 dfa->eclosures + dfa->nexts[cur_node]);
2490 if (BE (*err != REG_NOERROR, 0))
2491 {
2492 re_node_set_free (&next_nodes);
2493 return NULL;
2494 }
2495 }
2496 }
2497 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2498 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2499 /* We don't need to check errors here, since the return value of
2500 this function is next_state and ERR is already set. */
2501
2502 re_node_set_free (&next_nodes);
2503 re_string_skip_bytes (&mctx->input, 1);
2504 return next_state;
2505 }
2506 #endif
2507
2508 #ifdef RE_ENABLE_I18N
2509 static reg_errcode_t
2510 internal_function
2511 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2512 {
2513 const re_dfa_t *const dfa = mctx->dfa;
2514 reg_errcode_t err;
2515 int i;
2516
2517 for (i = 0; i < pstate->nodes.nelem; ++i)
2518 {
2519 re_node_set dest_nodes, *new_nodes;
2520 int cur_node_idx = pstate->nodes.elems[i];
2521 int naccepted, dest_idx;
2522 unsigned int context;
2523 re_dfastate_t *dest_state;
2524
2525 if (!dfa->nodes[cur_node_idx].accept_mb)
2526 continue;
2527
2528 if (dfa->nodes[cur_node_idx].constraint)
2529 {
2530 context = re_string_context_at (&mctx->input,
2531 re_string_cur_idx (&mctx->input),
2532 mctx->eflags);
2533 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2534 context))
2535 continue;
2536 }
2537
2538 /* How many bytes the node can accept? */
2539 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2540 re_string_cur_idx (&mctx->input));
2541 if (naccepted == 0)
2542 continue;
2543
2544 /* The node can accepts `naccepted' bytes. */
2545 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2546 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2547 : mctx->max_mb_elem_len);
2548 err = clean_state_log_if_needed (mctx, dest_idx);
2549 if (BE (err != REG_NOERROR, 0))
2550 return err;
2551 #ifdef DEBUG
2552 assert (dfa->nexts[cur_node_idx] != -1);
2553 #endif
2554 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2555
2556 dest_state = mctx->state_log[dest_idx];
2557 if (dest_state == NULL)
2558 dest_nodes = *new_nodes;
2559 else
2560 {
2561 err = re_node_set_init_union (&dest_nodes,
2562 dest_state->entrance_nodes, new_nodes);
2563 if (BE (err != REG_NOERROR, 0))
2564 return err;
2565 }
2566 context = re_string_context_at (&mctx->input, dest_idx - 1,
2567 mctx->eflags);
2568 mctx->state_log[dest_idx]
2569 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2570 if (dest_state != NULL)
2571 re_node_set_free (&dest_nodes);
2572 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2573 return err;
2574 }
2575 return REG_NOERROR;
2576 }
2577 #endif /* RE_ENABLE_I18N */
2578
2579 static reg_errcode_t
2580 internal_function
2581 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2582 {
2583 const re_dfa_t *const dfa = mctx->dfa;
2584 reg_errcode_t err;
2585 int i;
2586 int cur_str_idx = re_string_cur_idx (&mctx->input);
2587
2588 for (i = 0; i < nodes->nelem; ++i)
2589 {
2590 int dest_str_idx, prev_nelem, bkc_idx;
2591 int node_idx = nodes->elems[i];
2592 unsigned int context;
2593 const re_token_t *node = dfa->nodes + node_idx;
2594 re_node_set *new_dest_nodes;
2595
2596 /* Check whether `node' is a backreference or not. */
2597 if (node->type != OP_BACK_REF)
2598 continue;
2599
2600 if (node->constraint)
2601 {
2602 context = re_string_context_at (&mctx->input, cur_str_idx,
2603 mctx->eflags);
2604 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2605 continue;
2606 }
2607
2608 /* `node' is a backreference.
2609 Check the substring which the substring matched. */
2610 bkc_idx = mctx->nbkref_ents;
2611 err = get_subexp (mctx, node_idx, cur_str_idx);
2612 if (BE (err != REG_NOERROR, 0))
2613 goto free_return;
2614
2615 /* And add the epsilon closures (which is `new_dest_nodes') of
2616 the backreference to appropriate state_log. */
2617 #ifdef DEBUG
2618 assert (dfa->nexts[node_idx] != -1);
2619 #endif
2620 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2621 {
2622 int subexp_len;
2623 re_dfastate_t *dest_state;
2624 struct re_backref_cache_entry *bkref_ent;
2625 bkref_ent = mctx->bkref_ents + bkc_idx;
2626 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2627 continue;
2628 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2629 new_dest_nodes = (subexp_len == 0
2630 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2631 : dfa->eclosures + dfa->nexts[node_idx]);
2632 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2633 - bkref_ent->subexp_from);
2634 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2635 mctx->eflags);
2636 dest_state = mctx->state_log[dest_str_idx];
2637 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2638 : mctx->state_log[cur_str_idx]->nodes.nelem);
2639 /* Add `new_dest_node' to state_log. */
2640 if (dest_state == NULL)
2641 {
2642 mctx->state_log[dest_str_idx]
2643 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2644 context);
2645 if (BE (mctx->state_log[dest_str_idx] == NULL
2646 && err != REG_NOERROR, 0))
2647 goto free_return;
2648 }
2649 else
2650 {
2651 re_node_set dest_nodes;
2652 err = re_node_set_init_union (&dest_nodes,
2653 dest_state->entrance_nodes,
2654 new_dest_nodes);
2655 if (BE (err != REG_NOERROR, 0))
2656 {
2657 re_node_set_free (&dest_nodes);
2658 goto free_return;
2659 }
2660 mctx->state_log[dest_str_idx]
2661 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2662 re_node_set_free (&dest_nodes);
2663 if (BE (mctx->state_log[dest_str_idx] == NULL
2664 && err != REG_NOERROR, 0))
2665 goto free_return;
2666 }
2667 /* We need to check recursively if the backreference can epsilon
2668 transit. */
2669 if (subexp_len == 0
2670 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2671 {
2672 err = check_subexp_matching_top (mctx, new_dest_nodes,
2673 cur_str_idx);
2674 if (BE (err != REG_NOERROR, 0))
2675 goto free_return;
2676 err = transit_state_bkref (mctx, new_dest_nodes);
2677 if (BE (err != REG_NOERROR, 0))
2678 goto free_return;
2679 }
2680 }
2681 }
2682 err = REG_NOERROR;
2683 free_return:
2684 return err;
2685 }
2686
2687 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2688 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2689 Note that we might collect inappropriate candidates here.
2690 However, the cost of checking them strictly here is too high, then we
2691 delay these checking for prune_impossible_nodes(). */
2692
2693 static reg_errcode_t
2694 internal_function
2695 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2696 {
2697 const re_dfa_t *const dfa = mctx->dfa;
2698 int subexp_num, sub_top_idx;
2699 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2700 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2701 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2702 if (cache_idx != -1)
2703 {
2704 const struct re_backref_cache_entry *entry
2705 = mctx->bkref_ents + cache_idx;
2706 do
2707 if (entry->node == bkref_node)
2708 return REG_NOERROR; /* We already checked it. */
2709 while (entry++->more);
2710 }
2711
2712 subexp_num = dfa->nodes[bkref_node].opr.idx;
2713
2714 /* For each sub expression */
2715 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2716 {
2717 reg_errcode_t err;
2718 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2719 re_sub_match_last_t *sub_last;
2720 int sub_last_idx, sl_str, bkref_str_off;
2721
2722 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2723 continue; /* It isn't related. */
2724
2725 sl_str = sub_top->str_idx;
2726 bkref_str_off = bkref_str_idx;
2727 /* At first, check the last node of sub expressions we already
2728 evaluated. */
2729 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2730 {
2731 int sl_str_diff;
2732 sub_last = sub_top->lasts[sub_last_idx];
2733 sl_str_diff = sub_last->str_idx - sl_str;
2734 /* The matched string by the sub expression match with the substring
2735 at the back reference? */
2736 if (sl_str_diff > 0)
2737 {
2738 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2739 {
2740 /* Not enough chars for a successful match. */
2741 if (bkref_str_off + sl_str_diff > mctx->input.len)
2742 break;
2743
2744 err = clean_state_log_if_needed (mctx,
2745 bkref_str_off
2746 + sl_str_diff);
2747 if (BE (err != REG_NOERROR, 0))
2748 return err;
2749 buf = (const char *) re_string_get_buffer (&mctx->input);
2750 }
2751 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2752 /* We don't need to search this sub expression any more. */
2753 break;
2754 }
2755 bkref_str_off += sl_str_diff;
2756 sl_str += sl_str_diff;
2757 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2758 bkref_str_idx);
2759
2760 /* Reload buf, since the preceding call might have reallocated
2761 the buffer. */
2762 buf = (const char *) re_string_get_buffer (&mctx->input);
2763
2764 if (err == REG_NOMATCH)
2765 continue;
2766 if (BE (err != REG_NOERROR, 0))
2767 return err;
2768 }
2769
2770 if (sub_last_idx < sub_top->nlasts)
2771 continue;
2772 if (sub_last_idx > 0)
2773 ++sl_str;
2774 /* Then, search for the other last nodes of the sub expression. */
2775 for (; sl_str <= bkref_str_idx; ++sl_str)
2776 {
2777 int cls_node, sl_str_off;
2778 const re_node_set *nodes;
2779 sl_str_off = sl_str - sub_top->str_idx;
2780 /* The matched string by the sub expression match with the substring
2781 at the back reference? */
2782 if (sl_str_off > 0)
2783 {
2784 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2785 {
2786 /* If we are at the end of the input, we cannot match. */
2787 if (bkref_str_off >= mctx->input.len)
2788 break;
2789
2790 err = extend_buffers (mctx);
2791 if (BE (err != REG_NOERROR, 0))
2792 return err;
2793
2794 buf = (const char *) re_string_get_buffer (&mctx->input);
2795 }
2796 if (buf [bkref_str_off++] != buf[sl_str - 1])
2797 break; /* We don't need to search this sub expression
2798 any more. */
2799 }
2800 if (mctx->state_log[sl_str] == NULL)
2801 continue;
2802 /* Does this state have a ')' of the sub expression? */
2803 nodes = &mctx->state_log[sl_str]->nodes;
2804 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2805 OP_CLOSE_SUBEXP);
2806 if (cls_node == -1)
2807 continue; /* No. */
2808 if (sub_top->path == NULL)
2809 {
2810 sub_top->path = calloc (sizeof (state_array_t),
2811 sl_str - sub_top->str_idx + 1);
2812 if (sub_top->path == NULL)
2813 return REG_ESPACE;
2814 }
2815 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2816 in the current context? */
2817 err = check_arrival (mctx, sub_top->path, sub_top->node,
2818 sub_top->str_idx, cls_node, sl_str,
2819 OP_CLOSE_SUBEXP);
2820 if (err == REG_NOMATCH)
2821 continue;
2822 if (BE (err != REG_NOERROR, 0))
2823 return err;
2824 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2825 if (BE (sub_last == NULL, 0))
2826 return REG_ESPACE;
2827 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2828 bkref_str_idx);
2829 if (err == REG_NOMATCH)
2830 continue;
2831 }
2832 }
2833 return REG_NOERROR;
2834 }
2835
2836 /* Helper functions for get_subexp(). */
2837
2838 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2839 If it can arrive, register the sub expression expressed with SUB_TOP
2840 and SUB_LAST. */
2841
2842 static reg_errcode_t
2843 internal_function
2844 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2845 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2846 {
2847 reg_errcode_t err;
2848 int to_idx;
2849 /* Can the subexpression arrive the back reference? */
2850 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2851 sub_last->str_idx, bkref_node, bkref_str,
2852 OP_OPEN_SUBEXP);
2853 if (err != REG_NOERROR)
2854 return err;
2855 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2856 sub_last->str_idx);
2857 if (BE (err != REG_NOERROR, 0))
2858 return err;
2859 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2860 return clean_state_log_if_needed (mctx, to_idx);
2861 }
2862
2863 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2864 Search '(' if FL_OPEN, or search ')' otherwise.
2865 TODO: This function isn't efficient...
2866 Because there might be more than one nodes whose types are
2867 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2868 nodes.
2869 E.g. RE: (a){2} */
2870
2871 static int
2872 internal_function
2873 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2874 int subexp_idx, int type)
2875 {
2876 int cls_idx;
2877 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2878 {
2879 int cls_node = nodes->elems[cls_idx];
2880 const re_token_t *node = dfa->nodes + cls_node;
2881 if (node->type == type
2882 && node->opr.idx == subexp_idx)
2883 return cls_node;
2884 }
2885 return -1;
2886 }
2887
2888 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2889 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2890 heavily reused.
2891 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2892
2893 static reg_errcode_t
2894 internal_function
2895 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2896 int top_str, int last_node, int last_str, int type)
2897 {
2898 const re_dfa_t *const dfa = mctx->dfa;
2899 reg_errcode_t err = REG_NOERROR;
2900 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2901 re_dfastate_t *cur_state = NULL;
2902 re_node_set *cur_nodes, next_nodes;
2903 re_dfastate_t **backup_state_log;
2904 unsigned int context;
2905
2906 subexp_num = dfa->nodes[top_node].opr.idx;
2907 /* Extend the buffer if we need. */
2908 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2909 {
2910 re_dfastate_t **new_array;
2911 int old_alloc = path->alloc;
2912 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2913 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2914 if (BE (new_array == NULL, 0))
2915 {
2916 path->alloc = old_alloc;
2917 return REG_ESPACE;
2918 }
2919 path->array = new_array;
2920 memset (new_array + old_alloc, '\0',
2921 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2922 }
2923
2924 str_idx = path->next_idx ? path->next_idx : top_str;
2925
2926 /* Temporary modify MCTX. */
2927 backup_state_log = mctx->state_log;
2928 backup_cur_idx = mctx->input.cur_idx;
2929 mctx->state_log = path->array;
2930 mctx->input.cur_idx = str_idx;
2931
2932 /* Setup initial node set. */
2933 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2934 if (str_idx == top_str)
2935 {
2936 err = re_node_set_init_1 (&next_nodes, top_node);
2937 if (BE (err != REG_NOERROR, 0))
2938 return err;
2939 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2940 if (BE (err != REG_NOERROR, 0))
2941 {
2942 re_node_set_free (&next_nodes);
2943 return err;
2944 }
2945 }
2946 else
2947 {
2948 cur_state = mctx->state_log[str_idx];
2949 if (cur_state && cur_state->has_backref)
2950 {
2951 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2952 if (BE (err != REG_NOERROR, 0))
2953 return err;
2954 }
2955 else
2956 re_node_set_init_empty (&next_nodes);
2957 }
2958 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2959 {
2960 if (next_nodes.nelem)
2961 {
2962 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2963 subexp_num, type);
2964 if (BE (err != REG_NOERROR, 0))
2965 {
2966 re_node_set_free (&next_nodes);
2967 return err;
2968 }
2969 }
2970 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2971 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2972 {
2973 re_node_set_free (&next_nodes);
2974 return err;
2975 }
2976 mctx->state_log[str_idx] = cur_state;
2977 }
2978
2979 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2980 {
2981 re_node_set_empty (&next_nodes);
2982 if (mctx->state_log[str_idx + 1])
2983 {
2984 err = re_node_set_merge (&next_nodes,
2985 &mctx->state_log[str_idx + 1]->nodes);
2986 if (BE (err != REG_NOERROR, 0))
2987 {
2988 re_node_set_free (&next_nodes);
2989 return err;
2990 }
2991 }
2992 if (cur_state)
2993 {
2994 err = check_arrival_add_next_nodes (mctx, str_idx,
2995 &cur_state->non_eps_nodes,
2996 &next_nodes);
2997 if (BE (err != REG_NOERROR, 0))
2998 {
2999 re_node_set_free (&next_nodes);
3000 return err;
3001 }
3002 }
3003 ++str_idx;
3004 if (next_nodes.nelem)
3005 {
3006 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3007 if (BE (err != REG_NOERROR, 0))
3008 {
3009 re_node_set_free (&next_nodes);
3010 return err;
3011 }
3012 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3013 subexp_num, type);
3014 if (BE (err != REG_NOERROR, 0))
3015 {
3016 re_node_set_free (&next_nodes);
3017 return err;
3018 }
3019 }
3020 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3021 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3022 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3023 {
3024 re_node_set_free (&next_nodes);
3025 return err;
3026 }
3027 mctx->state_log[str_idx] = cur_state;
3028 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3029 }
3030 re_node_set_free (&next_nodes);
3031 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3032 : &mctx->state_log[last_str]->nodes);
3033 path->next_idx = str_idx;
3034
3035 /* Fix MCTX. */
3036 mctx->state_log = backup_state_log;
3037 mctx->input.cur_idx = backup_cur_idx;
3038
3039 /* Then check the current node set has the node LAST_NODE. */
3040 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3041 return REG_NOERROR;
3042
3043 return REG_NOMATCH;
3044 }
3045
3046 /* Helper functions for check_arrival. */
3047
3048 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3049 to NEXT_NODES.
3050 TODO: This function is similar to the functions transit_state*(),
3051 however this function has many additional works.
3052 Can't we unify them? */
3053
3054 static reg_errcode_t
3055 internal_function
3056 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3057 re_node_set *cur_nodes, re_node_set *next_nodes)
3058 {
3059 const re_dfa_t *const dfa = mctx->dfa;
3060 int result;
3061 int cur_idx;
3062 #ifdef RE_ENABLE_I18N
3063 reg_errcode_t err = REG_NOERROR;
3064 #endif
3065 re_node_set union_set;
3066 re_node_set_init_empty (&union_set);
3067 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3068 {
3069 int naccepted = 0;
3070 int cur_node = cur_nodes->elems[cur_idx];
3071 #ifdef DEBUG
3072 re_token_type_t type = dfa->nodes[cur_node].type;
3073 assert (!IS_EPSILON_NODE (type));
3074 #endif
3075 #ifdef RE_ENABLE_I18N
3076 /* If the node may accept `multi byte'. */
3077 if (dfa->nodes[cur_node].accept_mb)
3078 {
3079 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3080 str_idx);
3081 if (naccepted > 1)
3082 {
3083 re_dfastate_t *dest_state;
3084 int next_node = dfa->nexts[cur_node];
3085 int next_idx = str_idx + naccepted;
3086 dest_state = mctx->state_log[next_idx];
3087 re_node_set_empty (&union_set);
3088 if (dest_state)
3089 {
3090 err = re_node_set_merge (&union_set, &dest_state->nodes);
3091 if (BE (err != REG_NOERROR, 0))
3092 {
3093 re_node_set_free (&union_set);
3094 return err;
3095 }
3096 }
3097 result = re_node_set_insert (&union_set, next_node);
3098 if (BE (result < 0, 0))
3099 {
3100 re_node_set_free (&union_set);
3101 return REG_ESPACE;
3102 }
3103 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3104 &union_set);
3105 if (BE (mctx->state_log[next_idx] == NULL
3106 && err != REG_NOERROR, 0))
3107 {
3108 re_node_set_free (&union_set);
3109 return err;
3110 }
3111 }
3112 }
3113 #endif /* RE_ENABLE_I18N */
3114 if (naccepted
3115 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3116 {
3117 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3118 if (BE (result < 0, 0))
3119 {
3120 re_node_set_free (&union_set);
3121 return REG_ESPACE;
3122 }
3123 }
3124 }
3125 re_node_set_free (&union_set);
3126 return REG_NOERROR;
3127 }
3128
3129 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3130 CUR_NODES, however exclude the nodes which are:
3131 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3132 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3133 */
3134
3135 static reg_errcode_t
3136 internal_function
3137 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3138 int ex_subexp, int type)
3139 {
3140 reg_errcode_t err;
3141 int idx, outside_node;
3142 re_node_set new_nodes;
3143 #ifdef DEBUG
3144 assert (cur_nodes->nelem);
3145 #endif
3146 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3147 if (BE (err != REG_NOERROR, 0))
3148 return err;
3149 /* Create a new node set NEW_NODES with the nodes which are epsilon
3150 closures of the node in CUR_NODES. */
3151
3152 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3153 {
3154 int cur_node = cur_nodes->elems[idx];
3155 const re_node_set *eclosure = dfa->eclosures + cur_node;
3156 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3157 if (outside_node == -1)
3158 {
3159 /* There are no problematic nodes, just merge them. */
3160 err = re_node_set_merge (&new_nodes, eclosure);
3161 if (BE (err != REG_NOERROR, 0))
3162 {
3163 re_node_set_free (&new_nodes);
3164 return err;
3165 }
3166 }
3167 else
3168 {
3169 /* There are problematic nodes, re-calculate incrementally. */
3170 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3171 ex_subexp, type);
3172 if (BE (err != REG_NOERROR, 0))
3173 {
3174 re_node_set_free (&new_nodes);
3175 return err;
3176 }
3177 }
3178 }
3179 re_node_set_free (cur_nodes);
3180 *cur_nodes = new_nodes;
3181 return REG_NOERROR;
3182 }
3183
3184 /* Helper function for check_arrival_expand_ecl.
3185 Check incrementally the epsilon closure of TARGET, and if it isn't
3186 problematic append it to DST_NODES. */
3187
3188 static reg_errcode_t
3189 internal_function
3190 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3191 int target, int ex_subexp, int type)
3192 {
3193 int cur_node;
3194 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3195 {
3196 int err;
3197
3198 if (dfa->nodes[cur_node].type == type
3199 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3200 {
3201 if (type == OP_CLOSE_SUBEXP)
3202 {
3203 err = re_node_set_insert (dst_nodes, cur_node);
3204 if (BE (err == -1, 0))
3205 return REG_ESPACE;
3206 }
3207 break;
3208 }
3209 err = re_node_set_insert (dst_nodes, cur_node);
3210 if (BE (err == -1, 0))
3211 return REG_ESPACE;
3212 if (dfa->edests[cur_node].nelem == 0)
3213 break;
3214 if (dfa->edests[cur_node].nelem == 2)
3215 {
3216 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3217 dfa->edests[cur_node].elems[1],
3218 ex_subexp, type);
3219 if (BE (err != REG_NOERROR, 0))
3220 return err;
3221 }
3222 cur_node = dfa->edests[cur_node].elems[0];
3223 }
3224 return REG_NOERROR;
3225 }
3226
3227
3228 /* For all the back references in the current state, calculate the
3229 destination of the back references by the appropriate entry
3230 in MCTX->BKREF_ENTS. */
3231
3232 static reg_errcode_t
3233 internal_function
3234 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3235 int cur_str, int subexp_num, int type)
3236 {
3237 const re_dfa_t *const dfa = mctx->dfa;
3238 reg_errcode_t err;
3239 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3240 struct re_backref_cache_entry *ent;
3241
3242 if (cache_idx_start == -1)
3243 return REG_NOERROR;
3244
3245 restart:
3246 ent = mctx->bkref_ents + cache_idx_start;
3247 do
3248 {
3249 int to_idx, next_node;
3250
3251 /* Is this entry ENT is appropriate? */
3252 if (!re_node_set_contains (cur_nodes, ent->node))
3253 continue; /* No. */
3254
3255 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3256 /* Calculate the destination of the back reference, and append it
3257 to MCTX->STATE_LOG. */
3258 if (to_idx == cur_str)
3259 {
3260 /* The backreference did epsilon transit, we must re-check all the
3261 node in the current state. */
3262 re_node_set new_dests;
3263 reg_errcode_t err2, err3;
3264 next_node = dfa->edests[ent->node].elems[0];
3265 if (re_node_set_contains (cur_nodes, next_node))
3266 continue;
3267 err = re_node_set_init_1 (&new_dests, next_node);
3268 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3269 err3 = re_node_set_merge (cur_nodes, &new_dests);
3270 re_node_set_free (&new_dests);
3271 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3272 || err3 != REG_NOERROR, 0))
3273 {
3274 err = (err != REG_NOERROR ? err
3275 : (err2 != REG_NOERROR ? err2 : err3));
3276 return err;
3277 }
3278 /* TODO: It is still inefficient... */
3279 goto restart;
3280 }
3281 else
3282 {
3283 re_node_set union_set;
3284 next_node = dfa->nexts[ent->node];
3285 if (mctx->state_log[to_idx])
3286 {
3287 int ret;
3288 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3289 next_node))
3290 continue;
3291 err = re_node_set_init_copy (&union_set,
3292 &mctx->state_log[to_idx]->nodes);
3293 ret = re_node_set_insert (&union_set, next_node);
3294 if (BE (err != REG_NOERROR || ret < 0, 0))
3295 {
3296 re_node_set_free (&union_set);
3297 err = err != REG_NOERROR ? err : REG_ESPACE;
3298 return err;
3299 }
3300 }
3301 else
3302 {
3303 err = re_node_set_init_1 (&union_set, next_node);
3304 if (BE (err != REG_NOERROR, 0))
3305 return err;
3306 }
3307 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3308 re_node_set_free (&union_set);
3309 if (BE (mctx->state_log[to_idx] == NULL
3310 && err != REG_NOERROR, 0))
3311 return err;
3312 }
3313 }
3314 while (ent++->more);
3315 return REG_NOERROR;
3316 }
3317
3318 /* Build transition table for the state.
3319 Return 1 if succeeded, otherwise return NULL. */
3320
3321 static int
3322 internal_function
3323 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3324 {
3325 reg_errcode_t err;
3326 int i, j, ch, need_word_trtable = 0;
3327 bitset_word_t elem, mask;
3328 bool dests_node_malloced = false;
3329 bool dest_states_malloced = false;
3330 int ndests; /* Number of the destination states from `state'. */
3331 re_dfastate_t **trtable;
3332 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3333 re_node_set follows, *dests_node;
3334 bitset_t *dests_ch;
3335 bitset_t acceptable;
3336
3337 struct dests_alloc
3338 {
3339 re_node_set dests_node[SBC_MAX];
3340 bitset_t dests_ch[SBC_MAX];
3341 } *dests_alloc;
3342
3343 /* We build DFA states which corresponds to the destination nodes
3344 from `state'. `dests_node[i]' represents the nodes which i-th
3345 destination state contains, and `dests_ch[i]' represents the
3346 characters which i-th destination state accepts. */
3347 #ifdef HAVE_ALLOCA
3348 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3349 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3350 else
3351 #endif
3352 {
3353 dests_alloc = re_malloc (struct dests_alloc, 1);
3354 if (BE (dests_alloc == NULL, 0))
3355 return 0;
3356 dests_node_malloced = true;
3357 }
3358 dests_node = dests_alloc->dests_node;
3359 dests_ch = dests_alloc->dests_ch;
3360
3361 /* Initialize transiton table. */
3362 state->word_trtable = state->trtable = NULL;
3363
3364 /* At first, group all nodes belonging to `state' into several
3365 destinations. */
3366 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3367 if (BE (ndests <= 0, 0))
3368 {
3369 if (dests_node_malloced)
3370 free (dests_alloc);
3371 /* Return 0 in case of an error, 1 otherwise. */
3372 if (ndests == 0)
3373 {
3374 state->trtable = (re_dfastate_t **)
3375 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3376 return 1;
3377 }
3378 return 0;
3379 }
3380
3381 err = re_node_set_alloc (&follows, ndests + 1);
3382 if (BE (err != REG_NOERROR, 0))
3383 goto out_free;
3384
3385 /* Avoid arithmetic overflow in size calculation. */
3386 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3387 / (3 * sizeof (re_dfastate_t *)))
3388 < ndests),
3389 0))
3390 goto out_free;
3391
3392 #ifdef HAVE_ALLOCA
3393 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3394 + ndests * 3 * sizeof (re_dfastate_t *)))
3395 dest_states = (re_dfastate_t **)
3396 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3397 else
3398 #endif
3399 {
3400 dest_states = (re_dfastate_t **)
3401 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3402 if (BE (dest_states == NULL, 0))
3403 {
3404 out_free:
3405 if (dest_states_malloced)
3406 free (dest_states);
3407 re_node_set_free (&follows);
3408 for (i = 0; i < ndests; ++i)
3409 re_node_set_free (dests_node + i);
3410 if (dests_node_malloced)
3411 free (dests_alloc);
3412 return 0;
3413 }
3414 dest_states_malloced = true;
3415 }
3416 dest_states_word = dest_states + ndests;
3417 dest_states_nl = dest_states_word + ndests;
3418 bitset_empty (acceptable);
3419
3420 /* Then build the states for all destinations. */
3421 for (i = 0; i < ndests; ++i)
3422 {
3423 int next_node;
3424 re_node_set_empty (&follows);
3425 /* Merge the follows of this destination states. */
3426 for (j = 0; j < dests_node[i].nelem; ++j)
3427 {
3428 next_node = dfa->nexts[dests_node[i].elems[j]];
3429 if (next_node != -1)
3430 {
3431 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3432 if (BE (err != REG_NOERROR, 0))
3433 goto out_free;
3434 }
3435 }
3436 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3437 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3438 goto out_free;
3439 /* If the new state has context constraint,
3440 build appropriate states for these contexts. */
3441 if (dest_states[i]->has_constraint)
3442 {
3443 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3444 CONTEXT_WORD);
3445 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3446 goto out_free;
3447
3448 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3449 need_word_trtable = 1;
3450
3451 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3452 CONTEXT_NEWLINE);
3453 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3454 goto out_free;
3455 }
3456 else
3457 {
3458 dest_states_word[i] = dest_states[i];
3459 dest_states_nl[i] = dest_states[i];
3460 }
3461 bitset_merge (acceptable, dests_ch[i]);
3462 }
3463
3464 if (!BE (need_word_trtable, 0))
3465 {
3466 /* We don't care about whether the following character is a word
3467 character, or we are in a single-byte character set so we can
3468 discern by looking at the character code: allocate a
3469 256-entry transition table. */
3470 trtable = state->trtable =
3471 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3472 if (BE (trtable == NULL, 0))
3473 goto out_free;
3474
3475 /* For all characters ch...: */
3476 for (i = 0; i < BITSET_WORDS; ++i)
3477 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3478 elem;
3479 mask <<= 1, elem >>= 1, ++ch)
3480 if (BE (elem & 1, 0))
3481 {
3482 /* There must be exactly one destination which accepts
3483 character ch. See group_nodes_into_DFAstates. */
3484 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3485 ;
3486
3487 /* j-th destination accepts the word character ch. */
3488 if (dfa->word_char[i] & mask)
3489 trtable[ch] = dest_states_word[j];
3490 else
3491 trtable[ch] = dest_states[j];
3492 }
3493 }
3494 else
3495 {
3496 /* We care about whether the following character is a word
3497 character, and we are in a multi-byte character set: discern
3498 by looking at the character code: build two 256-entry
3499 transition tables, one starting at trtable[0] and one
3500 starting at trtable[SBC_MAX]. */
3501 trtable = state->word_trtable =
3502 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3503 if (BE (trtable == NULL, 0))
3504 goto out_free;
3505
3506 /* For all characters ch...: */
3507 for (i = 0; i < BITSET_WORDS; ++i)
3508 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3509 elem;
3510 mask <<= 1, elem >>= 1, ++ch)
3511 if (BE (elem & 1, 0))
3512 {
3513 /* There must be exactly one destination which accepts
3514 character ch. See group_nodes_into_DFAstates. */
3515 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3516 ;
3517
3518 /* j-th destination accepts the word character ch. */
3519 trtable[ch] = dest_states[j];
3520 trtable[ch + SBC_MAX] = dest_states_word[j];
3521 }
3522 }
3523
3524 /* new line */
3525 if (bitset_contain (acceptable, NEWLINE_CHAR))
3526 {
3527 /* The current state accepts newline character. */
3528 for (j = 0; j < ndests; ++j)
3529 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3530 {
3531 /* k-th destination accepts newline character. */
3532 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3533 if (need_word_trtable)
3534 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3535 /* There must be only one destination which accepts
3536 newline. See group_nodes_into_DFAstates. */
3537 break;
3538 }
3539 }
3540
3541 if (dest_states_malloced)
3542 free (dest_states);
3543
3544 re_node_set_free (&follows);
3545 for (i = 0; i < ndests; ++i)
3546 re_node_set_free (dests_node + i);
3547
3548 if (dests_node_malloced)
3549 free (dests_alloc);
3550
3551 return 1;
3552 }
3553
3554 /* Group all nodes belonging to STATE into several destinations.
3555 Then for all destinations, set the nodes belonging to the destination
3556 to DESTS_NODE[i] and set the characters accepted by the destination
3557 to DEST_CH[i]. This function return the number of destinations. */
3558
3559 static int
3560 internal_function
3561 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3562 re_node_set *dests_node, bitset_t *dests_ch)
3563 {
3564 reg_errcode_t err;
3565 int result;
3566 int i, j, k;
3567 int ndests; /* Number of the destinations from `state'. */
3568 bitset_t accepts; /* Characters a node can accept. */
3569 const re_node_set *cur_nodes = &state->nodes;
3570 bitset_empty (accepts);
3571 ndests = 0;
3572
3573 /* For all the nodes belonging to `state', */
3574 for (i = 0; i < cur_nodes->nelem; ++i)
3575 {
3576 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3577 re_token_type_t type = node->type;
3578 unsigned int constraint = node->constraint;
3579
3580 /* Enumerate all single byte character this node can accept. */
3581 if (type == CHARACTER)
3582 bitset_set (accepts, node->opr.c);
3583 else if (type == SIMPLE_BRACKET)
3584 {
3585 bitset_merge (accepts, node->opr.sbcset);
3586 }
3587 else if (type == OP_PERIOD)
3588 {
3589 #ifdef RE_ENABLE_I18N
3590 if (dfa->mb_cur_max > 1)
3591 bitset_merge (accepts, dfa->sb_char);
3592 else
3593 #endif
3594 bitset_set_all (accepts);
3595 if (!(dfa->syntax & RE_DOT_NEWLINE))
3596 bitset_clear (accepts, '\n');
3597 if (dfa->syntax & RE_DOT_NOT_NULL)
3598 bitset_clear (accepts, '\0');
3599 }
3600 #ifdef RE_ENABLE_I18N
3601 else if (type == OP_UTF8_PERIOD)
3602 {
3603 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3604 if (!(dfa->syntax & RE_DOT_NEWLINE))
3605 bitset_clear (accepts, '\n');
3606 if (dfa->syntax & RE_DOT_NOT_NULL)
3607 bitset_clear (accepts, '\0');
3608 }
3609 #endif
3610 else
3611 continue;
3612
3613 /* Check the `accepts' and sift the characters which are not
3614 match it the context. */
3615 if (constraint)
3616 {
3617 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3618 {
3619 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3620 bitset_empty (accepts);
3621 if (accepts_newline)
3622 bitset_set (accepts, NEWLINE_CHAR);
3623 else
3624 continue;
3625 }
3626 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3627 {
3628 bitset_empty (accepts);
3629 continue;
3630 }
3631
3632 if (constraint & NEXT_WORD_CONSTRAINT)
3633 {
3634 bitset_word_t any_set = 0;
3635 if (type == CHARACTER && !node->word_char)
3636 {
3637 bitset_empty (accepts);
3638 continue;
3639 }
3640 #ifdef RE_ENABLE_I18N
3641 if (dfa->mb_cur_max > 1)
3642 for (j = 0; j < BITSET_WORDS; ++j)
3643 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3644 else
3645 #endif
3646 for (j = 0; j < BITSET_WORDS; ++j)
3647 any_set |= (accepts[j] &= dfa->word_char[j]);
3648 if (!any_set)
3649 continue;
3650 }
3651 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3652 {
3653 bitset_word_t any_set = 0;
3654 if (type == CHARACTER && node->word_char)
3655 {
3656 bitset_empty (accepts);
3657 continue;
3658 }
3659 #ifdef RE_ENABLE_I18N
3660 if (dfa->mb_cur_max > 1)
3661 for (j = 0; j < BITSET_WORDS; ++j)
3662 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3663 else
3664 #endif
3665 for (j = 0; j < BITSET_WORDS; ++j)
3666 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3667 if (!any_set)
3668 continue;
3669 }
3670 }
3671
3672 /* Then divide `accepts' into DFA states, or create a new
3673 state. Above, we make sure that accepts is not empty. */
3674 for (j = 0; j < ndests; ++j)
3675 {
3676 bitset_t intersec; /* Intersection sets, see below. */
3677 bitset_t remains;
3678 /* Flags, see below. */
3679 bitset_word_t has_intersec, not_subset, not_consumed;
3680
3681 /* Optimization, skip if this state doesn't accept the character. */
3682 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3683 continue;
3684
3685 /* Enumerate the intersection set of this state and `accepts'. */
3686 has_intersec = 0;
3687 for (k = 0; k < BITSET_WORDS; ++k)
3688 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3689 /* And skip if the intersection set is empty. */
3690 if (!has_intersec)
3691 continue;
3692
3693 /* Then check if this state is a subset of `accepts'. */
3694 not_subset = not_consumed = 0;
3695 for (k = 0; k < BITSET_WORDS; ++k)
3696 {
3697 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3698 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3699 }
3700
3701 /* If this state isn't a subset of `accepts', create a
3702 new group state, which has the `remains'. */
3703 if (not_subset)
3704 {
3705 bitset_copy (dests_ch[ndests], remains);
3706 bitset_copy (dests_ch[j], intersec);
3707 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3708 if (BE (err != REG_NOERROR, 0))
3709 goto error_return;
3710 ++ndests;
3711 }
3712
3713 /* Put the position in the current group. */
3714 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3715 if (BE (result < 0, 0))
3716 goto error_return;
3717
3718 /* If all characters are consumed, go to next node. */
3719 if (!not_consumed)
3720 break;
3721 }
3722 /* Some characters remain, create a new group. */
3723 if (j == ndests)
3724 {
3725 bitset_copy (dests_ch[ndests], accepts);
3726 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3727 if (BE (err != REG_NOERROR, 0))
3728 goto error_return;
3729 ++ndests;
3730 bitset_empty (accepts);
3731 }
3732 }
3733 return ndests;
3734 error_return:
3735 for (j = 0; j < ndests; ++j)
3736 re_node_set_free (dests_node + j);
3737 return -1;
3738 }
3739
3740 #ifdef RE_ENABLE_I18N
3741 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3742 Return the number of the bytes the node accepts.
3743 STR_IDX is the current index of the input string.
3744
3745 This function handles the nodes which can accept one character, or
3746 one collating element like '.', '[a-z]', opposite to the other nodes
3747 can only accept one byte. */
3748
3749 static int
3750 internal_function
3751 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3752 const re_string_t *input, int str_idx)
3753 {
3754 const re_token_t *node = dfa->nodes + node_idx;
3755 int char_len, elem_len;
3756 int i;
3757 wint_t wc;
3758
3759 if (BE (node->type == OP_UTF8_PERIOD, 0))
3760 {
3761 unsigned char c = re_string_byte_at (input, str_idx), d;
3762 if (BE (c < 0xc2, 1))
3763 return 0;
3764
3765 if (str_idx + 2 > input->len)
3766 return 0;
3767
3768 d = re_string_byte_at (input, str_idx + 1);
3769 if (c < 0xe0)
3770 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3771 else if (c < 0xf0)
3772 {
3773 char_len = 3;
3774 if (c == 0xe0 && d < 0xa0)
3775 return 0;
3776 }
3777 else if (c < 0xf8)
3778 {
3779 char_len = 4;
3780 if (c == 0xf0 && d < 0x90)
3781 return 0;
3782 }
3783 else if (c < 0xfc)
3784 {
3785 char_len = 5;
3786 if (c == 0xf8 && d < 0x88)
3787 return 0;
3788 }
3789 else if (c < 0xfe)
3790 {
3791 char_len = 6;
3792 if (c == 0xfc && d < 0x84)
3793 return 0;
3794 }
3795 else
3796 return 0;
3797
3798 if (str_idx + char_len > input->len)
3799 return 0;
3800
3801 for (i = 1; i < char_len; ++i)
3802 {
3803 d = re_string_byte_at (input, str_idx + i);
3804 if (d < 0x80 || d > 0xbf)
3805 return 0;
3806 }
3807 return char_len;
3808 }
3809
3810 char_len = re_string_char_size_at (input, str_idx);
3811 if (node->type == OP_PERIOD)
3812 {
3813 if (char_len <= 1)
3814 return 0;
3815 /* FIXME: I don't think this if is needed, as both '\n'
3816 and '\0' are char_len == 1. */
3817 /* '.' accepts any one character except the following two cases. */
3818 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3819 re_string_byte_at (input, str_idx) == '\n') ||
3820 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3821 re_string_byte_at (input, str_idx) == '\0'))
3822 return 0;
3823 return char_len;
3824 }
3825
3826 elem_len = re_string_elem_size_at (input, str_idx);
3827 wc = __btowc(*(input->mbs+str_idx));
3828 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3829 return 0;
3830
3831 if (node->type == COMPLEX_BRACKET)
3832 {
3833 const re_charset_t *cset = node->opr.mbcset;
3834 # ifdef _LIBC
3835 const unsigned char *pin
3836 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3837 int j;
3838 uint32_t nrules;
3839 # endif /* _LIBC */
3840 int match_len = 0;
3841 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3842 ? re_string_wchar_at (input, str_idx) : 0);
3843
3844 /* match with multibyte character? */
3845 for (i = 0; i < cset->nmbchars; ++i)
3846 if (wc == cset->mbchars[i])
3847 {
3848 match_len = char_len;
3849 goto check_node_accept_bytes_match;
3850 }
3851 /* match with character_class? */
3852 for (i = 0; i < cset->nchar_classes; ++i)
3853 {
3854 wctype_t wt = cset->char_classes[i];
3855 if (__iswctype (wc, wt))
3856 {
3857 match_len = char_len;
3858 goto check_node_accept_bytes_match;
3859 }
3860 }
3861
3862 # ifdef _LIBC
3863 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3864 if (nrules != 0)
3865 {
3866 unsigned int in_collseq = 0;
3867 const int32_t *table, *indirect;
3868 const unsigned char *weights, *extra;
3869 const char *collseqwc;
3870 /* This #include defines a local function! */
3871 # include <locale/weight.h>
3872
3873 /* match with collating_symbol? */
3874 if (cset->ncoll_syms)
3875 extra = (const unsigned char *)
3876 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3877 for (i = 0; i < cset->ncoll_syms; ++i)
3878 {
3879 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3880 /* Compare the length of input collating element and
3881 the length of current collating element. */
3882 if (*coll_sym != elem_len)
3883 continue;
3884 /* Compare each bytes. */
3885 for (j = 0; j < *coll_sym; j++)
3886 if (pin[j] != coll_sym[1 + j])
3887 break;
3888 if (j == *coll_sym)
3889 {
3890 /* Match if every bytes is equal. */
3891 match_len = j;
3892 goto check_node_accept_bytes_match;
3893 }
3894 }
3895
3896 if (cset->nranges)
3897 {
3898 if (elem_len <= char_len)
3899 {
3900 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3901 in_collseq = __collseq_table_lookup (collseqwc, wc);
3902 }
3903 else
3904 in_collseq = find_collation_sequence_value (pin, elem_len);
3905 }
3906 /* match with range expression? */
3907 for (i = 0; i < cset->nranges; ++i)
3908 if (cset->range_starts[i] <= in_collseq
3909 && in_collseq <= cset->range_ends[i])
3910 {
3911 match_len = elem_len;
3912 goto check_node_accept_bytes_match;
3913 }
3914
3915 /* match with equivalence_class? */
3916 if (cset->nequiv_classes)
3917 {
3918 const unsigned char *cp = pin;
3919 table = (const int32_t *)
3920 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3921 weights = (const unsigned char *)
3922 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3923 extra = (const unsigned char *)
3924 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3925 indirect = (const int32_t *)
3926 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3927 int32_t idx = findidx (&cp);
3928 if (idx > 0)
3929 for (i = 0; i < cset->nequiv_classes; ++i)
3930 {
3931 int32_t equiv_class_idx = cset->equiv_classes[i];
3932 size_t weight_len = weights[idx & 0xffffff];
3933 if (weight_len == weights[equiv_class_idx & 0xffffff]
3934 && (idx >> 24) == (equiv_class_idx >> 24))
3935 {
3936 int cnt = 0;
3937
3938 idx &= 0xffffff;
3939 equiv_class_idx &= 0xffffff;
3940
3941 while (cnt <= weight_len
3942 && (weights[equiv_class_idx + 1 + cnt]
3943 == weights[idx + 1 + cnt]))
3944 ++cnt;
3945 if (cnt > weight_len)
3946 {
3947 match_len = elem_len;
3948 goto check_node_accept_bytes_match;
3949 }
3950 }
3951 }
3952 }
3953 }
3954 else
3955 # endif /* _LIBC */
3956 {
3957 /* match with range expression? */
3958 #if __GNUC__ >= 2
3959 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3960 #else
3961 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3962 cmp_buf[2] = wc;
3963 #endif
3964 for (i = 0; i < cset->nranges; ++i)
3965 {
3966 cmp_buf[0] = cset->range_starts[i];
3967 cmp_buf[4] = cset->range_ends[i];
3968 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3969 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3970 {
3971 match_len = char_len;
3972 goto check_node_accept_bytes_match;
3973 }
3974 }
3975 }
3976 check_node_accept_bytes_match:
3977 if (!cset->non_match)
3978 return match_len;
3979 else
3980 {
3981 if (match_len > 0)
3982 return 0;
3983 else
3984 return (elem_len > char_len) ? elem_len : char_len;
3985 }
3986 }
3987 return 0;
3988 }
3989
3990 # ifdef _LIBC
3991 static unsigned int
3992 internal_function
3993 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3994 {
3995 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3996 if (nrules == 0)
3997 {
3998 if (mbs_len == 1)
3999 {
4000 /* No valid character. Match it as a single byte character. */
4001 const unsigned char *collseq = (const unsigned char *)
4002 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4003 return collseq[mbs[0]];
4004 }
4005 return UINT_MAX;
4006 }
4007 else
4008 {
4009 int32_t idx;
4010 const unsigned char *extra = (const unsigned char *)
4011 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4012 int32_t extrasize = (const unsigned char *)
4013 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4014
4015 for (idx = 0; idx < extrasize;)
4016 {
4017 int mbs_cnt, found = 0;
4018 int32_t elem_mbs_len;
4019 /* Skip the name of collating element name. */
4020 idx = idx + extra[idx] + 1;
4021 elem_mbs_len = extra[idx++];
4022 if (mbs_len == elem_mbs_len)
4023 {
4024 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4025 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4026 break;
4027 if (mbs_cnt == elem_mbs_len)
4028 /* Found the entry. */
4029 found = 1;
4030 }
4031 /* Skip the byte sequence of the collating element. */
4032 idx += elem_mbs_len;
4033 /* Adjust for the alignment. */
4034 idx = (idx + 3) & ~3;
4035 /* Skip the collation sequence value. */
4036 idx += sizeof (uint32_t);
4037 /* Skip the wide char sequence of the collating element. */
4038 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4039 /* If we found the entry, return the sequence value. */
4040 if (found)
4041 return *(uint32_t *) (extra + idx);
4042 /* Skip the collation sequence value. */
4043 idx += sizeof (uint32_t);
4044 }
4045 return UINT_MAX;
4046 }
4047 }
4048 # endif /* _LIBC */
4049 #endif /* RE_ENABLE_I18N */
4050
4051 /* Check whether the node accepts the byte which is IDX-th
4052 byte of the INPUT. */
4053
4054 static int
4055 internal_function
4056 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4057 int idx)
4058 {
4059 unsigned char ch;
4060 ch = re_string_byte_at (&mctx->input, idx);
4061 switch (node->type)
4062 {
4063 case CHARACTER:
4064 if (node->opr.c != ch)
4065 return 0;
4066 break;
4067
4068 case SIMPLE_BRACKET:
4069 if (!bitset_contain (node->opr.sbcset, ch))
4070 return 0;
4071 break;
4072
4073 #ifdef RE_ENABLE_I18N
4074 case OP_UTF8_PERIOD:
4075 if (ch >= 0x80)
4076 return 0;
4077 /* FALLTHROUGH */
4078 #endif
4079 case OP_PERIOD:
4080 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4081 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4082 return 0;
4083 break;
4084
4085 default:
4086 return 0;
4087 }
4088
4089 if (node->constraint)
4090 {
4091 /* The node has constraints. Check whether the current context
4092 satisfies the constraints. */
4093 unsigned int context = re_string_context_at (&mctx->input, idx,
4094 mctx->eflags);
4095 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4096 return 0;
4097 }
4098
4099 return 1;
4100 }
4101
4102 /* Extend the buffers, if the buffers have run out. */
4103
4104 static reg_errcode_t
4105 internal_function
4106 extend_buffers (re_match_context_t *mctx)
4107 {
4108 reg_errcode_t ret;
4109 re_string_t *pstr = &mctx->input;
4110
4111 /* Avoid overflow. */
4112 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4113 return REG_ESPACE;
4114
4115 /* Double the lengthes of the buffers. */
4116 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4117 if (BE (ret != REG_NOERROR, 0))
4118 return ret;
4119
4120 if (mctx->state_log != NULL)
4121 {
4122 /* And double the length of state_log. */
4123 /* XXX We have no indication of the size of this buffer. If this
4124 allocation fail we have no indication that the state_log array
4125 does not have the right size. */
4126 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4127 pstr->bufs_len + 1);
4128 if (BE (new_array == NULL, 0))
4129 return REG_ESPACE;
4130 mctx->state_log = new_array;
4131 }
4132
4133 /* Then reconstruct the buffers. */
4134 if (pstr->icase)
4135 {
4136 #ifdef RE_ENABLE_I18N
4137 if (pstr->mb_cur_max > 1)
4138 {
4139 ret = build_wcs_upper_buffer (pstr);
4140 if (BE (ret != REG_NOERROR, 0))
4141 return ret;
4142 }
4143 else
4144 #endif /* RE_ENABLE_I18N */
4145 build_upper_buffer (pstr);
4146 }
4147 else
4148 {
4149 #ifdef RE_ENABLE_I18N
4150 if (pstr->mb_cur_max > 1)
4151 build_wcs_buffer (pstr);
4152 else
4153 #endif /* RE_ENABLE_I18N */
4154 {
4155 if (pstr->trans != NULL)
4156 re_string_translate_buffer (pstr);
4157 }
4158 }
4159 return REG_NOERROR;
4160 }
4161
4162 \f
4163 /* Functions for matching context. */
4164
4165 /* Initialize MCTX. */
4166
4167 static reg_errcode_t
4168 internal_function
4169 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4170 {
4171 mctx->eflags = eflags;
4172 mctx->match_last = -1;
4173 if (n > 0)
4174 {
4175 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4176 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4177 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4178 return REG_ESPACE;
4179 }
4180 /* Already zero-ed by the caller.
4181 else
4182 mctx->bkref_ents = NULL;
4183 mctx->nbkref_ents = 0;
4184 mctx->nsub_tops = 0; */
4185 mctx->abkref_ents = n;
4186 mctx->max_mb_elem_len = 1;
4187 mctx->asub_tops = n;
4188 return REG_NOERROR;
4189 }
4190
4191 /* Clean the entries which depend on the current input in MCTX.
4192 This function must be invoked when the matcher changes the start index
4193 of the input, or changes the input string. */
4194
4195 static void
4196 internal_function
4197 match_ctx_clean (re_match_context_t *mctx)
4198 {
4199 int st_idx;
4200 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4201 {
4202 int sl_idx;
4203 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4204 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4205 {
4206 re_sub_match_last_t *last = top->lasts[sl_idx];
4207 re_free (last->path.array);
4208 re_free (last);
4209 }
4210 re_free (top->lasts);
4211 if (top->path)
4212 {
4213 re_free (top->path->array);
4214 re_free (top->path);
4215 }
4216 free (top);
4217 }
4218
4219 mctx->nsub_tops = 0;
4220 mctx->nbkref_ents = 0;
4221 }
4222
4223 /* Free all the memory associated with MCTX. */
4224
4225 static void
4226 internal_function
4227 match_ctx_free (re_match_context_t *mctx)
4228 {
4229 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4230 match_ctx_clean (mctx);
4231 re_free (mctx->sub_tops);
4232 re_free (mctx->bkref_ents);
4233 }
4234
4235 /* Add a new backreference entry to MCTX.
4236 Note that we assume that caller never call this function with duplicate
4237 entry, and call with STR_IDX which isn't smaller than any existing entry.
4238 */
4239
4240 static reg_errcode_t
4241 internal_function
4242 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4243 int to)
4244 {
4245 if (mctx->nbkref_ents >= mctx->abkref_ents)
4246 {
4247 struct re_backref_cache_entry* new_entry;
4248 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4249 mctx->abkref_ents * 2);
4250 if (BE (new_entry == NULL, 0))
4251 {
4252 re_free (mctx->bkref_ents);
4253 return REG_ESPACE;
4254 }
4255 mctx->bkref_ents = new_entry;
4256 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4257 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4258 mctx->abkref_ents *= 2;
4259 }
4260 if (mctx->nbkref_ents > 0
4261 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4262 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4263
4264 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4265 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4266 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4267 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4268
4269 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4270 If bit N is clear, means that this entry won't epsilon-transition to
4271 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4272 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4273 such node.
4274
4275 A backreference does not epsilon-transition unless it is empty, so set
4276 to all zeros if FROM != TO. */
4277 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4278 = (from == to ? ~0 : 0);
4279
4280 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4281 if (mctx->max_mb_elem_len < to - from)
4282 mctx->max_mb_elem_len = to - from;
4283 return REG_NOERROR;
4284 }
4285
4286 /* Search for the first entry which has the same str_idx, or -1 if none is
4287 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4288
4289 static int
4290 internal_function
4291 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4292 {
4293 int left, right, mid, last;
4294 last = right = mctx->nbkref_ents;
4295 for (left = 0; left < right;)
4296 {
4297 mid = (left + right) / 2;
4298 if (mctx->bkref_ents[mid].str_idx < str_idx)
4299 left = mid + 1;
4300 else
4301 right = mid;
4302 }
4303 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4304 return left;
4305 else
4306 return -1;
4307 }
4308
4309 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4310 at STR_IDX. */
4311
4312 static reg_errcode_t
4313 internal_function
4314 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4315 {
4316 #ifdef DEBUG
4317 assert (mctx->sub_tops != NULL);
4318 assert (mctx->asub_tops > 0);
4319 #endif
4320 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4321 {
4322 int new_asub_tops = mctx->asub_tops * 2;
4323 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4324 re_sub_match_top_t *,
4325 new_asub_tops);
4326 if (BE (new_array == NULL, 0))
4327 return REG_ESPACE;
4328 mctx->sub_tops = new_array;
4329 mctx->asub_tops = new_asub_tops;
4330 }
4331 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4332 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4333 return REG_ESPACE;
4334 mctx->sub_tops[mctx->nsub_tops]->node = node;
4335 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4336 return REG_NOERROR;
4337 }
4338
4339 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4340 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4341
4342 static re_sub_match_last_t *
4343 internal_function
4344 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4345 {
4346 re_sub_match_last_t *new_entry;
4347 if (BE (subtop->nlasts == subtop->alasts, 0))
4348 {
4349 int new_alasts = 2 * subtop->alasts + 1;
4350 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4351 re_sub_match_last_t *,
4352 new_alasts);
4353 if (BE (new_array == NULL, 0))
4354 return NULL;
4355 subtop->lasts = new_array;
4356 subtop->alasts = new_alasts;
4357 }
4358 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4359 if (BE (new_entry != NULL, 1))
4360 {
4361 subtop->lasts[subtop->nlasts] = new_entry;
4362 new_entry->node = node;
4363 new_entry->str_idx = str_idx;
4364 ++subtop->nlasts;
4365 }
4366 return new_entry;
4367 }
4368
4369 static void
4370 internal_function
4371 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4372 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4373 {
4374 sctx->sifted_states = sifted_sts;
4375 sctx->limited_states = limited_sts;
4376 sctx->last_node = last_node;
4377 sctx->last_str_idx = last_str_idx;
4378 re_node_set_init_empty (&sctx->limits);
4379 }