1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
26 int str_idx
, int from
, int to
)
28 static int search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
31 int str_idx
) internal_function
;
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
)
35 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
36 re_dfastate_t
**limited_sts
, int last_node
,
39 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
40 const char *string
, int length
,
41 int start
, int range
, int stop
,
42 size_t nmatch
, regmatch_t pmatch
[],
43 int eflags
) internal_function
;
44 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
45 const char *string1
, int length1
,
46 const char *string2
, int length2
,
47 int start
, int range
, struct re_registers
*regs
,
48 int stop
, int ret_len
) internal_function
;
49 static int re_search_stub (struct re_pattern_buffer
*bufp
,
50 const char *string
, int length
, int start
,
51 int range
, int stop
, struct re_registers
*regs
,
52 int ret_len
) internal_function
;
53 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
54 int nregs
, int regs_allocated
) internal_function
;
55 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
57 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
58 int *p_match_first
) internal_function
;
59 static int check_halt_state_context (const re_match_context_t
*mctx
,
60 const re_dfastate_t
*state
, int idx
)
62 static void update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
63 regmatch_t
*prev_idx_match
, int cur_node
,
64 int cur_idx
, int nmatch
) internal_function
;
65 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
66 int str_idx
, int dest_node
, int nregs
,
68 re_node_set
*eps_via_nodes
)
70 static reg_errcode_t
set_regs (const regex_t
*preg
,
71 const re_match_context_t
*mctx
,
72 size_t nmatch
, regmatch_t
*pmatch
,
73 int fl_backtrack
) internal_function
;
74 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
)
78 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
79 re_sift_context_t
*sctx
,
80 int node_idx
, int str_idx
, int max_str_idx
)
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t
sift_states_backward (const re_match_context_t
*mctx
,
84 re_sift_context_t
*sctx
)
86 static reg_errcode_t
build_sifted_states (const re_match_context_t
*mctx
,
87 re_sift_context_t
*sctx
, int str_idx
,
88 re_node_set
*cur_dest
)
90 static reg_errcode_t
update_cur_sifted_state (const re_match_context_t
*mctx
,
91 re_sift_context_t
*sctx
,
93 re_node_set
*dest_nodes
)
95 static reg_errcode_t
add_epsilon_src_nodes (const re_dfa_t
*dfa
,
96 re_node_set
*dest_nodes
,
97 const re_node_set
*candidates
)
99 static int check_dst_limits (const re_match_context_t
*mctx
,
101 int dst_node
, int dst_idx
, int src_node
,
102 int src_idx
) internal_function
;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
,
104 int boundaries
, int subexp_idx
,
105 int from_node
, int bkref_idx
)
107 static int check_dst_limits_calc_pos (const re_match_context_t
*mctx
,
108 int limit
, int subexp_idx
,
109 int node
, int str_idx
,
110 int bkref_idx
) internal_function
;
111 static reg_errcode_t
check_subexp_limits (const re_dfa_t
*dfa
,
112 re_node_set
*dest_nodes
,
113 const re_node_set
*candidates
,
115 struct re_backref_cache_entry
*bkref_ents
,
116 int str_idx
) internal_function
;
117 static reg_errcode_t
sift_states_bkref (const re_match_context_t
*mctx
,
118 re_sift_context_t
*sctx
,
119 int str_idx
, const re_node_set
*candidates
)
121 static reg_errcode_t
merge_state_array (const re_dfa_t
*dfa
,
123 re_dfastate_t
**src
, int num
)
125 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
126 re_match_context_t
*mctx
) internal_function
;
127 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
128 re_match_context_t
*mctx
,
129 re_dfastate_t
*state
) internal_function
;
130 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
131 re_match_context_t
*mctx
,
132 re_dfastate_t
*next_state
)
134 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
135 re_node_set
*cur_nodes
,
136 int str_idx
) internal_function
;
138 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
139 re_match_context_t
*mctx
,
140 re_dfastate_t
*pstate
)
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
145 re_dfastate_t
*pstate
)
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
149 const re_node_set
*nodes
)
151 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
152 int bkref_node
, int bkref_str_idx
)
154 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
155 const re_sub_match_top_t
*sub_top
,
156 re_sub_match_last_t
*sub_last
,
157 int bkref_node
, int bkref_str
)
159 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
160 int subexp_idx
, int type
) internal_function
;
161 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
162 state_array_t
*path
, int top_node
,
163 int top_str
, int last_node
, int last_str
,
164 int type
) internal_function
;
165 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
167 re_node_set
*cur_nodes
,
168 re_node_set
*next_nodes
)
170 static reg_errcode_t
check_arrival_expand_ecl (const re_dfa_t
*dfa
,
171 re_node_set
*cur_nodes
,
172 int ex_subexp
, int type
)
174 static reg_errcode_t
check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
,
175 re_node_set
*dst_nodes
,
176 int target
, int ex_subexp
,
177 int type
) internal_function
;
178 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
179 re_node_set
*cur_nodes
, int cur_str
,
180 int subexp_num
, int type
)
182 static int build_trtable (const re_dfa_t
*dfa
,
183 re_dfastate_t
*state
) internal_function
;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
186 const re_string_t
*input
, int idx
)
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t
*dfa
,
195 const re_dfastate_t
*state
,
196 re_node_set
*states_node
,
197 bitset_t
*states_ch
) internal_function
;
198 static int check_node_accept (const re_match_context_t
*mctx
,
199 const re_token_t
*node
, int idx
)
201 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
)
204 /* Entry point for POSIX code. */
206 /* regexec searches for a given pattern, specified by PREG, in the
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
218 We return 0 if we find a match and REG_NOMATCH if not. */
221 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
222 const regex_t
*__restrict preg
;
223 const char *__restrict string
;
231 if (eflags
& ~(REG_NOTBOL
| REG_NOTEOL
| REG_STARTEND
))
234 if (eflags
& REG_STARTEND
)
236 start
= pmatch
[0].rm_so
;
237 length
= pmatch
[0].rm_eo
;
242 length
= strlen (string
);
245 __libc_lock_lock (dfa
->lock
);
247 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
248 length
, 0, NULL
, eflags
);
250 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
251 length
, nmatch
, pmatch
, eflags
);
252 __libc_lock_unlock (dfa
->lock
);
253 return err
!= REG_NOERROR
;
257 # include <shlib-compat.h>
258 versioned_symbol (libc
, __regexec
, regexec
, GLIBC_2_3_4
);
260 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
261 __typeof__ (__regexec
) __compat_regexec
;
264 attribute_compat_text_section
265 __compat_regexec (const regex_t
*__restrict preg
,
266 const char *__restrict string
, size_t nmatch
,
267 regmatch_t pmatch
[], int eflags
)
269 return regexec (preg
, string
, nmatch
, pmatch
,
270 eflags
& (REG_NOTBOL
| REG_NOTEOL
));
272 compat_symbol (libc
, __compat_regexec
, regexec
, GLIBC_2_0
);
276 /* Entry points for GNU code. */
278 /* re_match, re_search, re_match_2, re_search_2
280 The former two functions operate on STRING with length LENGTH,
281 while the later two operate on concatenation of STRING1 and STRING2
282 with lengths LENGTH1 and LENGTH2, respectively.
284 re_match() matches the compiled pattern in BUFP against the string,
285 starting at index START.
287 re_search() first tries matching at index START, then it tries to match
288 starting from index START + 1, and so on. The last start position tried
289 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
292 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
293 the first STOP characters of the concatenation of the strings should be
296 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
297 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
298 computed relative to the concatenation, not relative to the individual
301 On success, re_match* functions return the length of the match, re_search*
302 return the position of the start of the match. Return value -1 means no
303 match was found and -2 indicates an internal error. */
306 re_match (bufp
, string
, length
, start
, regs
)
307 struct re_pattern_buffer
*bufp
;
310 struct re_registers
*regs
;
312 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
315 weak_alias (__re_match
, re_match
)
319 re_search (bufp
, string
, length
, start
, range
, regs
)
320 struct re_pattern_buffer
*bufp
;
322 int length
, start
, range
;
323 struct re_registers
*regs
;
325 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
328 weak_alias (__re_search
, re_search
)
332 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
333 struct re_pattern_buffer
*bufp
;
334 const char *string1
, *string2
;
335 int length1
, length2
, start
, stop
;
336 struct re_registers
*regs
;
338 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
339 start
, 0, regs
, stop
, 1);
342 weak_alias (__re_match_2
, re_match_2
)
346 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
347 struct re_pattern_buffer
*bufp
;
348 const char *string1
, *string2
;
349 int length1
, length2
, start
, range
, stop
;
350 struct re_registers
*regs
;
352 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
353 start
, range
, regs
, stop
, 0);
356 weak_alias (__re_search_2
, re_search_2
)
360 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
362 struct re_pattern_buffer
*bufp
;
363 const char *string1
, *string2
;
364 int length1
, length2
, start
, range
, stop
, ret_len
;
365 struct re_registers
*regs
;
369 int len
= length1
+ length2
;
372 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
375 /* Concatenate the strings. */
379 char *s
= re_malloc (char, len
);
381 if (BE (s
== NULL
, 0))
383 memcpy (s
, string1
, length1
);
384 memcpy (s
+ length1
, string2
, length2
);
393 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
, ret_len
);
395 re_free ((char *) str
);
399 /* The parameters have the same meaning as those of re_search.
400 Additional parameters:
401 If RET_LEN is nonzero the length of the match is returned (re_match style);
402 otherwise the position of the match is returned. */
405 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
406 struct re_pattern_buffer
*bufp
;
408 int length
, start
, range
, stop
, ret_len
;
409 struct re_registers
*regs
;
411 reg_errcode_t result
;
416 /* Check for out-of-range. */
417 if (BE (start
< 0 || start
> length
, 0))
419 if (BE (start
+ range
> length
, 0))
420 range
= length
- start
;
421 else if (BE (start
+ range
< 0, 0))
424 __libc_lock_lock (dfa
->lock
);
426 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
427 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
429 /* Compile fastmap if we haven't yet. */
430 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
431 re_compile_fastmap (bufp
);
433 if (BE (bufp
->no_sub
, 0))
436 /* We need at least 1 register. */
439 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
440 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
442 nregs
= regs
->num_regs
;
443 if (BE (nregs
< 1, 0))
445 /* Nothing can be copied to regs. */
451 nregs
= bufp
->re_nsub
+ 1;
452 pmatch
= re_malloc (regmatch_t
, nregs
);
453 if (BE (pmatch
== NULL
, 0))
459 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
460 nregs
, pmatch
, eflags
);
464 /* I hope we needn't fill ther regs with -1's when no match was found. */
465 if (result
!= REG_NOERROR
)
467 else if (regs
!= NULL
)
469 /* If caller wants register contents data back, copy them. */
470 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
471 bufp
->regs_allocated
);
472 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
476 if (BE (rval
== 0, 1))
480 assert (pmatch
[0].rm_so
== start
);
481 rval
= pmatch
[0].rm_eo
- start
;
484 rval
= pmatch
[0].rm_so
;
488 __libc_lock_unlock (dfa
->lock
);
493 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
494 struct re_registers
*regs
;
496 int nregs
, regs_allocated
;
498 int rval
= REGS_REALLOCATE
;
500 int need_regs
= nregs
+ 1;
501 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
504 /* Have the register data arrays been allocated? */
505 if (regs_allocated
== REGS_UNALLOCATED
)
506 { /* No. So allocate them with malloc. */
507 regs
->start
= re_malloc (regoff_t
, need_regs
);
508 if (BE (regs
->start
== NULL
, 0))
509 return REGS_UNALLOCATED
;
510 regs
->end
= re_malloc (regoff_t
, need_regs
);
511 if (BE (regs
->end
== NULL
, 0))
513 re_free (regs
->start
);
514 return REGS_UNALLOCATED
;
516 regs
->num_regs
= need_regs
;
518 else if (regs_allocated
== REGS_REALLOCATE
)
519 { /* Yes. If we need more elements than were already
520 allocated, reallocate them. If we need fewer, just
522 if (BE (need_regs
> regs
->num_regs
, 0))
524 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
526 if (BE (new_start
== NULL
, 0))
527 return REGS_UNALLOCATED
;
528 new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
529 if (BE (new_end
== NULL
, 0))
532 return REGS_UNALLOCATED
;
534 regs
->start
= new_start
;
536 regs
->num_regs
= need_regs
;
541 assert (regs_allocated
== REGS_FIXED
);
542 /* This function may not be called with REGS_FIXED and nregs too big. */
543 assert (regs
->num_regs
>= nregs
);
548 for (i
= 0; i
< nregs
; ++i
)
550 regs
->start
[i
] = pmatch
[i
].rm_so
;
551 regs
->end
[i
] = pmatch
[i
].rm_eo
;
553 for ( ; i
< regs
->num_regs
; ++i
)
554 regs
->start
[i
] = regs
->end
[i
] = -1;
559 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
560 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
561 this memory for recording register information. STARTS and ENDS
562 must be allocated using the malloc library routine, and must each
563 be at least NUM_REGS * sizeof (regoff_t) bytes long.
565 If NUM_REGS == 0, then subsequent matches should allocate their own
568 Unless this function is called, the first search or match using
569 PATTERN_BUFFER will allocate its own register data, without
570 freeing the old data. */
573 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
574 struct re_pattern_buffer
*bufp
;
575 struct re_registers
*regs
;
577 regoff_t
*starts
, *ends
;
581 bufp
->regs_allocated
= REGS_REALLOCATE
;
582 regs
->num_regs
= num_regs
;
583 regs
->start
= starts
;
588 bufp
->regs_allocated
= REGS_UNALLOCATED
;
590 regs
->start
= regs
->end
= (regoff_t
*) 0;
594 weak_alias (__re_set_registers
, re_set_registers
)
597 /* Entry points compatible with 4.2 BSD regex library. We don't define
598 them unless specifically requested. */
600 #if defined _REGEX_RE_COMP || defined _LIBC
608 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
610 #endif /* _REGEX_RE_COMP */
612 /* Internal entry point. */
614 /* Searches for a compiled pattern PREG in the string STRING, whose
615 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
616 mingings with regexec. START, and RANGE have the same meanings
618 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
619 otherwise return the error code.
620 Note: We assume front end functions already check ranges.
621 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
624 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
628 int length
, start
, range
, stop
, eflags
;
633 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
634 int left_lim
, right_lim
, incr
;
635 int fl_longest_match
, match_first
, match_kind
, match_last
= -1;
638 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
639 re_match_context_t mctx
= { .dfa
= dfa
};
641 re_match_context_t mctx
;
643 char *fastmap
= (preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
644 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
;
645 RE_TRANSLATE_TYPE t
= preg
->translate
;
647 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
648 memset (&mctx
, '\0', sizeof (re_match_context_t
));
652 extra_nmatch
= (nmatch
> preg
->re_nsub
) ? nmatch
- (preg
->re_nsub
+ 1) : 0;
653 nmatch
-= extra_nmatch
;
655 /* Check if the DFA haven't been compiled. */
656 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
657 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
658 || dfa
->init_state_begbuf
== NULL
, 0))
662 /* We assume front-end functions already check them. */
663 assert (start
+ range
>= 0 && start
+ range
<= length
);
666 /* If initial states with non-begbuf contexts have no elements,
667 the regex must be anchored. If preg->newline_anchor is set,
668 we'll never use init_state_nl, so do not check it. */
669 if (dfa
->init_state
->nodes
.nelem
== 0
670 && dfa
->init_state_word
->nodes
.nelem
== 0
671 && (dfa
->init_state_nl
->nodes
.nelem
== 0
672 || !preg
->newline_anchor
))
674 if (start
!= 0 && start
+ range
!= 0)
679 /* We must check the longest matching, if nmatch > 0. */
680 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
682 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
683 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
684 if (BE (err
!= REG_NOERROR
, 0))
686 mctx
.input
.stop
= stop
;
687 mctx
.input
.raw_stop
= stop
;
688 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
690 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
691 if (BE (err
!= REG_NOERROR
, 0))
694 /* We will log all the DFA states through which the dfa pass,
695 if nmatch > 1, or this dfa has "multibyte node", which is a
696 back-reference or a node which can accept multibyte character or
697 multi character collating element. */
698 if (nmatch
> 1 || dfa
->has_mb_node
)
700 /* Avoid overflow. */
701 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= mctx
.input
.bufs_len
, 0))
707 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
708 if (BE (mctx
.state_log
== NULL
, 0))
715 mctx
.state_log
= NULL
;
718 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
719 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
721 /* Check incrementally whether of not the input string match. */
722 incr
= (range
< 0) ? -1 : 1;
723 left_lim
= (range
< 0) ? start
+ range
: start
;
724 right_lim
= (range
< 0) ? start
: start
+ range
;
725 sb
= dfa
->mb_cur_max
== 1;
728 ? ((sb
|| !(preg
->syntax
& RE_ICASE
|| t
) ? 4 : 0)
729 | (range
>= 0 ? 2 : 0)
730 | (t
!= NULL
? 1 : 0))
733 for (;; match_first
+= incr
)
736 if (match_first
< left_lim
|| right_lim
< match_first
)
739 /* Advance as rapidly as possible through the string, until we
740 find a plausible place to start matching. This may be done
741 with varying efficiency, so there are various possibilities:
742 only the most common of them are specialized, in order to
743 save on code size. We use a switch statement for speed. */
751 /* Fastmap with single-byte translation, match forward. */
752 while (BE (match_first
< right_lim
, 1)
753 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
755 goto forward_match_found_start_or_reached_end
;
758 /* Fastmap without translation, match forward. */
759 while (BE (match_first
< right_lim
, 1)
760 && !fastmap
[(unsigned char) string
[match_first
]])
763 forward_match_found_start_or_reached_end
:
764 if (BE (match_first
== right_lim
, 0))
766 ch
= match_first
>= length
767 ? 0 : (unsigned char) string
[match_first
];
768 if (!fastmap
[t
? t
[ch
] : ch
])
775 /* Fastmap without multi-byte translation, match backwards. */
776 while (match_first
>= left_lim
)
778 ch
= match_first
>= length
779 ? 0 : (unsigned char) string
[match_first
];
780 if (fastmap
[t
? t
[ch
] : ch
])
784 if (match_first
< left_lim
)
789 /* In this case, we can't determine easily the current byte,
790 since it might be a component byte of a multibyte
791 character. Then we use the constructed buffer instead. */
794 /* If MATCH_FIRST is out of the valid range, reconstruct the
796 unsigned int offset
= match_first
- mctx
.input
.raw_mbs_idx
;
797 if (BE (offset
>= (unsigned int) mctx
.input
.valid_raw_len
, 0))
799 err
= re_string_reconstruct (&mctx
.input
, match_first
,
801 if (BE (err
!= REG_NOERROR
, 0))
804 offset
= match_first
- mctx
.input
.raw_mbs_idx
;
806 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
807 Note that MATCH_FIRST must not be smaller than 0. */
808 ch
= (match_first
>= length
809 ? 0 : re_string_byte_at (&mctx
.input
, offset
));
813 if (match_first
< left_lim
|| match_first
> right_lim
)
822 /* Reconstruct the buffers so that the matcher can assume that
823 the matching starts from the beginning of the buffer. */
824 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
825 if (BE (err
!= REG_NOERROR
, 0))
828 #ifdef RE_ENABLE_I18N
829 /* Don't consider this char as a possible match start if it part,
830 yet isn't the head, of a multibyte character. */
831 if (!sb
&& !re_string_first_byte (&mctx
.input
, 0))
835 /* It seems to be appropriate one, then use the matcher. */
836 /* We assume that the matching starts from 0. */
837 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
838 match_last
= check_matching (&mctx
, fl_longest_match
,
839 range
>= 0 ? &match_first
: NULL
);
840 if (match_last
!= -1)
842 if (BE (match_last
== -2, 0))
849 mctx
.match_last
= match_last
;
850 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
852 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
853 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
856 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
859 err
= prune_impossible_nodes (&mctx
);
860 if (err
== REG_NOERROR
)
862 if (BE (err
!= REG_NOMATCH
, 0))
867 break; /* We found a match. */
871 match_ctx_clean (&mctx
);
875 assert (match_last
!= -1);
876 assert (err
== REG_NOERROR
);
879 /* Set pmatch[] if we need. */
884 /* Initialize registers. */
885 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
886 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
888 /* Set the points where matching start/end. */
890 pmatch
[0].rm_eo
= mctx
.match_last
;
892 if (!preg
->no_sub
&& nmatch
> 1)
894 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
895 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
896 if (BE (err
!= REG_NOERROR
, 0))
900 /* At last, add the offset to the each registers, since we slided
901 the buffers so that we could assume that the matching starts
903 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
904 if (pmatch
[reg_idx
].rm_so
!= -1)
906 #ifdef RE_ENABLE_I18N
907 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
909 pmatch
[reg_idx
].rm_so
=
910 (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
911 ? mctx
.input
.valid_raw_len
912 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
]);
913 pmatch
[reg_idx
].rm_eo
=
914 (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
915 ? mctx
.input
.valid_raw_len
916 : mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
]);
919 assert (mctx
.input
.offsets_needed
== 0);
921 pmatch
[reg_idx
].rm_so
+= match_first
;
922 pmatch
[reg_idx
].rm_eo
+= match_first
;
924 for (reg_idx
= 0; reg_idx
< extra_nmatch
; ++reg_idx
)
926 pmatch
[nmatch
+ reg_idx
].rm_so
= -1;
927 pmatch
[nmatch
+ reg_idx
].rm_eo
= -1;
931 for (reg_idx
= 0; reg_idx
+ 1 < nmatch
; reg_idx
++)
932 if (dfa
->subexp_map
[reg_idx
] != reg_idx
)
934 pmatch
[reg_idx
+ 1].rm_so
935 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_so
;
936 pmatch
[reg_idx
+ 1].rm_eo
937 = pmatch
[dfa
->subexp_map
[reg_idx
] + 1].rm_eo
;
942 re_free (mctx
.state_log
);
944 match_ctx_free (&mctx
);
945 re_string_destruct (&mctx
.input
);
950 prune_impossible_nodes (mctx
)
951 re_match_context_t
*mctx
;
953 const re_dfa_t
*const dfa
= mctx
->dfa
;
954 int halt_node
, match_last
;
956 re_dfastate_t
**sifted_states
;
957 re_dfastate_t
**lim_states
= NULL
;
958 re_sift_context_t sctx
;
960 assert (mctx
->state_log
!= NULL
);
962 match_last
= mctx
->match_last
;
963 halt_node
= mctx
->last_node
;
965 /* Avoid overflow. */
966 if (BE (SIZE_MAX
/ sizeof (re_dfastate_t
*) <= match_last
, 0))
969 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
970 if (BE (sifted_states
== NULL
, 0))
977 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
978 if (BE (lim_states
== NULL
, 0))
985 memset (lim_states
, '\0',
986 sizeof (re_dfastate_t
*) * (match_last
+ 1));
987 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
989 ret
= sift_states_backward (mctx
, &sctx
);
990 re_node_set_free (&sctx
.limits
);
991 if (BE (ret
!= REG_NOERROR
, 0))
993 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
1003 } while (mctx
->state_log
[match_last
] == NULL
1004 || !mctx
->state_log
[match_last
]->halt
);
1005 halt_node
= check_halt_state_context (mctx
,
1006 mctx
->state_log
[match_last
],
1009 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
1011 re_free (lim_states
);
1013 if (BE (ret
!= REG_NOERROR
, 0))
1018 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
, match_last
);
1019 ret
= sift_states_backward (mctx
, &sctx
);
1020 re_node_set_free (&sctx
.limits
);
1021 if (BE (ret
!= REG_NOERROR
, 0))
1023 if (sifted_states
[0] == NULL
)
1029 re_free (mctx
->state_log
);
1030 mctx
->state_log
= sifted_states
;
1031 sifted_states
= NULL
;
1032 mctx
->last_node
= halt_node
;
1033 mctx
->match_last
= match_last
;
1036 re_free (sifted_states
);
1037 re_free (lim_states
);
1041 /* Acquire an initial state and return it.
1042 We must select appropriate initial state depending on the context,
1043 since initial states may have constraints like "\<", "^", etc.. */
1045 static inline re_dfastate_t
*
1046 __attribute ((always_inline
)) internal_function
1047 acquire_init_state_context (reg_errcode_t
*err
, const re_match_context_t
*mctx
,
1050 const re_dfa_t
*const dfa
= mctx
->dfa
;
1051 if (dfa
->init_state
->has_constraint
)
1053 unsigned int context
;
1054 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
1055 if (IS_WORD_CONTEXT (context
))
1056 return dfa
->init_state_word
;
1057 else if (IS_ORDINARY_CONTEXT (context
))
1058 return dfa
->init_state
;
1059 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
1060 return dfa
->init_state_begbuf
;
1061 else if (IS_NEWLINE_CONTEXT (context
))
1062 return dfa
->init_state_nl
;
1063 else if (IS_BEGBUF_CONTEXT (context
))
1065 /* It is relatively rare case, then calculate on demand. */
1066 return re_acquire_state_context (err
, dfa
,
1067 dfa
->init_state
->entrance_nodes
,
1071 /* Must not happen? */
1072 return dfa
->init_state
;
1075 return dfa
->init_state
;
1078 /* Check whether the regular expression match input string INPUT or not,
1079 and return the index where the matching end, return -1 if not match,
1080 or return -2 in case of an error.
1081 FL_LONGEST_MATCH means we want the POSIX longest matching.
1082 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1083 next place where we may want to try matching.
1084 Note that the matcher assume that the maching starts from the current
1085 index of the buffer. */
1089 check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
1092 const re_dfa_t
*const dfa
= mctx
->dfa
;
1095 int match_last
= -1;
1096 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1097 re_dfastate_t
*cur_state
;
1098 int at_init_state
= p_match_first
!= NULL
;
1099 int next_start_idx
= cur_str_idx
;
1102 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1103 /* An initial state must not be NULL (invalid). */
1104 if (BE (cur_state
== NULL
, 0))
1106 assert (err
== REG_ESPACE
);
1110 if (mctx
->state_log
!= NULL
)
1112 mctx
->state_log
[cur_str_idx
] = cur_state
;
1114 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1115 later. E.g. Processing back references. */
1116 if (BE (dfa
->nbackref
, 0))
1119 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1120 if (BE (err
!= REG_NOERROR
, 0))
1123 if (cur_state
->has_backref
)
1125 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1126 if (BE (err
!= REG_NOERROR
, 0))
1132 /* If the RE accepts NULL string. */
1133 if (BE (cur_state
->halt
, 0))
1135 if (!cur_state
->has_constraint
1136 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1138 if (!fl_longest_match
)
1142 match_last
= cur_str_idx
;
1148 while (!re_string_eoi (&mctx
->input
))
1150 re_dfastate_t
*old_state
= cur_state
;
1151 int next_char_idx
= re_string_cur_idx (&mctx
->input
) + 1;
1153 if (BE (next_char_idx
>= mctx
->input
.bufs_len
, 0)
1154 || (BE (next_char_idx
>= mctx
->input
.valid_len
, 0)
1155 && mctx
->input
.valid_len
< mctx
->input
.len
))
1157 err
= extend_buffers (mctx
);
1158 if (BE (err
!= REG_NOERROR
, 0))
1160 assert (err
== REG_ESPACE
);
1165 cur_state
= transit_state (&err
, mctx
, cur_state
);
1166 if (mctx
->state_log
!= NULL
)
1167 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1169 if (cur_state
== NULL
)
1171 /* Reached the invalid state or an error. Try to recover a valid
1172 state using the state log, if available and if we have not
1173 already found a valid (even if not the longest) match. */
1174 if (BE (err
!= REG_NOERROR
, 0))
1177 if (mctx
->state_log
== NULL
1178 || (match
&& !fl_longest_match
)
1179 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1183 if (BE (at_init_state
, 0))
1185 if (old_state
== cur_state
)
1186 next_start_idx
= next_char_idx
;
1191 if (cur_state
->halt
)
1193 /* Reached a halt state.
1194 Check the halt state can satisfy the current context. */
1195 if (!cur_state
->has_constraint
1196 || check_halt_state_context (mctx
, cur_state
,
1197 re_string_cur_idx (&mctx
->input
)))
1199 /* We found an appropriate halt state. */
1200 match_last
= re_string_cur_idx (&mctx
->input
);
1203 /* We found a match, do not modify match_first below. */
1204 p_match_first
= NULL
;
1205 if (!fl_longest_match
)
1212 *p_match_first
+= next_start_idx
;
1217 /* Check NODE match the current context. */
1221 check_halt_node_context (const re_dfa_t
*dfa
, int node
, unsigned int context
)
1223 re_token_type_t type
= dfa
->nodes
[node
].type
;
1224 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1225 if (type
!= END_OF_RE
)
1229 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1234 /* Check the halt state STATE match the current context.
1235 Return 0 if not match, if the node, STATE has, is a halt node and
1236 match the context, return the node. */
1240 check_halt_state_context (const re_match_context_t
*mctx
,
1241 const re_dfastate_t
*state
, int idx
)
1244 unsigned int context
;
1246 assert (state
->halt
);
1248 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1249 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1250 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1251 return state
->nodes
.elems
[i
];
1255 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1256 corresponding to the DFA).
1257 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1262 proceed_next_node (const re_match_context_t
*mctx
, int nregs
, regmatch_t
*regs
,
1263 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
1264 struct re_fail_stack_t
*fs
)
1266 const re_dfa_t
*const dfa
= mctx
->dfa
;
1268 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1270 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1271 re_node_set
*edests
= &dfa
->edests
[node
];
1273 err
= re_node_set_insert (eps_via_nodes
, node
);
1274 if (BE (err
< 0, 0))
1276 /* Pick up a valid destination, or return -1 if none is found. */
1277 for (dest_node
= -1, i
= 0; i
< edests
->nelem
; ++i
)
1279 int candidate
= edests
->elems
[i
];
1280 if (!re_node_set_contains (cur_nodes
, candidate
))
1282 if (dest_node
== -1)
1283 dest_node
= candidate
;
1287 /* In order to avoid infinite loop like "(a*)*", return the second
1288 epsilon-transition if the first was already considered. */
1289 if (re_node_set_contains (eps_via_nodes
, dest_node
))
1292 /* Otherwise, push the second epsilon-transition on the fail stack. */
1294 && push_fail_stack (fs
, *pidx
, candidate
, nregs
, regs
,
1298 /* We know we are going to exit. */
1307 re_token_type_t type
= dfa
->nodes
[node
].type
;
1309 #ifdef RE_ENABLE_I18N
1310 if (dfa
->nodes
[node
].accept_mb
)
1311 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1313 #endif /* RE_ENABLE_I18N */
1314 if (type
== OP_BACK_REF
)
1316 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
+ 1;
1317 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1320 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1324 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1325 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1334 err
= re_node_set_insert (eps_via_nodes
, node
);
1335 if (BE (err
< 0, 0))
1337 dest_node
= dfa
->edests
[node
].elems
[0];
1338 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1345 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1347 int dest_node
= dfa
->nexts
[node
];
1348 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1349 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1350 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1353 re_node_set_empty (eps_via_nodes
);
1360 static reg_errcode_t
1362 push_fail_stack (struct re_fail_stack_t
*fs
, int str_idx
, int dest_node
,
1363 int nregs
, regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1366 int num
= fs
->num
++;
1367 if (fs
->num
== fs
->alloc
)
1369 struct re_fail_stack_ent_t
*new_array
;
1370 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1372 if (new_array
== NULL
)
1375 fs
->stack
= new_array
;
1377 fs
->stack
[num
].idx
= str_idx
;
1378 fs
->stack
[num
].node
= dest_node
;
1379 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1380 if (fs
->stack
[num
].regs
== NULL
)
1382 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1383 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1389 pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
1390 regmatch_t
*regs
, re_node_set
*eps_via_nodes
)
1392 int num
= --fs
->num
;
1394 *pidx
= fs
->stack
[num
].idx
;
1395 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1396 re_node_set_free (eps_via_nodes
);
1397 re_free (fs
->stack
[num
].regs
);
1398 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1399 return fs
->stack
[num
].node
;
1402 /* Set the positions where the subexpressions are starts/ends to registers
1404 Note: We assume that pmatch[0] is already set, and
1405 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1407 static reg_errcode_t
1409 set_regs (const regex_t
*preg
, const re_match_context_t
*mctx
, size_t nmatch
,
1410 regmatch_t
*pmatch
, int fl_backtrack
)
1412 const re_dfa_t
*dfa
= (const re_dfa_t
*) preg
->buffer
;
1414 re_node_set eps_via_nodes
;
1415 struct re_fail_stack_t
*fs
;
1416 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1417 regmatch_t
*prev_idx_match
;
1418 int prev_idx_match_malloced
= 0;
1421 assert (nmatch
> 1);
1422 assert (mctx
->state_log
!= NULL
);
1427 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1428 if (fs
->stack
== NULL
)
1434 cur_node
= dfa
->init_node
;
1435 re_node_set_init_empty (&eps_via_nodes
);
1438 if (__libc_use_alloca (nmatch
* sizeof (regmatch_t
)))
1439 prev_idx_match
= (regmatch_t
*) alloca (nmatch
* sizeof (regmatch_t
));
1443 prev_idx_match
= re_malloc (regmatch_t
, nmatch
);
1444 if (prev_idx_match
== NULL
)
1446 free_fail_stack_return (fs
);
1449 prev_idx_match_malloced
= 1;
1451 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1453 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1455 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, nmatch
);
1457 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1462 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1463 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1465 if (reg_idx
== nmatch
)
1467 re_node_set_free (&eps_via_nodes
);
1468 if (prev_idx_match_malloced
)
1469 re_free (prev_idx_match
);
1470 return free_fail_stack_return (fs
);
1472 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1477 re_node_set_free (&eps_via_nodes
);
1478 if (prev_idx_match_malloced
)
1479 re_free (prev_idx_match
);
1484 /* Proceed to next node. */
1485 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1486 &eps_via_nodes
, fs
);
1488 if (BE (cur_node
< 0, 0))
1490 if (BE (cur_node
== -2, 0))
1492 re_node_set_free (&eps_via_nodes
);
1493 if (prev_idx_match_malloced
)
1494 re_free (prev_idx_match
);
1495 free_fail_stack_return (fs
);
1499 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1503 re_node_set_free (&eps_via_nodes
);
1504 if (prev_idx_match_malloced
)
1505 re_free (prev_idx_match
);
1510 re_node_set_free (&eps_via_nodes
);
1511 if (prev_idx_match_malloced
)
1512 re_free (prev_idx_match
);
1513 return free_fail_stack_return (fs
);
1516 static reg_errcode_t
1518 free_fail_stack_return (struct re_fail_stack_t
*fs
)
1523 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1525 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1526 re_free (fs
->stack
[fs_idx
].regs
);
1528 re_free (fs
->stack
);
1535 update_regs (const re_dfa_t
*dfa
, regmatch_t
*pmatch
,
1536 regmatch_t
*prev_idx_match
, int cur_node
, int cur_idx
, int nmatch
)
1538 int type
= dfa
->nodes
[cur_node
].type
;
1539 if (type
== OP_OPEN_SUBEXP
)
1541 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1543 /* We are at the first node of this sub expression. */
1544 if (reg_num
< nmatch
)
1546 pmatch
[reg_num
].rm_so
= cur_idx
;
1547 pmatch
[reg_num
].rm_eo
= -1;
1550 else if (type
== OP_CLOSE_SUBEXP
)
1552 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1553 if (reg_num
< nmatch
)
1555 /* We are at the last node of this sub expression. */
1556 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1558 pmatch
[reg_num
].rm_eo
= cur_idx
;
1559 /* This is a non-empty match or we are not inside an optional
1560 subexpression. Accept this right away. */
1561 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1565 if (dfa
->nodes
[cur_node
].opt_subexp
1566 && prev_idx_match
[reg_num
].rm_so
!= -1)
1567 /* We transited through an empty match for an optional
1568 subexpression, like (a?)*, and this is not the subexp's
1569 first match. Copy back the old content of the registers
1570 so that matches of an inner subexpression are undone as
1571 well, like in ((a?))*. */
1572 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1574 /* We completed a subexpression, but it may be part of
1575 an optional one, so do not update PREV_IDX_MATCH. */
1576 pmatch
[reg_num
].rm_eo
= cur_idx
;
1582 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1583 and sift the nodes in each states according to the following rules.
1584 Updated state_log will be wrote to STATE_LOG.
1586 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1587 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1588 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1589 the LAST_NODE, we throw away the node `a'.
1590 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1591 string `s' and transit to `b':
1592 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1594 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1595 thrown away, we throw away the node `a'.
1596 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1597 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1599 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1600 we throw away the node `a'. */
1602 #define STATE_NODE_CONTAINS(state,node) \
1603 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1605 static reg_errcode_t
1607 sift_states_backward (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
)
1611 int str_idx
= sctx
->last_str_idx
;
1612 re_node_set cur_dest
;
1615 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1618 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1619 transit to the last_node and the last_node itself. */
1620 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1621 if (BE (err
!= REG_NOERROR
, 0))
1623 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1624 if (BE (err
!= REG_NOERROR
, 0))
1627 /* Then check each states in the state_log. */
1630 /* Update counters. */
1631 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1632 if (null_cnt
> mctx
->max_mb_elem_len
)
1634 memset (sctx
->sifted_states
, '\0',
1635 sizeof (re_dfastate_t
*) * str_idx
);
1636 re_node_set_free (&cur_dest
);
1639 re_node_set_empty (&cur_dest
);
1642 if (mctx
->state_log
[str_idx
])
1644 err
= build_sifted_states (mctx
, sctx
, str_idx
, &cur_dest
);
1645 if (BE (err
!= REG_NOERROR
, 0))
1649 /* Add all the nodes which satisfy the following conditions:
1650 - It can epsilon transit to a node in CUR_DEST.
1652 And update state_log. */
1653 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1654 if (BE (err
!= REG_NOERROR
, 0))
1659 re_node_set_free (&cur_dest
);
1663 static reg_errcode_t
1665 build_sifted_states (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
1666 int str_idx
, re_node_set
*cur_dest
)
1668 const re_dfa_t
*const dfa
= mctx
->dfa
;
1669 const re_node_set
*cur_src
= &mctx
->state_log
[str_idx
]->non_eps_nodes
;
1672 /* Then build the next sifted state.
1673 We build the next sifted state on `cur_dest', and update
1674 `sifted_states[str_idx]' with `cur_dest'.
1676 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1677 `cur_src' points the node_set of the old `state_log[str_idx]'
1678 (with the epsilon nodes pre-filtered out). */
1679 for (i
= 0; i
< cur_src
->nelem
; i
++)
1681 int prev_node
= cur_src
->elems
[i
];
1686 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1687 assert (!IS_EPSILON_NODE (type
));
1689 #ifdef RE_ENABLE_I18N
1690 /* If the node may accept `multi byte'. */
1691 if (dfa
->nodes
[prev_node
].accept_mb
)
1692 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1693 str_idx
, sctx
->last_str_idx
);
1694 #endif /* RE_ENABLE_I18N */
1696 /* We don't check backreferences here.
1697 See update_cur_sifted_state(). */
1699 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1700 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1701 dfa
->nexts
[prev_node
]))
1707 if (sctx
->limits
.nelem
)
1709 int to_idx
= str_idx
+ naccepted
;
1710 if (check_dst_limits (mctx
, &sctx
->limits
,
1711 dfa
->nexts
[prev_node
], to_idx
,
1712 prev_node
, str_idx
))
1715 ret
= re_node_set_insert (cur_dest
, prev_node
);
1716 if (BE (ret
== -1, 0))
1723 /* Helper functions. */
1725 static reg_errcode_t
1727 clean_state_log_if_needed (re_match_context_t
*mctx
, int next_state_log_idx
)
1729 int top
= mctx
->state_log_top
;
1731 if (next_state_log_idx
>= mctx
->input
.bufs_len
1732 || (next_state_log_idx
>= mctx
->input
.valid_len
1733 && mctx
->input
.valid_len
< mctx
->input
.len
))
1736 err
= extend_buffers (mctx
);
1737 if (BE (err
!= REG_NOERROR
, 0))
1741 if (top
< next_state_log_idx
)
1743 memset (mctx
->state_log
+ top
+ 1, '\0',
1744 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1745 mctx
->state_log_top
= next_state_log_idx
;
1750 static reg_errcode_t
1752 merge_state_array (const re_dfa_t
*dfa
, re_dfastate_t
**dst
,
1753 re_dfastate_t
**src
, int num
)
1757 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1759 if (dst
[st_idx
] == NULL
)
1760 dst
[st_idx
] = src
[st_idx
];
1761 else if (src
[st_idx
] != NULL
)
1763 re_node_set merged_set
;
1764 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1765 &src
[st_idx
]->nodes
);
1766 if (BE (err
!= REG_NOERROR
, 0))
1768 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1769 re_node_set_free (&merged_set
);
1770 if (BE (err
!= REG_NOERROR
, 0))
1777 static reg_errcode_t
1779 update_cur_sifted_state (const re_match_context_t
*mctx
,
1780 re_sift_context_t
*sctx
, int str_idx
,
1781 re_node_set
*dest_nodes
)
1783 const re_dfa_t
*const dfa
= mctx
->dfa
;
1784 reg_errcode_t err
= REG_NOERROR
;
1785 const re_node_set
*candidates
;
1786 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? NULL
1787 : &mctx
->state_log
[str_idx
]->nodes
);
1789 if (dest_nodes
->nelem
== 0)
1790 sctx
->sifted_states
[str_idx
] = NULL
;
1795 /* At first, add the nodes which can epsilon transit to a node in
1797 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1798 if (BE (err
!= REG_NOERROR
, 0))
1801 /* Then, check the limitations in the current sift_context. */
1802 if (sctx
->limits
.nelem
)
1804 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1805 mctx
->bkref_ents
, str_idx
);
1806 if (BE (err
!= REG_NOERROR
, 0))
1811 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1812 if (BE (err
!= REG_NOERROR
, 0))
1816 if (candidates
&& mctx
->state_log
[str_idx
]->has_backref
)
1818 err
= sift_states_bkref (mctx
, sctx
, str_idx
, candidates
);
1819 if (BE (err
!= REG_NOERROR
, 0))
1825 static reg_errcode_t
1827 add_epsilon_src_nodes (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
1828 const re_node_set
*candidates
)
1830 reg_errcode_t err
= REG_NOERROR
;
1833 re_dfastate_t
*state
= re_acquire_state (&err
, dfa
, dest_nodes
);
1834 if (BE (err
!= REG_NOERROR
, 0))
1837 if (!state
->inveclosure
.alloc
)
1839 err
= re_node_set_alloc (&state
->inveclosure
, dest_nodes
->nelem
);
1840 if (BE (err
!= REG_NOERROR
, 0))
1842 for (i
= 0; i
< dest_nodes
->nelem
; i
++)
1844 err
= re_node_set_merge (&state
->inveclosure
,
1845 dfa
->inveclosures
+ dest_nodes
->elems
[i
]);
1846 if (BE (err
!= REG_NOERROR
, 0))
1850 return re_node_set_add_intersect (dest_nodes
, candidates
,
1851 &state
->inveclosure
);
1854 static reg_errcode_t
1856 sub_epsilon_src_nodes (const re_dfa_t
*dfa
, int node
, re_node_set
*dest_nodes
,
1857 const re_node_set
*candidates
)
1861 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1862 re_node_set except_nodes
;
1863 re_node_set_init_empty (&except_nodes
);
1864 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1866 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1867 if (cur_node
== node
)
1869 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1871 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1872 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1873 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1874 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1875 && re_node_set_contains (dest_nodes
, edst1
))
1877 && !re_node_set_contains (inv_eclosure
, edst2
)
1878 && re_node_set_contains (dest_nodes
, edst2
)))
1880 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1881 dfa
->inveclosures
+ cur_node
);
1882 if (BE (err
!= REG_NOERROR
, 0))
1884 re_node_set_free (&except_nodes
);
1890 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1892 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1893 if (!re_node_set_contains (&except_nodes
, cur_node
))
1895 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1896 re_node_set_remove_at (dest_nodes
, idx
);
1899 re_node_set_free (&except_nodes
);
1905 check_dst_limits (const re_match_context_t
*mctx
, re_node_set
*limits
,
1906 int dst_node
, int dst_idx
, int src_node
, int src_idx
)
1908 const re_dfa_t
*const dfa
= mctx
->dfa
;
1909 int lim_idx
, src_pos
, dst_pos
;
1911 int dst_bkref_idx
= search_cur_bkref_entry (mctx
, dst_idx
);
1912 int src_bkref_idx
= search_cur_bkref_entry (mctx
, src_idx
);
1913 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1916 struct re_backref_cache_entry
*ent
;
1917 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1918 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
1920 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1921 subexp_idx
, dst_node
, dst_idx
,
1923 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1924 subexp_idx
, src_node
, src_idx
,
1928 <src> <dst> ( <subexp> )
1929 ( <subexp> ) <src> <dst>
1930 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1931 if (src_pos
== dst_pos
)
1932 continue; /* This is unrelated limitation. */
1941 check_dst_limits_calc_pos_1 (const re_match_context_t
*mctx
, int boundaries
,
1942 int subexp_idx
, int from_node
, int bkref_idx
)
1944 const re_dfa_t
*const dfa
= mctx
->dfa
;
1945 const re_node_set
*eclosures
= dfa
->eclosures
+ from_node
;
1948 /* Else, we are on the boundary: examine the nodes on the epsilon
1950 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1952 int node
= eclosures
->elems
[node_idx
];
1953 switch (dfa
->nodes
[node
].type
)
1956 if (bkref_idx
!= -1)
1958 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bkref_idx
;
1963 if (ent
->node
!= node
)
1966 if (subexp_idx
< BITSET_WORD_BITS
1967 && !(ent
->eps_reachable_subexps_map
1968 & ((bitset_word_t
) 1 << subexp_idx
)))
1971 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1972 OP_CLOSE_SUBEXP cases below. But, if the
1973 destination node is the same node as the source
1974 node, don't recurse because it would cause an
1975 infinite loop: a regex that exhibits this behavior
1977 dst
= dfa
->edests
[node
].elems
[0];
1978 if (dst
== from_node
)
1982 else /* if (boundaries & 2) */
1987 check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
1989 if (cpos
== -1 /* && (boundaries & 1) */)
1991 if (cpos
== 0 && (boundaries
& 2))
1994 if (subexp_idx
< BITSET_WORD_BITS
)
1995 ent
->eps_reachable_subexps_map
1996 &= ~((bitset_word_t
) 1 << subexp_idx
);
1998 while (ent
++->more
);
2002 case OP_OPEN_SUBEXP
:
2003 if ((boundaries
& 1) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2007 case OP_CLOSE_SUBEXP
:
2008 if ((boundaries
& 2) && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2017 return (boundaries
& 2) ? 1 : 0;
2022 check_dst_limits_calc_pos (const re_match_context_t
*mctx
, int limit
,
2023 int subexp_idx
, int from_node
, int str_idx
,
2026 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
2029 /* If we are outside the range of the subexpression, return -1 or 1. */
2030 if (str_idx
< lim
->subexp_from
)
2033 if (lim
->subexp_to
< str_idx
)
2036 /* If we are within the subexpression, return 0. */
2037 boundaries
= (str_idx
== lim
->subexp_from
);
2038 boundaries
|= (str_idx
== lim
->subexp_to
) << 1;
2039 if (boundaries
== 0)
2042 /* Else, examine epsilon closure. */
2043 return check_dst_limits_calc_pos_1 (mctx
, boundaries
, subexp_idx
,
2044 from_node
, bkref_idx
);
2047 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2048 which are against limitations from DEST_NODES. */
2050 static reg_errcode_t
2052 check_subexp_limits (const re_dfa_t
*dfa
, re_node_set
*dest_nodes
,
2053 const re_node_set
*candidates
, re_node_set
*limits
,
2054 struct re_backref_cache_entry
*bkref_ents
, int str_idx
)
2057 int node_idx
, lim_idx
;
2059 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
2062 struct re_backref_cache_entry
*ent
;
2063 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
2065 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
2066 continue; /* This is unrelated limitation. */
2068 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
;
2069 if (ent
->subexp_to
== str_idx
)
2073 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2075 int node
= dest_nodes
->elems
[node_idx
];
2076 re_token_type_t type
= dfa
->nodes
[node
].type
;
2077 if (type
== OP_OPEN_SUBEXP
2078 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2080 else if (type
== OP_CLOSE_SUBEXP
2081 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
2085 /* Check the limitation of the open subexpression. */
2086 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2089 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
2091 if (BE (err
!= REG_NOERROR
, 0))
2095 /* Check the limitation of the close subexpression. */
2097 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2099 int node
= dest_nodes
->elems
[node_idx
];
2100 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
2102 && !re_node_set_contains (dfa
->eclosures
+ node
,
2105 /* It is against this limitation.
2106 Remove it form the current sifted state. */
2107 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2109 if (BE (err
!= REG_NOERROR
, 0))
2115 else /* (ent->subexp_to != str_idx) */
2117 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
2119 int node
= dest_nodes
->elems
[node_idx
];
2120 re_token_type_t type
= dfa
->nodes
[node
].type
;
2121 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
2123 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
2125 /* It is against this limitation.
2126 Remove it form the current sifted state. */
2127 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
2129 if (BE (err
!= REG_NOERROR
, 0))
2138 static reg_errcode_t
2140 sift_states_bkref (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2141 int str_idx
, const re_node_set
*candidates
)
2143 const re_dfa_t
*const dfa
= mctx
->dfa
;
2146 re_sift_context_t local_sctx
;
2147 int first_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2149 if (first_idx
== -1)
2152 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2154 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2157 re_token_type_t type
;
2158 struct re_backref_cache_entry
*entry
;
2159 node
= candidates
->elems
[node_idx
];
2160 type
= dfa
->nodes
[node
].type
;
2161 /* Avoid infinite loop for the REs like "()\1+". */
2162 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2164 if (type
!= OP_BACK_REF
)
2167 entry
= mctx
->bkref_ents
+ first_idx
;
2168 enabled_idx
= first_idx
;
2175 re_dfastate_t
*cur_state
;
2177 if (entry
->node
!= node
)
2179 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2180 to_idx
= str_idx
+ subexp_len
;
2181 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2182 : dfa
->edests
[node
].elems
[0]);
2184 if (to_idx
> sctx
->last_str_idx
2185 || sctx
->sifted_states
[to_idx
] == NULL
2186 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
], dst_node
)
2187 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2188 str_idx
, dst_node
, to_idx
))
2191 if (local_sctx
.sifted_states
== NULL
)
2194 err
= re_node_set_init_copy (&local_sctx
.limits
, &sctx
->limits
);
2195 if (BE (err
!= REG_NOERROR
, 0))
2198 local_sctx
.last_node
= node
;
2199 local_sctx
.last_str_idx
= str_idx
;
2200 ret
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2201 if (BE (ret
< 0, 0))
2206 cur_state
= local_sctx
.sifted_states
[str_idx
];
2207 err
= sift_states_backward (mctx
, &local_sctx
);
2208 if (BE (err
!= REG_NOERROR
, 0))
2210 if (sctx
->limited_states
!= NULL
)
2212 err
= merge_state_array (dfa
, sctx
->limited_states
,
2213 local_sctx
.sifted_states
,
2215 if (BE (err
!= REG_NOERROR
, 0))
2218 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2219 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2221 /* mctx->bkref_ents may have changed, reload the pointer. */
2222 entry
= mctx
->bkref_ents
+ enabled_idx
;
2224 while (enabled_idx
++, entry
++->more
);
2228 if (local_sctx
.sifted_states
!= NULL
)
2230 re_node_set_free (&local_sctx
.limits
);
2237 #ifdef RE_ENABLE_I18N
2240 sift_states_iter_mb (const re_match_context_t
*mctx
, re_sift_context_t
*sctx
,
2241 int node_idx
, int str_idx
, int max_str_idx
)
2243 const re_dfa_t
*const dfa
= mctx
->dfa
;
2245 /* Check the node can accept `multi byte'. */
2246 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2247 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2248 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2249 dfa
->nexts
[node_idx
]))
2250 /* The node can't accept the `multi byte', or the
2251 destination was already thrown away, then the node
2252 could't accept the current input `multi byte'. */
2254 /* Otherwise, it is sure that the node could accept
2255 `naccepted' bytes input. */
2258 #endif /* RE_ENABLE_I18N */
2261 /* Functions for state transition. */
2263 /* Return the next state to which the current state STATE will transit by
2264 accepting the current input byte, and update STATE_LOG if necessary.
2265 If STATE can accept a multibyte char/collating element/back reference
2266 update the destination of STATE_LOG. */
2268 static re_dfastate_t
*
2270 transit_state (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2271 re_dfastate_t
*state
)
2273 re_dfastate_t
**trtable
;
2276 #ifdef RE_ENABLE_I18N
2277 /* If the current state can accept multibyte. */
2278 if (BE (state
->accept_mb
, 0))
2280 *err
= transit_state_mb (mctx
, state
);
2281 if (BE (*err
!= REG_NOERROR
, 0))
2284 #endif /* RE_ENABLE_I18N */
2286 /* Then decide the next state with the single byte. */
2289 /* don't use transition table */
2290 return transit_state_sb (err
, mctx
, state
);
2293 /* Use transition table */
2294 ch
= re_string_fetch_byte (&mctx
->input
);
2297 trtable
= state
->trtable
;
2298 if (BE (trtable
!= NULL
, 1))
2301 trtable
= state
->word_trtable
;
2302 if (BE (trtable
!= NULL
, 1))
2304 unsigned int context
;
2306 = re_string_context_at (&mctx
->input
,
2307 re_string_cur_idx (&mctx
->input
) - 1,
2309 if (IS_WORD_CONTEXT (context
))
2310 return trtable
[ch
+ SBC_MAX
];
2315 if (!build_trtable (mctx
->dfa
, state
))
2321 /* Retry, we now have a transition table. */
2325 /* Update the state_log if we need */
2328 merge_state_with_log (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2329 re_dfastate_t
*next_state
)
2331 const re_dfa_t
*const dfa
= mctx
->dfa
;
2332 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2334 if (cur_idx
> mctx
->state_log_top
)
2336 mctx
->state_log
[cur_idx
] = next_state
;
2337 mctx
->state_log_top
= cur_idx
;
2339 else if (mctx
->state_log
[cur_idx
] == 0)
2341 mctx
->state_log
[cur_idx
] = next_state
;
2345 re_dfastate_t
*pstate
;
2346 unsigned int context
;
2347 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2348 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2349 the destination of a multibyte char/collating element/
2350 back reference. Then the next state is the union set of
2351 these destinations and the results of the transition table. */
2352 pstate
= mctx
->state_log
[cur_idx
];
2353 log_nodes
= pstate
->entrance_nodes
;
2354 if (next_state
!= NULL
)
2356 table_nodes
= next_state
->entrance_nodes
;
2357 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2359 if (BE (*err
!= REG_NOERROR
, 0))
2363 next_nodes
= *log_nodes
;
2364 /* Note: We already add the nodes of the initial state,
2365 then we don't need to add them here. */
2367 context
= re_string_context_at (&mctx
->input
,
2368 re_string_cur_idx (&mctx
->input
) - 1,
2370 next_state
= mctx
->state_log
[cur_idx
]
2371 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2372 /* We don't need to check errors here, since the return value of
2373 this function is next_state and ERR is already set. */
2375 if (table_nodes
!= NULL
)
2376 re_node_set_free (&next_nodes
);
2379 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2381 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2382 later. We must check them here, since the back references in the
2383 next state might use them. */
2384 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2386 if (BE (*err
!= REG_NOERROR
, 0))
2389 /* If the next state has back references. */
2390 if (next_state
->has_backref
)
2392 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2393 if (BE (*err
!= REG_NOERROR
, 0))
2395 next_state
= mctx
->state_log
[cur_idx
];
2402 /* Skip bytes in the input that correspond to part of a
2403 multi-byte match, then look in the log for a state
2404 from which to restart matching. */
2407 find_recover_state (reg_errcode_t
*err
, re_match_context_t
*mctx
)
2409 re_dfastate_t
*cur_state
;
2412 int max
= mctx
->state_log_top
;
2413 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2417 if (++cur_str_idx
> max
)
2419 re_string_skip_bytes (&mctx
->input
, 1);
2421 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2423 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2425 while (*err
== REG_NOERROR
&& cur_state
== NULL
);
2429 /* Helper functions for transit_state. */
2431 /* From the node set CUR_NODES, pick up the nodes whose types are
2432 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2433 expression. And register them to use them later for evaluating the
2434 correspoding back references. */
2436 static reg_errcode_t
2438 check_subexp_matching_top (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
2441 const re_dfa_t
*const dfa
= mctx
->dfa
;
2445 /* TODO: This isn't efficient.
2446 Because there might be more than one nodes whose types are
2447 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2450 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2452 int node
= cur_nodes
->elems
[node_idx
];
2453 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2454 && dfa
->nodes
[node
].opr
.idx
< BITSET_WORD_BITS
2455 && (dfa
->used_bkref_map
2456 & ((bitset_word_t
) 1 << dfa
->nodes
[node
].opr
.idx
)))
2458 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2459 if (BE (err
!= REG_NOERROR
, 0))
2467 /* Return the next state to which the current state STATE will transit by
2468 accepting the current input byte. */
2470 static re_dfastate_t
*
2471 transit_state_sb (reg_errcode_t
*err
, re_match_context_t
*mctx
,
2472 re_dfastate_t
*state
)
2474 const re_dfa_t
*const dfa
= mctx
->dfa
;
2475 re_node_set next_nodes
;
2476 re_dfastate_t
*next_state
;
2477 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2478 unsigned int context
;
2480 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2481 if (BE (*err
!= REG_NOERROR
, 0))
2483 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2485 int cur_node
= state
->nodes
.elems
[node_cnt
];
2486 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2488 *err
= re_node_set_merge (&next_nodes
,
2489 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2490 if (BE (*err
!= REG_NOERROR
, 0))
2492 re_node_set_free (&next_nodes
);
2497 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2498 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2499 /* We don't need to check errors here, since the return value of
2500 this function is next_state and ERR is already set. */
2502 re_node_set_free (&next_nodes
);
2503 re_string_skip_bytes (&mctx
->input
, 1);
2508 #ifdef RE_ENABLE_I18N
2509 static reg_errcode_t
2511 transit_state_mb (re_match_context_t
*mctx
, re_dfastate_t
*pstate
)
2513 const re_dfa_t
*const dfa
= mctx
->dfa
;
2517 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2519 re_node_set dest_nodes
, *new_nodes
;
2520 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2521 int naccepted
, dest_idx
;
2522 unsigned int context
;
2523 re_dfastate_t
*dest_state
;
2525 if (!dfa
->nodes
[cur_node_idx
].accept_mb
)
2528 if (dfa
->nodes
[cur_node_idx
].constraint
)
2530 context
= re_string_context_at (&mctx
->input
,
2531 re_string_cur_idx (&mctx
->input
),
2533 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2538 /* How many bytes the node can accept? */
2539 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2540 re_string_cur_idx (&mctx
->input
));
2544 /* The node can accepts `naccepted' bytes. */
2545 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2546 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2547 : mctx
->max_mb_elem_len
);
2548 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2549 if (BE (err
!= REG_NOERROR
, 0))
2552 assert (dfa
->nexts
[cur_node_idx
] != -1);
2554 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[cur_node_idx
];
2556 dest_state
= mctx
->state_log
[dest_idx
];
2557 if (dest_state
== NULL
)
2558 dest_nodes
= *new_nodes
;
2561 err
= re_node_set_init_union (&dest_nodes
,
2562 dest_state
->entrance_nodes
, new_nodes
);
2563 if (BE (err
!= REG_NOERROR
, 0))
2566 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1,
2568 mctx
->state_log
[dest_idx
]
2569 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2570 if (dest_state
!= NULL
)
2571 re_node_set_free (&dest_nodes
);
2572 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2577 #endif /* RE_ENABLE_I18N */
2579 static reg_errcode_t
2581 transit_state_bkref (re_match_context_t
*mctx
, const re_node_set
*nodes
)
2583 const re_dfa_t
*const dfa
= mctx
->dfa
;
2586 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2588 for (i
= 0; i
< nodes
->nelem
; ++i
)
2590 int dest_str_idx
, prev_nelem
, bkc_idx
;
2591 int node_idx
= nodes
->elems
[i
];
2592 unsigned int context
;
2593 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2594 re_node_set
*new_dest_nodes
;
2596 /* Check whether `node' is a backreference or not. */
2597 if (node
->type
!= OP_BACK_REF
)
2600 if (node
->constraint
)
2602 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2604 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2608 /* `node' is a backreference.
2609 Check the substring which the substring matched. */
2610 bkc_idx
= mctx
->nbkref_ents
;
2611 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2612 if (BE (err
!= REG_NOERROR
, 0))
2615 /* And add the epsilon closures (which is `new_dest_nodes') of
2616 the backreference to appropriate state_log. */
2618 assert (dfa
->nexts
[node_idx
] != -1);
2620 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2623 re_dfastate_t
*dest_state
;
2624 struct re_backref_cache_entry
*bkref_ent
;
2625 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2626 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2628 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2629 new_dest_nodes
= (subexp_len
== 0
2630 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2631 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2632 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2633 - bkref_ent
->subexp_from
);
2634 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2636 dest_state
= mctx
->state_log
[dest_str_idx
];
2637 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2638 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2639 /* Add `new_dest_node' to state_log. */
2640 if (dest_state
== NULL
)
2642 mctx
->state_log
[dest_str_idx
]
2643 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2645 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2646 && err
!= REG_NOERROR
, 0))
2651 re_node_set dest_nodes
;
2652 err
= re_node_set_init_union (&dest_nodes
,
2653 dest_state
->entrance_nodes
,
2655 if (BE (err
!= REG_NOERROR
, 0))
2657 re_node_set_free (&dest_nodes
);
2660 mctx
->state_log
[dest_str_idx
]
2661 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2662 re_node_set_free (&dest_nodes
);
2663 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2664 && err
!= REG_NOERROR
, 0))
2667 /* We need to check recursively if the backreference can epsilon
2670 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2672 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2674 if (BE (err
!= REG_NOERROR
, 0))
2676 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2677 if (BE (err
!= REG_NOERROR
, 0))
2687 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2688 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2689 Note that we might collect inappropriate candidates here.
2690 However, the cost of checking them strictly here is too high, then we
2691 delay these checking for prune_impossible_nodes(). */
2693 static reg_errcode_t
2695 get_subexp (re_match_context_t
*mctx
, int bkref_node
, int bkref_str_idx
)
2697 const re_dfa_t
*const dfa
= mctx
->dfa
;
2698 int subexp_num
, sub_top_idx
;
2699 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2700 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2701 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2702 if (cache_idx
!= -1)
2704 const struct re_backref_cache_entry
*entry
2705 = mctx
->bkref_ents
+ cache_idx
;
2707 if (entry
->node
== bkref_node
)
2708 return REG_NOERROR
; /* We already checked it. */
2709 while (entry
++->more
);
2712 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
;
2714 /* For each sub expression */
2715 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2718 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2719 re_sub_match_last_t
*sub_last
;
2720 int sub_last_idx
, sl_str
, bkref_str_off
;
2722 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2723 continue; /* It isn't related. */
2725 sl_str
= sub_top
->str_idx
;
2726 bkref_str_off
= bkref_str_idx
;
2727 /* At first, check the last node of sub expressions we already
2729 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2732 sub_last
= sub_top
->lasts
[sub_last_idx
];
2733 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2734 /* The matched string by the sub expression match with the substring
2735 at the back reference? */
2736 if (sl_str_diff
> 0)
2738 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2740 /* Not enough chars for a successful match. */
2741 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2744 err
= clean_state_log_if_needed (mctx
,
2747 if (BE (err
!= REG_NOERROR
, 0))
2749 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2751 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2752 /* We don't need to search this sub expression any more. */
2755 bkref_str_off
+= sl_str_diff
;
2756 sl_str
+= sl_str_diff
;
2757 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2760 /* Reload buf, since the preceding call might have reallocated
2762 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2764 if (err
== REG_NOMATCH
)
2766 if (BE (err
!= REG_NOERROR
, 0))
2770 if (sub_last_idx
< sub_top
->nlasts
)
2772 if (sub_last_idx
> 0)
2774 /* Then, search for the other last nodes of the sub expression. */
2775 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2777 int cls_node
, sl_str_off
;
2778 const re_node_set
*nodes
;
2779 sl_str_off
= sl_str
- sub_top
->str_idx
;
2780 /* The matched string by the sub expression match with the substring
2781 at the back reference? */
2784 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2786 /* If we are at the end of the input, we cannot match. */
2787 if (bkref_str_off
>= mctx
->input
.len
)
2790 err
= extend_buffers (mctx
);
2791 if (BE (err
!= REG_NOERROR
, 0))
2794 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2796 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2797 break; /* We don't need to search this sub expression
2800 if (mctx
->state_log
[sl_str
] == NULL
)
2802 /* Does this state have a ')' of the sub expression? */
2803 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2804 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
,
2808 if (sub_top
->path
== NULL
)
2810 sub_top
->path
= calloc (sizeof (state_array_t
),
2811 sl_str
- sub_top
->str_idx
+ 1);
2812 if (sub_top
->path
== NULL
)
2815 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2816 in the current context? */
2817 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2818 sub_top
->str_idx
, cls_node
, sl_str
,
2820 if (err
== REG_NOMATCH
)
2822 if (BE (err
!= REG_NOERROR
, 0))
2824 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2825 if (BE (sub_last
== NULL
, 0))
2827 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2829 if (err
== REG_NOMATCH
)
2836 /* Helper functions for get_subexp(). */
2838 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2839 If it can arrive, register the sub expression expressed with SUB_TOP
2842 static reg_errcode_t
2844 get_subexp_sub (re_match_context_t
*mctx
, const re_sub_match_top_t
*sub_top
,
2845 re_sub_match_last_t
*sub_last
, int bkref_node
, int bkref_str
)
2849 /* Can the subexpression arrive the back reference? */
2850 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2851 sub_last
->str_idx
, bkref_node
, bkref_str
,
2853 if (err
!= REG_NOERROR
)
2855 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2857 if (BE (err
!= REG_NOERROR
, 0))
2859 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2860 return clean_state_log_if_needed (mctx
, to_idx
);
2863 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2864 Search '(' if FL_OPEN, or search ')' otherwise.
2865 TODO: This function isn't efficient...
2866 Because there might be more than one nodes whose types are
2867 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2873 find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
2874 int subexp_idx
, int type
)
2877 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2879 int cls_node
= nodes
->elems
[cls_idx
];
2880 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2881 if (node
->type
== type
2882 && node
->opr
.idx
== subexp_idx
)
2888 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2889 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2891 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2893 static reg_errcode_t
2895 check_arrival (re_match_context_t
*mctx
, state_array_t
*path
, int top_node
,
2896 int top_str
, int last_node
, int last_str
, int type
)
2898 const re_dfa_t
*const dfa
= mctx
->dfa
;
2899 reg_errcode_t err
= REG_NOERROR
;
2900 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2901 re_dfastate_t
*cur_state
= NULL
;
2902 re_node_set
*cur_nodes
, next_nodes
;
2903 re_dfastate_t
**backup_state_log
;
2904 unsigned int context
;
2906 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2907 /* Extend the buffer if we need. */
2908 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2910 re_dfastate_t
**new_array
;
2911 int old_alloc
= path
->alloc
;
2912 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2913 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2914 if (BE (new_array
== NULL
, 0))
2916 path
->alloc
= old_alloc
;
2919 path
->array
= new_array
;
2920 memset (new_array
+ old_alloc
, '\0',
2921 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2924 str_idx
= path
->next_idx
? path
->next_idx
: top_str
;
2926 /* Temporary modify MCTX. */
2927 backup_state_log
= mctx
->state_log
;
2928 backup_cur_idx
= mctx
->input
.cur_idx
;
2929 mctx
->state_log
= path
->array
;
2930 mctx
->input
.cur_idx
= str_idx
;
2932 /* Setup initial node set. */
2933 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2934 if (str_idx
== top_str
)
2936 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2937 if (BE (err
!= REG_NOERROR
, 0))
2939 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2940 if (BE (err
!= REG_NOERROR
, 0))
2942 re_node_set_free (&next_nodes
);
2948 cur_state
= mctx
->state_log
[str_idx
];
2949 if (cur_state
&& cur_state
->has_backref
)
2951 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2952 if (BE (err
!= REG_NOERROR
, 0))
2956 re_node_set_init_empty (&next_nodes
);
2958 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2960 if (next_nodes
.nelem
)
2962 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
2964 if (BE (err
!= REG_NOERROR
, 0))
2966 re_node_set_free (&next_nodes
);
2970 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2971 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2973 re_node_set_free (&next_nodes
);
2976 mctx
->state_log
[str_idx
] = cur_state
;
2979 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2981 re_node_set_empty (&next_nodes
);
2982 if (mctx
->state_log
[str_idx
+ 1])
2984 err
= re_node_set_merge (&next_nodes
,
2985 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2986 if (BE (err
!= REG_NOERROR
, 0))
2988 re_node_set_free (&next_nodes
);
2994 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2995 &cur_state
->non_eps_nodes
,
2997 if (BE (err
!= REG_NOERROR
, 0))
2999 re_node_set_free (&next_nodes
);
3004 if (next_nodes
.nelem
)
3006 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
3007 if (BE (err
!= REG_NOERROR
, 0))
3009 re_node_set_free (&next_nodes
);
3012 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
,
3014 if (BE (err
!= REG_NOERROR
, 0))
3016 re_node_set_free (&next_nodes
);
3020 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
3021 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
3022 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
3024 re_node_set_free (&next_nodes
);
3027 mctx
->state_log
[str_idx
] = cur_state
;
3028 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
3030 re_node_set_free (&next_nodes
);
3031 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
3032 : &mctx
->state_log
[last_str
]->nodes
);
3033 path
->next_idx
= str_idx
;
3036 mctx
->state_log
= backup_state_log
;
3037 mctx
->input
.cur_idx
= backup_cur_idx
;
3039 /* Then check the current node set has the node LAST_NODE. */
3040 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
3046 /* Helper functions for check_arrival. */
3048 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3050 TODO: This function is similar to the functions transit_state*(),
3051 however this function has many additional works.
3052 Can't we unify them? */
3054 static reg_errcode_t
3056 check_arrival_add_next_nodes (re_match_context_t
*mctx
, int str_idx
,
3057 re_node_set
*cur_nodes
, re_node_set
*next_nodes
)
3059 const re_dfa_t
*const dfa
= mctx
->dfa
;
3062 reg_errcode_t err
= REG_NOERROR
;
3063 re_node_set union_set
;
3064 re_node_set_init_empty (&union_set
);
3065 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
3068 int cur_node
= cur_nodes
->elems
[cur_idx
];
3070 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
3071 assert (!IS_EPSILON_NODE (type
));
3073 #ifdef RE_ENABLE_I18N
3074 /* If the node may accept `multi byte'. */
3075 if (dfa
->nodes
[cur_node
].accept_mb
)
3077 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
3081 re_dfastate_t
*dest_state
;
3082 int next_node
= dfa
->nexts
[cur_node
];
3083 int next_idx
= str_idx
+ naccepted
;
3084 dest_state
= mctx
->state_log
[next_idx
];
3085 re_node_set_empty (&union_set
);
3088 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
3089 if (BE (err
!= REG_NOERROR
, 0))
3091 re_node_set_free (&union_set
);
3095 result
= re_node_set_insert (&union_set
, next_node
);
3096 if (BE (result
< 0, 0))
3098 re_node_set_free (&union_set
);
3101 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
3103 if (BE (mctx
->state_log
[next_idx
] == NULL
3104 && err
!= REG_NOERROR
, 0))
3106 re_node_set_free (&union_set
);
3111 #endif /* RE_ENABLE_I18N */
3113 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3115 result
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3116 if (BE (result
< 0, 0))
3118 re_node_set_free (&union_set
);
3123 re_node_set_free (&union_set
);
3127 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3128 CUR_NODES, however exclude the nodes which are:
3129 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3130 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3133 static reg_errcode_t
3135 check_arrival_expand_ecl (const re_dfa_t
*dfa
, re_node_set
*cur_nodes
,
3136 int ex_subexp
, int type
)
3139 int idx
, outside_node
;
3140 re_node_set new_nodes
;
3142 assert (cur_nodes
->nelem
);
3144 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3145 if (BE (err
!= REG_NOERROR
, 0))
3147 /* Create a new node set NEW_NODES with the nodes which are epsilon
3148 closures of the node in CUR_NODES. */
3150 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3152 int cur_node
= cur_nodes
->elems
[idx
];
3153 const re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3154 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3155 if (outside_node
== -1)
3157 /* There are no problematic nodes, just merge them. */
3158 err
= re_node_set_merge (&new_nodes
, eclosure
);
3159 if (BE (err
!= REG_NOERROR
, 0))
3161 re_node_set_free (&new_nodes
);
3167 /* There are problematic nodes, re-calculate incrementally. */
3168 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3170 if (BE (err
!= REG_NOERROR
, 0))
3172 re_node_set_free (&new_nodes
);
3177 re_node_set_free (cur_nodes
);
3178 *cur_nodes
= new_nodes
;
3182 /* Helper function for check_arrival_expand_ecl.
3183 Check incrementally the epsilon closure of TARGET, and if it isn't
3184 problematic append it to DST_NODES. */
3186 static reg_errcode_t
3188 check_arrival_expand_ecl_sub (const re_dfa_t
*dfa
, re_node_set
*dst_nodes
,
3189 int target
, int ex_subexp
, int type
)
3192 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3196 if (dfa
->nodes
[cur_node
].type
== type
3197 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3199 if (type
== OP_CLOSE_SUBEXP
)
3201 err
= re_node_set_insert (dst_nodes
, cur_node
);
3202 if (BE (err
== -1, 0))
3207 err
= re_node_set_insert (dst_nodes
, cur_node
);
3208 if (BE (err
== -1, 0))
3210 if (dfa
->edests
[cur_node
].nelem
== 0)
3212 if (dfa
->edests
[cur_node
].nelem
== 2)
3214 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3215 dfa
->edests
[cur_node
].elems
[1],
3217 if (BE (err
!= REG_NOERROR
, 0))
3220 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3226 /* For all the back references in the current state, calculate the
3227 destination of the back references by the appropriate entry
3228 in MCTX->BKREF_ENTS. */
3230 static reg_errcode_t
3232 expand_bkref_cache (re_match_context_t
*mctx
, re_node_set
*cur_nodes
,
3233 int cur_str
, int subexp_num
, int type
)
3235 const re_dfa_t
*const dfa
= mctx
->dfa
;
3237 int cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3238 struct re_backref_cache_entry
*ent
;
3240 if (cache_idx_start
== -1)
3244 ent
= mctx
->bkref_ents
+ cache_idx_start
;
3247 int to_idx
, next_node
;
3249 /* Is this entry ENT is appropriate? */
3250 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3253 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3254 /* Calculate the destination of the back reference, and append it
3255 to MCTX->STATE_LOG. */
3256 if (to_idx
== cur_str
)
3258 /* The backreference did epsilon transit, we must re-check all the
3259 node in the current state. */
3260 re_node_set new_dests
;
3261 reg_errcode_t err2
, err3
;
3262 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3263 if (re_node_set_contains (cur_nodes
, next_node
))
3265 err
= re_node_set_init_1 (&new_dests
, next_node
);
3266 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3267 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3268 re_node_set_free (&new_dests
);
3269 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3270 || err3
!= REG_NOERROR
, 0))
3272 err
= (err
!= REG_NOERROR
? err
3273 : (err2
!= REG_NOERROR
? err2
: err3
));
3276 /* TODO: It is still inefficient... */
3281 re_node_set union_set
;
3282 next_node
= dfa
->nexts
[ent
->node
];
3283 if (mctx
->state_log
[to_idx
])
3286 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3289 err
= re_node_set_init_copy (&union_set
,
3290 &mctx
->state_log
[to_idx
]->nodes
);
3291 ret
= re_node_set_insert (&union_set
, next_node
);
3292 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3294 re_node_set_free (&union_set
);
3295 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3301 err
= re_node_set_init_1 (&union_set
, next_node
);
3302 if (BE (err
!= REG_NOERROR
, 0))
3305 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3306 re_node_set_free (&union_set
);
3307 if (BE (mctx
->state_log
[to_idx
] == NULL
3308 && err
!= REG_NOERROR
, 0))
3312 while (ent
++->more
);
3316 /* Build transition table for the state.
3317 Return 1 if succeeded, otherwise return NULL. */
3321 build_trtable (const re_dfa_t
*dfa
, re_dfastate_t
*state
)
3324 int i
, j
, ch
, need_word_trtable
= 0;
3325 bitset_word_t elem
, mask
;
3326 bool dests_node_malloced
= false;
3327 bool dest_states_malloced
= false;
3328 int ndests
; /* Number of the destination states from `state'. */
3329 re_dfastate_t
**trtable
;
3330 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3331 re_node_set follows
, *dests_node
;
3333 bitset_t acceptable
;
3337 re_node_set dests_node
[SBC_MAX
];
3338 bitset_t dests_ch
[SBC_MAX
];
3341 /* We build DFA states which corresponds to the destination nodes
3342 from `state'. `dests_node[i]' represents the nodes which i-th
3343 destination state contains, and `dests_ch[i]' represents the
3344 characters which i-th destination state accepts. */
3346 if (__libc_use_alloca (sizeof (struct dests_alloc
)))
3347 dests_alloc
= (struct dests_alloc
*) alloca (sizeof (struct dests_alloc
));
3351 dests_alloc
= re_malloc (struct dests_alloc
, 1);
3352 if (BE (dests_alloc
== NULL
, 0))
3354 dests_node_malloced
= true;
3356 dests_node
= dests_alloc
->dests_node
;
3357 dests_ch
= dests_alloc
->dests_ch
;
3359 /* Initialize transiton table. */
3360 state
->word_trtable
= state
->trtable
= NULL
;
3362 /* At first, group all nodes belonging to `state' into several
3364 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3365 if (BE (ndests
<= 0, 0))
3367 if (dests_node_malloced
)
3369 /* Return 0 in case of an error, 1 otherwise. */
3372 state
->trtable
= (re_dfastate_t
**)
3373 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3379 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3380 if (BE (err
!= REG_NOERROR
, 0))
3383 /* Avoid arithmetic overflow in size calculation. */
3384 if (BE ((((SIZE_MAX
- (sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
)
3385 / (3 * sizeof (re_dfastate_t
*)))
3391 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset_t
)) * SBC_MAX
3392 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3393 dest_states
= (re_dfastate_t
**)
3394 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3398 dest_states
= (re_dfastate_t
**)
3399 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3400 if (BE (dest_states
== NULL
, 0))
3403 if (dest_states_malloced
)
3405 re_node_set_free (&follows
);
3406 for (i
= 0; i
< ndests
; ++i
)
3407 re_node_set_free (dests_node
+ i
);
3408 if (dests_node_malloced
)
3412 dest_states_malloced
= true;
3414 dest_states_word
= dest_states
+ ndests
;
3415 dest_states_nl
= dest_states_word
+ ndests
;
3416 bitset_empty (acceptable
);
3418 /* Then build the states for all destinations. */
3419 for (i
= 0; i
< ndests
; ++i
)
3422 re_node_set_empty (&follows
);
3423 /* Merge the follows of this destination states. */
3424 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3426 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3427 if (next_node
!= -1)
3429 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3430 if (BE (err
!= REG_NOERROR
, 0))
3434 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3435 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3437 /* If the new state has context constraint,
3438 build appropriate states for these contexts. */
3439 if (dest_states
[i
]->has_constraint
)
3441 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3443 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3446 if (dest_states
[i
] != dest_states_word
[i
] && dfa
->mb_cur_max
> 1)
3447 need_word_trtable
= 1;
3449 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3451 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3456 dest_states_word
[i
] = dest_states
[i
];
3457 dest_states_nl
[i
] = dest_states
[i
];
3459 bitset_merge (acceptable
, dests_ch
[i
]);
3462 if (!BE (need_word_trtable
, 0))
3464 /* We don't care about whether the following character is a word
3465 character, or we are in a single-byte character set so we can
3466 discern by looking at the character code: allocate a
3467 256-entry transition table. */
3468 trtable
= state
->trtable
=
3469 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3470 if (BE (trtable
== NULL
, 0))
3473 /* For all characters ch...: */
3474 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3475 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3477 mask
<<= 1, elem
>>= 1, ++ch
)
3478 if (BE (elem
& 1, 0))
3480 /* There must be exactly one destination which accepts
3481 character ch. See group_nodes_into_DFAstates. */
3482 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3485 /* j-th destination accepts the word character ch. */
3486 if (dfa
->word_char
[i
] & mask
)
3487 trtable
[ch
] = dest_states_word
[j
];
3489 trtable
[ch
] = dest_states
[j
];
3494 /* We care about whether the following character is a word
3495 character, and we are in a multi-byte character set: discern
3496 by looking at the character code: build two 256-entry
3497 transition tables, one starting at trtable[0] and one
3498 starting at trtable[SBC_MAX]. */
3499 trtable
= state
->word_trtable
=
3500 (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), 2 * SBC_MAX
);
3501 if (BE (trtable
== NULL
, 0))
3504 /* For all characters ch...: */
3505 for (i
= 0; i
< BITSET_WORDS
; ++i
)
3506 for (ch
= i
* BITSET_WORD_BITS
, elem
= acceptable
[i
], mask
= 1;
3508 mask
<<= 1, elem
>>= 1, ++ch
)
3509 if (BE (elem
& 1, 0))
3511 /* There must be exactly one destination which accepts
3512 character ch. See group_nodes_into_DFAstates. */
3513 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3516 /* j-th destination accepts the word character ch. */
3517 trtable
[ch
] = dest_states
[j
];
3518 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3523 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3525 /* The current state accepts newline character. */
3526 for (j
= 0; j
< ndests
; ++j
)
3527 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3529 /* k-th destination accepts newline character. */
3530 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3531 if (need_word_trtable
)
3532 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3533 /* There must be only one destination which accepts
3534 newline. See group_nodes_into_DFAstates. */
3539 if (dest_states_malloced
)
3542 re_node_set_free (&follows
);
3543 for (i
= 0; i
< ndests
; ++i
)
3544 re_node_set_free (dests_node
+ i
);
3546 if (dests_node_malloced
)
3552 /* Group all nodes belonging to STATE into several destinations.
3553 Then for all destinations, set the nodes belonging to the destination
3554 to DESTS_NODE[i] and set the characters accepted by the destination
3555 to DEST_CH[i]. This function return the number of destinations. */
3559 group_nodes_into_DFAstates (const re_dfa_t
*dfa
, const re_dfastate_t
*state
,
3560 re_node_set
*dests_node
, bitset_t
*dests_ch
)
3565 int ndests
; /* Number of the destinations from `state'. */
3566 bitset_t accepts
; /* Characters a node can accept. */
3567 const re_node_set
*cur_nodes
= &state
->nodes
;
3568 bitset_empty (accepts
);
3571 /* For all the nodes belonging to `state', */
3572 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3574 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3575 re_token_type_t type
= node
->type
;
3576 unsigned int constraint
= node
->constraint
;
3578 /* Enumerate all single byte character this node can accept. */
3579 if (type
== CHARACTER
)
3580 bitset_set (accepts
, node
->opr
.c
);
3581 else if (type
== SIMPLE_BRACKET
)
3583 bitset_merge (accepts
, node
->opr
.sbcset
);
3585 else if (type
== OP_PERIOD
)
3587 #ifdef RE_ENABLE_I18N
3588 if (dfa
->mb_cur_max
> 1)
3589 bitset_merge (accepts
, dfa
->sb_char
);
3592 bitset_set_all (accepts
);
3593 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3594 bitset_clear (accepts
, '\n');
3595 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3596 bitset_clear (accepts
, '\0');
3598 #ifdef RE_ENABLE_I18N
3599 else if (type
== OP_UTF8_PERIOD
)
3601 memset (accepts
, '\xff', sizeof (bitset_t
) / 2);
3602 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3603 bitset_clear (accepts
, '\n');
3604 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3605 bitset_clear (accepts
, '\0');
3611 /* Check the `accepts' and sift the characters which are not
3612 match it the context. */
3615 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3617 bool accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3618 bitset_empty (accepts
);
3619 if (accepts_newline
)
3620 bitset_set (accepts
, NEWLINE_CHAR
);
3624 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3626 bitset_empty (accepts
);
3630 if (constraint
& NEXT_WORD_CONSTRAINT
)
3632 bitset_word_t any_set
= 0;
3633 if (type
== CHARACTER
&& !node
->word_char
)
3635 bitset_empty (accepts
);
3638 #ifdef RE_ENABLE_I18N
3639 if (dfa
->mb_cur_max
> 1)
3640 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3641 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3644 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3645 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3649 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3651 bitset_word_t any_set
= 0;
3652 if (type
== CHARACTER
&& node
->word_char
)
3654 bitset_empty (accepts
);
3657 #ifdef RE_ENABLE_I18N
3658 if (dfa
->mb_cur_max
> 1)
3659 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3660 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3663 for (j
= 0; j
< BITSET_WORDS
; ++j
)
3664 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3670 /* Then divide `accepts' into DFA states, or create a new
3671 state. Above, we make sure that accepts is not empty. */
3672 for (j
= 0; j
< ndests
; ++j
)
3674 bitset_t intersec
; /* Intersection sets, see below. */
3676 /* Flags, see below. */
3677 bitset_word_t has_intersec
, not_subset
, not_consumed
;
3679 /* Optimization, skip if this state doesn't accept the character. */
3680 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3683 /* Enumerate the intersection set of this state and `accepts'. */
3685 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3686 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3687 /* And skip if the intersection set is empty. */
3691 /* Then check if this state is a subset of `accepts'. */
3692 not_subset
= not_consumed
= 0;
3693 for (k
= 0; k
< BITSET_WORDS
; ++k
)
3695 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3696 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3699 /* If this state isn't a subset of `accepts', create a
3700 new group state, which has the `remains'. */
3703 bitset_copy (dests_ch
[ndests
], remains
);
3704 bitset_copy (dests_ch
[j
], intersec
);
3705 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3706 if (BE (err
!= REG_NOERROR
, 0))
3711 /* Put the position in the current group. */
3712 result
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3713 if (BE (result
< 0, 0))
3716 /* If all characters are consumed, go to next node. */
3720 /* Some characters remain, create a new group. */
3723 bitset_copy (dests_ch
[ndests
], accepts
);
3724 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3725 if (BE (err
!= REG_NOERROR
, 0))
3728 bitset_empty (accepts
);
3733 for (j
= 0; j
< ndests
; ++j
)
3734 re_node_set_free (dests_node
+ j
);
3738 #ifdef RE_ENABLE_I18N
3739 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3740 Return the number of the bytes the node accepts.
3741 STR_IDX is the current index of the input string.
3743 This function handles the nodes which can accept one character, or
3744 one collating element like '.', '[a-z]', opposite to the other nodes
3745 can only accept one byte. */
3749 check_node_accept_bytes (const re_dfa_t
*dfa
, int node_idx
,
3750 const re_string_t
*input
, int str_idx
)
3752 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3753 int char_len
, elem_len
;
3757 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3759 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3760 if (BE (c
< 0xc2, 1))
3763 if (str_idx
+ 2 > input
->len
)
3766 d
= re_string_byte_at (input
, str_idx
+ 1);
3768 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3772 if (c
== 0xe0 && d
< 0xa0)
3778 if (c
== 0xf0 && d
< 0x90)
3784 if (c
== 0xf8 && d
< 0x88)
3790 if (c
== 0xfc && d
< 0x84)
3796 if (str_idx
+ char_len
> input
->len
)
3799 for (i
= 1; i
< char_len
; ++i
)
3801 d
= re_string_byte_at (input
, str_idx
+ i
);
3802 if (d
< 0x80 || d
> 0xbf)
3808 char_len
= re_string_char_size_at (input
, str_idx
);
3809 if (node
->type
== OP_PERIOD
)
3813 /* FIXME: I don't think this if is needed, as both '\n'
3814 and '\0' are char_len == 1. */
3815 /* '.' accepts any one character except the following two cases. */
3816 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3817 re_string_byte_at (input
, str_idx
) == '\n') ||
3818 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3819 re_string_byte_at (input
, str_idx
) == '\0'))
3824 elem_len
= re_string_elem_size_at (input
, str_idx
);
3825 wc
= __btowc(*(input
->mbs
+str_idx
));
3826 if (((elem_len
<= 1 && char_len
<= 1) || char_len
== 0) && (wc
!= WEOF
&& wc
< SBC_MAX
))
3829 if (node
->type
== COMPLEX_BRACKET
)
3831 const re_charset_t
*cset
= node
->opr
.mbcset
;
3833 const unsigned char *pin
3834 = ((const unsigned char *) re_string_get_buffer (input
) + str_idx
);
3839 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3840 ? re_string_wchar_at (input
, str_idx
) : 0);
3842 /* match with multibyte character? */
3843 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3844 if (wc
== cset
->mbchars
[i
])
3846 match_len
= char_len
;
3847 goto check_node_accept_bytes_match
;
3849 /* match with character_class? */
3850 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3852 wctype_t wt
= cset
->char_classes
[i
];
3853 if (__iswctype (wc
, wt
))
3855 match_len
= char_len
;
3856 goto check_node_accept_bytes_match
;
3861 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3864 unsigned int in_collseq
= 0;
3865 const int32_t *table
, *indirect
;
3866 const unsigned char *weights
, *extra
;
3867 const char *collseqwc
;
3868 /* This #include defines a local function! */
3869 # include <locale/weight.h>
3871 /* match with collating_symbol? */
3872 if (cset
->ncoll_syms
)
3873 extra
= (const unsigned char *)
3874 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3875 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3877 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3878 /* Compare the length of input collating element and
3879 the length of current collating element. */
3880 if (*coll_sym
!= elem_len
)
3882 /* Compare each bytes. */
3883 for (j
= 0; j
< *coll_sym
; j
++)
3884 if (pin
[j
] != coll_sym
[1 + j
])
3888 /* Match if every bytes is equal. */
3890 goto check_node_accept_bytes_match
;
3896 if (elem_len
<= char_len
)
3898 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3899 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3902 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3904 /* match with range expression? */
3905 for (i
= 0; i
< cset
->nranges
; ++i
)
3906 if (cset
->range_starts
[i
] <= in_collseq
3907 && in_collseq
<= cset
->range_ends
[i
])
3909 match_len
= elem_len
;
3910 goto check_node_accept_bytes_match
;
3913 /* match with equivalence_class? */
3914 if (cset
->nequiv_classes
)
3916 const unsigned char *cp
= pin
;
3917 table
= (const int32_t *)
3918 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3919 weights
= (const unsigned char *)
3920 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3921 extra
= (const unsigned char *)
3922 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3923 indirect
= (const int32_t *)
3924 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3925 int32_t idx
= findidx (&cp
);
3927 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3929 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3930 size_t weight_len
= weights
[idx
& 0xffffff];
3931 if (weight_len
== weights
[equiv_class_idx
& 0xffffff]
3932 && (idx
>> 24) == (equiv_class_idx
>> 24))
3937 equiv_class_idx
&= 0xffffff;
3939 while (cnt
<= weight_len
3940 && (weights
[equiv_class_idx
+ 1 + cnt
]
3941 == weights
[idx
+ 1 + cnt
]))
3943 if (cnt
> weight_len
)
3945 match_len
= elem_len
;
3946 goto check_node_accept_bytes_match
;
3955 /* match with range expression? */
3957 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3959 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3962 for (i
= 0; i
< cset
->nranges
; ++i
)
3964 cmp_buf
[0] = cset
->range_starts
[i
];
3965 cmp_buf
[4] = cset
->range_ends
[i
];
3966 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3967 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3969 match_len
= char_len
;
3970 goto check_node_accept_bytes_match
;
3974 check_node_accept_bytes_match
:
3975 if (!cset
->non_match
)
3982 return (elem_len
> char_len
) ? elem_len
: char_len
;
3991 find_collation_sequence_value (const unsigned char *mbs
, size_t mbs_len
)
3993 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3998 /* No valid character. Match it as a single byte character. */
3999 const unsigned char *collseq
= (const unsigned char *)
4000 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
4001 return collseq
[mbs
[0]];
4008 const unsigned char *extra
= (const unsigned char *)
4009 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
4010 int32_t extrasize
= (const unsigned char *)
4011 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
4013 for (idx
= 0; idx
< extrasize
;)
4015 int mbs_cnt
, found
= 0;
4016 int32_t elem_mbs_len
;
4017 /* Skip the name of collating element name. */
4018 idx
= idx
+ extra
[idx
] + 1;
4019 elem_mbs_len
= extra
[idx
++];
4020 if (mbs_len
== elem_mbs_len
)
4022 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
4023 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
4025 if (mbs_cnt
== elem_mbs_len
)
4026 /* Found the entry. */
4029 /* Skip the byte sequence of the collating element. */
4030 idx
+= elem_mbs_len
;
4031 /* Adjust for the alignment. */
4032 idx
= (idx
+ 3) & ~3;
4033 /* Skip the collation sequence value. */
4034 idx
+= sizeof (uint32_t);
4035 /* Skip the wide char sequence of the collating element. */
4036 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
4037 /* If we found the entry, return the sequence value. */
4039 return *(uint32_t *) (extra
+ idx
);
4040 /* Skip the collation sequence value. */
4041 idx
+= sizeof (uint32_t);
4047 #endif /* RE_ENABLE_I18N */
4049 /* Check whether the node accepts the byte which is IDX-th
4050 byte of the INPUT. */
4054 check_node_accept (const re_match_context_t
*mctx
, const re_token_t
*node
,
4058 ch
= re_string_byte_at (&mctx
->input
, idx
);
4062 if (node
->opr
.c
!= ch
)
4066 case SIMPLE_BRACKET
:
4067 if (!bitset_contain (node
->opr
.sbcset
, ch
))
4071 #ifdef RE_ENABLE_I18N
4072 case OP_UTF8_PERIOD
:
4078 if ((ch
== '\n' && !(mctx
->dfa
->syntax
& RE_DOT_NEWLINE
))
4079 || (ch
== '\0' && (mctx
->dfa
->syntax
& RE_DOT_NOT_NULL
)))
4087 if (node
->constraint
)
4089 /* The node has constraints. Check whether the current context
4090 satisfies the constraints. */
4091 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
4093 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
4100 /* Extend the buffers, if the buffers have run out. */
4102 static reg_errcode_t
4104 extend_buffers (re_match_context_t
*mctx
)
4107 re_string_t
*pstr
= &mctx
->input
;
4109 /* Avoid overflow. */
4110 if (BE (INT_MAX
/ 2 / sizeof (re_dfastate_t
*) <= pstr
->bufs_len
, 0))
4113 /* Double the lengthes of the buffers. */
4114 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
4115 if (BE (ret
!= REG_NOERROR
, 0))
4118 if (mctx
->state_log
!= NULL
)
4120 /* And double the length of state_log. */
4121 /* XXX We have no indication of the size of this buffer. If this
4122 allocation fail we have no indication that the state_log array
4123 does not have the right size. */
4124 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
4125 pstr
->bufs_len
+ 1);
4126 if (BE (new_array
== NULL
, 0))
4128 mctx
->state_log
= new_array
;
4131 /* Then reconstruct the buffers. */
4134 #ifdef RE_ENABLE_I18N
4135 if (pstr
->mb_cur_max
> 1)
4137 ret
= build_wcs_upper_buffer (pstr
);
4138 if (BE (ret
!= REG_NOERROR
, 0))
4142 #endif /* RE_ENABLE_I18N */
4143 build_upper_buffer (pstr
);
4147 #ifdef RE_ENABLE_I18N
4148 if (pstr
->mb_cur_max
> 1)
4149 build_wcs_buffer (pstr
);
4151 #endif /* RE_ENABLE_I18N */
4153 if (pstr
->trans
!= NULL
)
4154 re_string_translate_buffer (pstr
);
4161 /* Functions for matching context. */
4163 /* Initialize MCTX. */
4165 static reg_errcode_t
4167 match_ctx_init (re_match_context_t
*mctx
, int eflags
, int n
)
4169 mctx
->eflags
= eflags
;
4170 mctx
->match_last
= -1;
4173 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4174 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4175 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4178 /* Already zero-ed by the caller.
4180 mctx->bkref_ents = NULL;
4181 mctx->nbkref_ents = 0;
4182 mctx->nsub_tops = 0; */
4183 mctx
->abkref_ents
= n
;
4184 mctx
->max_mb_elem_len
= 1;
4185 mctx
->asub_tops
= n
;
4189 /* Clean the entries which depend on the current input in MCTX.
4190 This function must be invoked when the matcher changes the start index
4191 of the input, or changes the input string. */
4195 match_ctx_clean (re_match_context_t
*mctx
)
4198 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4201 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4202 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4204 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4205 re_free (last
->path
.array
);
4208 re_free (top
->lasts
);
4211 re_free (top
->path
->array
);
4212 re_free (top
->path
);
4217 mctx
->nsub_tops
= 0;
4218 mctx
->nbkref_ents
= 0;
4221 /* Free all the memory associated with MCTX. */
4225 match_ctx_free (re_match_context_t
*mctx
)
4227 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4228 match_ctx_clean (mctx
);
4229 re_free (mctx
->sub_tops
);
4230 re_free (mctx
->bkref_ents
);
4233 /* Add a new backreference entry to MCTX.
4234 Note that we assume that caller never call this function with duplicate
4235 entry, and call with STR_IDX which isn't smaller than any existing entry.
4238 static reg_errcode_t
4240 match_ctx_add_entry (re_match_context_t
*mctx
, int node
, int str_idx
, int from
,
4243 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4245 struct re_backref_cache_entry
* new_entry
;
4246 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4247 mctx
->abkref_ents
* 2);
4248 if (BE (new_entry
== NULL
, 0))
4250 re_free (mctx
->bkref_ents
);
4253 mctx
->bkref_ents
= new_entry
;
4254 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4255 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4256 mctx
->abkref_ents
*= 2;
4258 if (mctx
->nbkref_ents
> 0
4259 && mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].str_idx
== str_idx
)
4260 mctx
->bkref_ents
[mctx
->nbkref_ents
- 1].more
= 1;
4262 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4263 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4264 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4265 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4267 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4268 If bit N is clear, means that this entry won't epsilon-transition to
4269 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4270 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4273 A backreference does not epsilon-transition unless it is empty, so set
4274 to all zeros if FROM != TO. */
4275 mctx
->bkref_ents
[mctx
->nbkref_ents
].eps_reachable_subexps_map
4276 = (from
== to
? ~0 : 0);
4278 mctx
->bkref_ents
[mctx
->nbkref_ents
++].more
= 0;
4279 if (mctx
->max_mb_elem_len
< to
- from
)
4280 mctx
->max_mb_elem_len
= to
- from
;
4284 /* Search for the first entry which has the same str_idx, or -1 if none is
4285 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4289 search_cur_bkref_entry (const re_match_context_t
*mctx
, int str_idx
)
4291 int left
, right
, mid
, last
;
4292 last
= right
= mctx
->nbkref_ents
;
4293 for (left
= 0; left
< right
;)
4295 mid
= (left
+ right
) / 2;
4296 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4301 if (left
< last
&& mctx
->bkref_ents
[left
].str_idx
== str_idx
)
4307 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4310 static reg_errcode_t
4312 match_ctx_add_subtop (re_match_context_t
*mctx
, int node
, int str_idx
)
4315 assert (mctx
->sub_tops
!= NULL
);
4316 assert (mctx
->asub_tops
> 0);
4318 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4320 int new_asub_tops
= mctx
->asub_tops
* 2;
4321 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4322 re_sub_match_top_t
*,
4324 if (BE (new_array
== NULL
, 0))
4326 mctx
->sub_tops
= new_array
;
4327 mctx
->asub_tops
= new_asub_tops
;
4329 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4330 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4332 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4333 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4337 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4338 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4340 static re_sub_match_last_t
*
4342 match_ctx_add_sublast (re_sub_match_top_t
*subtop
, int node
, int str_idx
)
4344 re_sub_match_last_t
*new_entry
;
4345 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4347 int new_alasts
= 2 * subtop
->alasts
+ 1;
4348 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4349 re_sub_match_last_t
*,
4351 if (BE (new_array
== NULL
, 0))
4353 subtop
->lasts
= new_array
;
4354 subtop
->alasts
= new_alasts
;
4356 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4357 if (BE (new_entry
!= NULL
, 1))
4359 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4360 new_entry
->node
= node
;
4361 new_entry
->str_idx
= str_idx
;
4369 sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
4370 re_dfastate_t
**limited_sts
, int last_node
, int last_str_idx
)
4372 sctx
->sifted_states
= sifted_sts
;
4373 sctx
->limited_states
= limited_sts
;
4374 sctx
->last_node
= last_node
;
4375 sctx
->last_str_idx
= last_str_idx
;
4376 re_node_set_init_empty (&sctx
->limits
);