1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 re_string_t
*input
, int n
);
23 static void match_ctx_clean (re_match_context_t
*mctx
);
24 static void match_ctx_free (re_match_context_t
*cache
);
25 static void match_ctx_free_subtops (re_match_context_t
*mctx
);
26 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
27 int str_idx
, int from
, int to
);
28 static int search_cur_bkref_entry (re_match_context_t
*mctx
, int str_idx
);
29 static void match_ctx_clear_flag (re_match_context_t
*mctx
);
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
);
34 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
35 re_dfastate_t
**limited_sts
, int last_node
,
36 int last_str_idx
, int check_subexp
);
37 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
38 const char *string
, int length
,
39 int start
, int range
, int stop
,
40 size_t nmatch
, regmatch_t pmatch
[],
42 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
43 const char *string1
, int length1
,
44 const char *string2
, int length2
,
45 int start
, int range
, struct re_registers
*regs
,
46 int stop
, int ret_len
);
47 static int re_search_stub (struct re_pattern_buffer
*bufp
,
48 const char *string
, int length
, int start
,
49 int range
, int stop
, struct re_registers
*regs
,
51 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
52 int nregs
, int regs_allocated
);
53 static inline re_dfastate_t
*acquire_init_state_context
54 (reg_errcode_t
*err
, const regex_t
*preg
, const re_match_context_t
*mctx
,
55 int idx
) __attribute ((always_inline
));
56 static reg_errcode_t
prune_impossible_nodes (const regex_t
*preg
,
57 re_match_context_t
*mctx
);
58 static int check_matching (const regex_t
*preg
, re_match_context_t
*mctx
,
59 int fl_longest_match
);
60 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
61 unsigned int context
);
62 static int check_halt_state_context (const regex_t
*preg
,
63 const re_dfastate_t
*state
,
64 const re_match_context_t
*mctx
, int idx
);
65 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
, int cur_node
,
66 int cur_idx
, int nmatch
);
67 static int proceed_next_node (const regex_t
*preg
, int nregs
, regmatch_t
*regs
,
68 const re_match_context_t
*mctx
,
69 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
70 struct re_fail_stack_t
*fs
);
71 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
72 int str_idx
, int *dests
, int nregs
,
74 re_node_set
*eps_via_nodes
);
75 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
76 regmatch_t
*regs
, re_node_set
*eps_via_nodes
);
77 static reg_errcode_t
set_regs (const regex_t
*preg
,
78 const re_match_context_t
*mctx
,
79 size_t nmatch
, regmatch_t
*pmatch
,
81 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
);
84 static int sift_states_iter_mb (const regex_t
*preg
,
85 const re_match_context_t
*mctx
,
86 re_sift_context_t
*sctx
,
87 int node_idx
, int str_idx
, int max_str_idx
);
88 #endif /* RE_ENABLE_I18N */
89 static reg_errcode_t
sift_states_backward (const regex_t
*preg
,
90 re_match_context_t
*mctx
,
91 re_sift_context_t
*sctx
);
92 static reg_errcode_t
update_cur_sifted_state (const regex_t
*preg
,
93 re_match_context_t
*mctx
,
94 re_sift_context_t
*sctx
,
96 re_node_set
*dest_nodes
);
97 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
98 re_node_set
*dest_nodes
,
99 const re_node_set
*candidates
);
100 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
101 re_node_set
*dest_nodes
,
102 const re_node_set
*and_nodes
);
103 static int check_dst_limits (re_dfa_t
*dfa
, re_node_set
*limits
,
104 re_match_context_t
*mctx
, int dst_node
,
105 int dst_idx
, int src_node
, int src_idx
);
106 static int check_dst_limits_calc_pos (re_dfa_t
*dfa
, re_match_context_t
*mctx
,
107 int limit
, re_node_set
*eclosures
,
108 int subexp_idx
, int node
, int str_idx
);
109 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
110 re_node_set
*dest_nodes
,
111 const re_node_set
*candidates
,
113 struct re_backref_cache_entry
*bkref_ents
,
115 static reg_errcode_t
sift_states_bkref (const regex_t
*preg
,
116 re_match_context_t
*mctx
,
117 re_sift_context_t
*sctx
,
118 int str_idx
, re_node_set
*dest_nodes
);
119 static reg_errcode_t
clean_state_log_if_need (re_match_context_t
*mctx
,
120 int next_state_log_idx
);
121 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
122 re_dfastate_t
**src
, int num
);
123 static re_dfastate_t
*transit_state (reg_errcode_t
*err
, const regex_t
*preg
,
124 re_match_context_t
*mctx
,
125 re_dfastate_t
*state
);
126 static reg_errcode_t
check_subexp_matching_top (re_dfa_t
*dfa
,
127 re_match_context_t
*mctx
,
128 re_node_set
*cur_nodes
,
131 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
, const regex_t
*preg
,
132 re_dfastate_t
*pstate
,
133 re_match_context_t
*mctx
);
135 #ifdef RE_ENABLE_I18N
136 static reg_errcode_t
transit_state_mb (const regex_t
*preg
,
137 re_dfastate_t
*pstate
,
138 re_match_context_t
*mctx
);
139 #endif /* RE_ENABLE_I18N */
140 static reg_errcode_t
transit_state_bkref (const regex_t
*preg
,
141 const re_node_set
*nodes
,
142 re_match_context_t
*mctx
);
143 static reg_errcode_t
get_subexp (const regex_t
*preg
, re_match_context_t
*mctx
,
144 int bkref_node
, int bkref_str_idx
);
145 static reg_errcode_t
get_subexp_sub (const regex_t
*preg
,
146 re_match_context_t
*mctx
,
147 const re_sub_match_top_t
*sub_top
,
148 re_sub_match_last_t
*sub_last
,
149 int bkref_node
, int bkref_str
);
150 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
151 int subexp_idx
, int type
);
152 static reg_errcode_t
check_arrival (const regex_t
*preg
,
153 re_match_context_t
*mctx
,
154 state_array_t
*path
, int top_node
,
155 int top_str
, int last_node
, int last_str
,
157 static reg_errcode_t
check_arrival_add_next_nodes (const regex_t
*preg
,
159 re_match_context_t
*mctx
,
161 re_node_set
*cur_nodes
,
162 re_node_set
*next_nodes
);
163 static reg_errcode_t
check_arrival_expand_ecl (re_dfa_t
*dfa
,
164 re_node_set
*cur_nodes
,
165 int ex_subexp
, int type
);
166 static reg_errcode_t
check_arrival_expand_ecl_sub (re_dfa_t
*dfa
,
167 re_node_set
*dst_nodes
,
168 int target
, int ex_subexp
,
170 static reg_errcode_t
expand_bkref_cache (const regex_t
*preg
,
171 re_match_context_t
*mctx
,
172 re_node_set
*cur_nodes
, int cur_str
,
173 int last_str
, int subexp_num
,
175 static re_dfastate_t
**build_trtable (const regex_t
*dfa
,
176 re_dfastate_t
*state
);
177 #ifdef RE_ENABLE_I18N
178 static int check_node_accept_bytes (const regex_t
*preg
, int node_idx
,
179 const re_string_t
*input
, int idx
);
181 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
184 #endif /* RE_ENABLE_I18N */
185 static int group_nodes_into_DFAstates (const regex_t
*dfa
,
186 const re_dfastate_t
*state
,
187 re_node_set
*states_node
,
189 static int check_node_accept (const regex_t
*preg
, const re_token_t
*node
,
190 const re_match_context_t
*mctx
, int idx
);
191 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
);
193 /* Entry point for POSIX code. */
195 /* regexec searches for a given pattern, specified by PREG, in the
198 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
199 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
200 least NMATCH elements, and we set them to the offsets of the
201 corresponding matched substrings.
203 EFLAGS specifies `execution flags' which affect matching: if
204 REG_NOTBOL is set, then ^ does not match at the beginning of the
205 string; if REG_NOTEOL is set, then $ does not match at the end.
207 We return 0 if we find a match and REG_NOMATCH if not. */
210 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
211 const regex_t
*__restrict preg
;
212 const char *__restrict string
;
218 int length
= strlen (string
);
220 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, 0,
223 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, nmatch
,
225 return err
!= REG_NOERROR
;
228 weak_alias (__regexec
, regexec
)
231 /* Entry points for GNU code. */
233 /* re_match, re_search, re_match_2, re_search_2
235 The former two functions operate on STRING with length LENGTH,
236 while the later two operate on concatenation of STRING1 and STRING2
237 with lengths LENGTH1 and LENGTH2, respectively.
239 re_match() matches the compiled pattern in BUFP against the string,
240 starting at index START.
242 re_search() first tries matching at index START, then it tries to match
243 starting from index START + 1, and so on. The last start position tried
244 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
247 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
248 the first STOP characters of the concatenation of the strings should be
251 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
252 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
253 computed relative to the concatenation, not relative to the individual
256 On success, re_match* functions return the length of the match, re_search*
257 return the position of the start of the match. Return value -1 means no
258 match was found and -2 indicates an internal error. */
261 re_match (bufp
, string
, length
, start
, regs
)
262 struct re_pattern_buffer
*bufp
;
265 struct re_registers
*regs
;
267 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
270 weak_alias (__re_match
, re_match
)
274 re_search (bufp
, string
, length
, start
, range
, regs
)
275 struct re_pattern_buffer
*bufp
;
277 int length
, start
, range
;
278 struct re_registers
*regs
;
280 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
283 weak_alias (__re_search
, re_search
)
287 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
288 struct re_pattern_buffer
*bufp
;
289 const char *string1
, *string2
;
290 int length1
, length2
, start
, stop
;
291 struct re_registers
*regs
;
293 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
294 start
, 0, regs
, stop
, 1);
297 weak_alias (__re_match_2
, re_match_2
)
301 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
302 struct re_pattern_buffer
*bufp
;
303 const char *string1
, *string2
;
304 int length1
, length2
, start
, range
, stop
;
305 struct re_registers
*regs
;
307 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
308 start
, range
, regs
, stop
, 0);
311 weak_alias (__re_search_2
, re_search_2
)
315 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
317 struct re_pattern_buffer
*bufp
;
318 const char *string1
, *string2
;
319 int length1
, length2
, start
, range
, stop
, ret_len
;
320 struct re_registers
*regs
;
324 int len
= length1
+ length2
;
327 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
330 /* Concatenate the strings. */
334 char *s
= re_malloc (char, len
);
336 if (BE (s
== NULL
, 0))
338 memcpy (s
, string1
, length1
);
339 memcpy (s
+ length1
, string2
, length2
);
348 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
351 re_free ((char *) str
);
355 /* The parameters have the same meaning as those of re_search.
356 Additional parameters:
357 If RET_LEN is nonzero the length of the match is returned (re_match style);
358 otherwise the position of the match is returned. */
361 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
362 struct re_pattern_buffer
*bufp
;
364 int length
, start
, range
, stop
, ret_len
;
365 struct re_registers
*regs
;
367 reg_errcode_t result
;
372 /* Check for out-of-range. */
373 if (BE (start
< 0 || start
> length
, 0))
375 if (BE (start
+ range
> length
, 0))
376 range
= length
- start
;
377 else if (BE (start
+ range
< 0, 0))
380 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
381 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
383 /* Compile fastmap if we haven't yet. */
384 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
385 re_compile_fastmap (bufp
);
387 if (BE (bufp
->no_sub
, 0))
390 /* We need at least 1 register. */
393 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
394 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
396 nregs
= regs
->num_regs
;
397 if (BE (nregs
< 1, 0))
399 /* Nothing can be copied to regs. */
405 nregs
= bufp
->re_nsub
+ 1;
406 pmatch
= re_malloc (regmatch_t
, nregs
);
407 if (BE (pmatch
== NULL
, 0))
410 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
411 nregs
, pmatch
, eflags
);
415 /* I hope we needn't fill ther regs with -1's when no match was found. */
416 if (result
!= REG_NOERROR
)
418 else if (regs
!= NULL
)
420 /* If caller wants register contents data back, copy them. */
421 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
422 bufp
->regs_allocated
);
423 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
427 if (BE (rval
== 0, 1))
431 assert (pmatch
[0].rm_so
== start
);
432 rval
= pmatch
[0].rm_eo
- start
;
435 rval
= pmatch
[0].rm_so
;
442 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
443 struct re_registers
*regs
;
445 int nregs
, regs_allocated
;
447 int rval
= REGS_REALLOCATE
;
449 int need_regs
= nregs
+ 1;
450 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
453 /* Have the register data arrays been allocated? */
454 if (regs_allocated
== REGS_UNALLOCATED
)
455 { /* No. So allocate them with malloc. */
456 regs
->start
= re_malloc (regoff_t
, need_regs
);
457 if (BE (regs
->start
== NULL
, 0))
458 return REGS_UNALLOCATED
;
459 regs
->end
= re_malloc (regoff_t
, need_regs
);
460 if (BE (regs
->end
== NULL
, 0))
462 re_free (regs
->start
);
463 return REGS_UNALLOCATED
;
465 regs
->num_regs
= need_regs
;
467 else if (regs_allocated
== REGS_REALLOCATE
)
468 { /* Yes. If we need more elements than were already
469 allocated, reallocate them. If we need fewer, just
471 if (BE (need_regs
> regs
->num_regs
, 0))
473 regs
->start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
474 if (BE (regs
->start
== NULL
, 0))
476 if (regs
->end
!= NULL
)
478 return REGS_UNALLOCATED
;
480 regs
->end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
481 if (BE (regs
->end
== NULL
, 0))
483 re_free (regs
->start
);
484 return REGS_UNALLOCATED
;
486 regs
->num_regs
= need_regs
;
491 assert (regs_allocated
== REGS_FIXED
);
492 /* This function may not be called with REGS_FIXED and nregs too big. */
493 assert (regs
->num_regs
>= nregs
);
498 for (i
= 0; i
< nregs
; ++i
)
500 regs
->start
[i
] = pmatch
[i
].rm_so
;
501 regs
->end
[i
] = pmatch
[i
].rm_eo
;
503 for ( ; i
< regs
->num_regs
; ++i
)
504 regs
->start
[i
] = regs
->end
[i
] = -1;
509 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
510 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
511 this memory for recording register information. STARTS and ENDS
512 must be allocated using the malloc library routine, and must each
513 be at least NUM_REGS * sizeof (regoff_t) bytes long.
515 If NUM_REGS == 0, then subsequent matches should allocate their own
518 Unless this function is called, the first search or match using
519 PATTERN_BUFFER will allocate its own register data, without
520 freeing the old data. */
523 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
524 struct re_pattern_buffer
*bufp
;
525 struct re_registers
*regs
;
527 regoff_t
*starts
, *ends
;
531 bufp
->regs_allocated
= REGS_REALLOCATE
;
532 regs
->num_regs
= num_regs
;
533 regs
->start
= starts
;
538 bufp
->regs_allocated
= REGS_UNALLOCATED
;
540 regs
->start
= regs
->end
= (regoff_t
*) 0;
544 weak_alias (__re_set_registers
, re_set_registers
)
547 /* Entry points compatible with 4.2 BSD regex library. We don't define
548 them unless specifically requested. */
550 #if defined _REGEX_RE_COMP || defined _LIBC
558 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
560 #endif /* _REGEX_RE_COMP */
562 static re_node_set empty_set
;
564 /* Internal entry point. */
566 /* Searches for a compiled pattern PREG in the string STRING, whose
567 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
568 mingings with regexec. START, and RANGE have the same meanings
570 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
571 otherwise return the error code.
572 Note: We assume front end functions already check ranges.
573 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
576 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
580 int length
, start
, range
, stop
, eflags
;
585 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
587 int left_lim
, right_lim
, incr
;
588 int fl_longest_match
, match_first
, match_last
= -1;
589 int fast_translate
, sb
;
590 re_match_context_t mctx
;
591 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
592 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
);
594 /* Check if the DFA haven't been compiled. */
595 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
596 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
597 || dfa
->init_state_begbuf
== NULL
, 0))
601 /* We assume front-end functions already check them. */
602 assert (start
+ range
>= 0 && start
+ range
<= length
);
605 /* If initial states with non-begbuf contexts have no elements,
606 the regex must be anchored. If preg->newline_anchor is set,
607 we'll never use init_state_nl, so do not check it. */
608 if (dfa
->init_state
->nodes
.nelem
== 0
609 && dfa
->init_state_word
->nodes
.nelem
== 0
610 && (dfa
->init_state_nl
->nodes
.nelem
== 0
611 || !preg
->newline_anchor
))
613 if (start
!= 0 && start
+ range
!= 0)
618 re_node_set_init_empty (&empty_set
);
619 memset (&mctx
, '\0', sizeof (re_match_context_t
));
621 /* We must check the longest matching, if nmatch > 0. */
622 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
624 err
= re_string_allocate (&input
, string
, length
, dfa
->nodes_len
+ 1,
625 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
626 if (BE (err
!= REG_NOERROR
, 0))
629 input
.raw_stop
= stop
;
631 err
= match_ctx_init (&mctx
, eflags
, &input
, dfa
->nbackref
* 2);
632 if (BE (err
!= REG_NOERROR
, 0))
635 /* We will log all the DFA states through which the dfa pass,
636 if nmatch > 1, or this dfa has "multibyte node", which is a
637 back-reference or a node which can accept multibyte character or
638 multi character collating element. */
639 if (nmatch
> 1 || dfa
->has_mb_node
)
641 mctx
.state_log
= re_malloc (re_dfastate_t
*, input
.bufs_len
+ 1);
642 if (BE (mctx
.state_log
== NULL
, 0))
649 mctx
.state_log
= NULL
;
652 input
.tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
653 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
655 /* Check incrementally whether of not the input string match. */
656 incr
= (range
< 0) ? -1 : 1;
657 left_lim
= (range
< 0) ? start
+ range
: start
;
658 right_lim
= (range
< 0) ? start
: start
+ range
;
659 sb
= dfa
->mb_cur_max
== 1;
660 fast_translate
= sb
|| !(preg
->syntax
& RE_ICASE
|| preg
->translate
);
664 /* At first get the current byte from input string. */
667 if (BE (fast_translate
, 1))
669 unsigned RE_TRANSLATE_TYPE t
670 = (unsigned RE_TRANSLATE_TYPE
) preg
->translate
;
671 if (BE (range
>= 0, 1))
673 if (BE (t
!= NULL
, 0))
675 while (BE (match_first
< right_lim
, 1)
676 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
681 while (BE (match_first
< right_lim
, 1)
682 && !fastmap
[(unsigned char) string
[match_first
]])
685 if (BE (match_first
== right_lim
, 0))
687 int ch
= match_first
>= length
688 ? 0 : (unsigned char) string
[match_first
];
689 if (!fastmap
[t
? t
[ch
] : ch
])
695 while (match_first
>= left_lim
)
697 int ch
= match_first
>= length
698 ? 0 : (unsigned char) string
[match_first
];
699 if (fastmap
[t
? t
[ch
] : ch
])
703 if (match_first
< left_lim
)
713 /* In this case, we can't determine easily the current byte,
714 since it might be a component byte of a multibyte
715 character. Then we use the constructed buffer
717 /* If MATCH_FIRST is out of the valid range, reconstruct the
719 if (input
.raw_mbs_idx
+ input
.valid_raw_len
<= match_first
720 || match_first
< input
.raw_mbs_idx
)
722 err
= re_string_reconstruct (&input
, match_first
, eflags
,
723 preg
->newline_anchor
);
724 if (BE (err
!= REG_NOERROR
, 0))
727 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
728 Note that MATCH_FIRST must not be smaller than 0. */
729 ch
= ((match_first
>= length
) ? 0
730 : re_string_byte_at (&input
,
731 match_first
- input
.raw_mbs_idx
));
736 while (match_first
>= left_lim
&& match_first
<= right_lim
);
742 /* Reconstruct the buffers so that the matcher can assume that
743 the matching starts from the beginning of the buffer. */
744 err
= re_string_reconstruct (&input
, match_first
, eflags
,
745 preg
->newline_anchor
);
746 if (BE (err
!= REG_NOERROR
, 0))
748 #ifdef RE_ENABLE_I18N
749 /* Eliminate it when it is a component of a multibyte character
750 and isn't the head of a multibyte character. */
751 if (sb
|| re_string_first_byte (&input
, 0))
754 /* It seems to be appropriate one, then use the matcher. */
755 /* We assume that the matching starts from 0. */
756 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
757 match_last
= check_matching (preg
, &mctx
, fl_longest_match
);
758 if (match_last
!= -1)
760 if (BE (match_last
== -2, 0))
767 mctx
.match_last
= match_last
;
768 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
770 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
771 mctx
.last_node
= check_halt_state_context (preg
, pstate
,
774 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
777 err
= prune_impossible_nodes (preg
, &mctx
);
778 if (err
== REG_NOERROR
)
780 if (BE (err
!= REG_NOMATCH
, 0))
785 break; /* We found a match. */
788 match_ctx_clean (&mctx
);
790 /* Update counter. */
792 if (match_first
< left_lim
|| right_lim
< match_first
)
796 /* Set pmatch[] if we need. */
797 if (match_last
!= -1 && nmatch
> 0)
801 /* Initialize registers. */
802 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
803 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
805 /* Set the points where matching start/end. */
807 pmatch
[0].rm_eo
= mctx
.match_last
;
809 if (!preg
->no_sub
&& nmatch
> 1)
811 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
812 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
813 if (BE (err
!= REG_NOERROR
, 0))
817 /* At last, add the offset to the each registers, since we slided
818 the buffers so that we could assume that the matching starts
820 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
821 if (pmatch
[reg_idx
].rm_so
!= -1)
823 #ifdef RE_ENABLE_I18N
824 if (BE (input
.offsets_needed
!= 0, 0))
826 if (pmatch
[reg_idx
].rm_so
== input
.valid_len
)
827 pmatch
[reg_idx
].rm_so
+= input
.valid_raw_len
- input
.valid_len
;
829 pmatch
[reg_idx
].rm_so
= input
.offsets
[pmatch
[reg_idx
].rm_so
];
830 if (pmatch
[reg_idx
].rm_eo
== input
.valid_len
)
831 pmatch
[reg_idx
].rm_eo
+= input
.valid_raw_len
- input
.valid_len
;
833 pmatch
[reg_idx
].rm_eo
= input
.offsets
[pmatch
[reg_idx
].rm_eo
];
836 assert (input
.offsets_needed
== 0);
838 pmatch
[reg_idx
].rm_so
+= match_first
;
839 pmatch
[reg_idx
].rm_eo
+= match_first
;
842 err
= (match_last
== -1) ? REG_NOMATCH
: REG_NOERROR
;
844 re_free (mctx
.state_log
);
846 match_ctx_free (&mctx
);
847 re_string_destruct (&input
);
852 prune_impossible_nodes (preg
, mctx
)
854 re_match_context_t
*mctx
;
856 int halt_node
, match_last
;
858 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
859 re_dfastate_t
**sifted_states
;
860 re_dfastate_t
**lim_states
= NULL
;
861 re_sift_context_t sctx
;
863 assert (mctx
->state_log
!= NULL
);
865 match_last
= mctx
->match_last
;
866 halt_node
= mctx
->last_node
;
867 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
868 if (BE (sifted_states
== NULL
, 0))
875 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
876 if (BE (lim_states
== NULL
, 0))
883 memset (lim_states
, '\0',
884 sizeof (re_dfastate_t
*) * (match_last
+ 1));
885 match_ctx_clear_flag (mctx
);
886 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
888 ret
= sift_states_backward (preg
, mctx
, &sctx
);
889 re_node_set_free (&sctx
.limits
);
890 if (BE (ret
!= REG_NOERROR
, 0))
892 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
902 } while (mctx
->state_log
[match_last
] == NULL
903 || !mctx
->state_log
[match_last
]->halt
);
904 halt_node
= check_halt_state_context (preg
,
905 mctx
->state_log
[match_last
],
908 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
910 re_free (lim_states
);
912 if (BE (ret
!= REG_NOERROR
, 0))
917 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
919 ret
= sift_states_backward (preg
, mctx
, &sctx
);
920 re_node_set_free (&sctx
.limits
);
921 if (BE (ret
!= REG_NOERROR
, 0))
924 re_free (mctx
->state_log
);
925 mctx
->state_log
= sifted_states
;
926 sifted_states
= NULL
;
927 mctx
->last_node
= halt_node
;
928 mctx
->match_last
= match_last
;
931 re_free (sifted_states
);
932 re_free (lim_states
);
936 /* Acquire an initial state and return it.
937 We must select appropriate initial state depending on the context,
938 since initial states may have constraints like "\<", "^", etc.. */
940 static inline re_dfastate_t
*
941 acquire_init_state_context (err
, preg
, mctx
, idx
)
944 const re_match_context_t
*mctx
;
947 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
950 if (dfa
->init_state
->has_constraint
)
952 unsigned int context
;
953 context
= re_string_context_at (mctx
->input
, idx
- 1, mctx
->eflags
,
954 preg
->newline_anchor
);
955 if (IS_WORD_CONTEXT (context
))
956 return dfa
->init_state_word
;
957 else if (IS_ORDINARY_CONTEXT (context
))
958 return dfa
->init_state
;
959 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
960 return dfa
->init_state_begbuf
;
961 else if (IS_NEWLINE_CONTEXT (context
))
962 return dfa
->init_state_nl
;
963 else if (IS_BEGBUF_CONTEXT (context
))
965 /* It is relatively rare case, then calculate on demand. */
966 return re_acquire_state_context (err
, dfa
,
967 dfa
->init_state
->entrance_nodes
,
971 /* Must not happen? */
972 return dfa
->init_state
;
975 return dfa
->init_state
;
978 /* Check whether the regular expression match input string INPUT or not,
979 and return the index where the matching end, return -1 if not match,
980 or return -2 in case of an error.
981 FL_LONGEST_MATCH means we want the POSIX longest matching.
982 Note that the matcher assume that the maching starts from the current
983 index of the buffer. */
986 check_matching (preg
, mctx
, fl_longest_match
)
988 re_match_context_t
*mctx
;
989 int fl_longest_match
;
991 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
995 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
996 re_dfastate_t
*cur_state
;
998 cur_state
= acquire_init_state_context (&err
, preg
, mctx
, cur_str_idx
);
999 /* An initial state must not be NULL(invalid state). */
1000 if (BE (cur_state
== NULL
, 0))
1002 if (mctx
->state_log
!= NULL
)
1003 mctx
->state_log
[cur_str_idx
] = cur_state
;
1005 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1006 later. E.g. Processing back references. */
1007 if (BE (dfa
->nbackref
, 0))
1009 err
= check_subexp_matching_top (dfa
, mctx
, &cur_state
->nodes
, 0);
1010 if (BE (err
!= REG_NOERROR
, 0))
1013 if (cur_state
->has_backref
)
1015 err
= transit_state_bkref (preg
, &cur_state
->nodes
, mctx
);
1016 if (BE (err
!= REG_NOERROR
, 0))
1021 /* If the RE accepts NULL string. */
1022 if (BE (cur_state
->halt
, 0))
1024 if (!cur_state
->has_constraint
1025 || check_halt_state_context (preg
, cur_state
, mctx
, cur_str_idx
))
1027 if (!fl_longest_match
)
1031 match_last
= cur_str_idx
;
1037 while (!re_string_eoi (mctx
->input
))
1039 cur_state
= transit_state (&err
, preg
, mctx
, cur_state
);
1040 if (cur_state
== NULL
) /* Reached at the invalid state or an error. */
1042 cur_str_idx
= re_string_cur_idx (mctx
->input
);
1043 if (BE (err
!= REG_NOERROR
, 0))
1045 if (!fl_longest_match
&& match
)
1049 if (mctx
->state_log
== NULL
)
1053 int max
= mctx
->state_log_top
;
1054 for (; cur_str_idx
<= max
; ++cur_str_idx
)
1055 if (mctx
->state_log
[cur_str_idx
] != NULL
)
1057 if (cur_str_idx
> max
)
1063 if (cur_state
!= NULL
&& cur_state
->halt
)
1065 /* Reached at a halt state.
1066 Check the halt state can satisfy the current context. */
1067 if (!cur_state
->has_constraint
1068 || check_halt_state_context (preg
, cur_state
, mctx
,
1069 re_string_cur_idx (mctx
->input
)))
1071 /* We found an appropriate halt state. */
1072 match_last
= re_string_cur_idx (mctx
->input
);
1074 if (!fl_longest_match
)
1082 /* Check NODE match the current context. */
1084 static int check_halt_node_context (dfa
, node
, context
)
1085 const re_dfa_t
*dfa
;
1087 unsigned int context
;
1089 re_token_type_t type
= dfa
->nodes
[node
].type
;
1090 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1091 if (type
!= END_OF_RE
)
1095 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1100 /* Check the halt state STATE match the current context.
1101 Return 0 if not match, if the node, STATE has, is a halt node and
1102 match the context, return the node. */
1105 check_halt_state_context (preg
, state
, mctx
, idx
)
1106 const regex_t
*preg
;
1107 const re_dfastate_t
*state
;
1108 const re_match_context_t
*mctx
;
1111 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1113 unsigned int context
;
1115 assert (state
->halt
);
1117 context
= re_string_context_at (mctx
->input
, idx
, mctx
->eflags
,
1118 preg
->newline_anchor
);
1119 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1120 if (check_halt_node_context (dfa
, state
->nodes
.elems
[i
], context
))
1121 return state
->nodes
.elems
[i
];
1125 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1126 corresponding to the DFA).
1127 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1131 proceed_next_node (preg
, nregs
, regs
, mctx
, pidx
, node
, eps_via_nodes
, fs
)
1132 const regex_t
*preg
;
1134 const re_match_context_t
*mctx
;
1135 int nregs
, *pidx
, node
;
1136 re_node_set
*eps_via_nodes
;
1137 struct re_fail_stack_t
*fs
;
1139 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1140 int i
, err
, dest_node
;
1142 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1144 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1145 int ndest
, dest_nodes
[2];
1146 err
= re_node_set_insert (eps_via_nodes
, node
);
1147 if (BE (err
< 0, 0))
1149 /* Pick up valid destinations. */
1150 for (ndest
= 0, i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1152 int candidate
= dfa
->edests
[node
].elems
[i
];
1153 if (!re_node_set_contains (cur_nodes
, candidate
))
1155 dest_nodes
[0] = (ndest
== 0) ? candidate
: dest_nodes
[0];
1156 dest_nodes
[1] = (ndest
== 1) ? candidate
: dest_nodes
[1];
1160 return ndest
== 0 ? -1 : (ndest
== 1 ? dest_nodes
[0] : 0);
1161 /* In order to avoid infinite loop like "(a*)*". */
1162 if (re_node_set_contains (eps_via_nodes
, dest_nodes
[0]))
1163 return dest_nodes
[1];
1165 push_fail_stack (fs
, *pidx
, dest_nodes
, nregs
, regs
, eps_via_nodes
);
1166 return dest_nodes
[0];
1171 re_token_type_t type
= dfa
->nodes
[node
].type
;
1173 #ifdef RE_ENABLE_I18N
1174 if (ACCEPT_MB_NODE (type
))
1175 naccepted
= check_node_accept_bytes (preg
, node
, mctx
->input
, *pidx
);
1177 #endif /* RE_ENABLE_I18N */
1178 if (type
== OP_BACK_REF
)
1180 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
;
1181 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1184 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1188 char *buf
= (char *) re_string_get_buffer (mctx
->input
);
1189 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1197 err
= re_node_set_insert (eps_via_nodes
, node
);
1198 if (BE (err
< 0, 0))
1200 dest_node
= dfa
->edests
[node
].elems
[0];
1201 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1208 || check_node_accept (preg
, dfa
->nodes
+ node
, mctx
, *pidx
))
1210 dest_node
= dfa
->nexts
[node
];
1211 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1212 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1213 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1216 re_node_set_empty (eps_via_nodes
);
1223 static reg_errcode_t
1224 push_fail_stack (fs
, str_idx
, dests
, nregs
, regs
, eps_via_nodes
)
1225 struct re_fail_stack_t
*fs
;
1226 int str_idx
, *dests
, nregs
;
1228 re_node_set
*eps_via_nodes
;
1231 int num
= fs
->num
++;
1232 if (fs
->num
== fs
->alloc
)
1234 struct re_fail_stack_ent_t
*new_array
;
1236 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1238 if (new_array
== NULL
)
1240 fs
->stack
= new_array
;
1242 fs
->stack
[num
].idx
= str_idx
;
1243 fs
->stack
[num
].node
= dests
[1];
1244 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1245 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1246 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1251 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1252 struct re_fail_stack_t
*fs
;
1255 re_node_set
*eps_via_nodes
;
1257 int num
= --fs
->num
;
1259 *pidx
= fs
->stack
[num
].idx
;
1260 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1261 re_node_set_free (eps_via_nodes
);
1262 re_free (fs
->stack
[num
].regs
);
1263 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1264 return fs
->stack
[num
].node
;
1267 /* Set the positions where the subexpressions are starts/ends to registers
1269 Note: We assume that pmatch[0] is already set, and
1270 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1272 static reg_errcode_t
1273 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1274 const regex_t
*preg
;
1275 const re_match_context_t
*mctx
;
1280 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1281 int idx
, cur_node
, real_nmatch
;
1282 re_node_set eps_via_nodes
;
1283 struct re_fail_stack_t
*fs
;
1284 struct re_fail_stack_t fs_body
= {0, 2, NULL
};
1286 assert (nmatch
> 1);
1287 assert (mctx
->state_log
!= NULL
);
1292 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1296 cur_node
= dfa
->init_node
;
1297 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1298 re_node_set_init_empty (&eps_via_nodes
);
1299 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1301 update_regs (dfa
, pmatch
, cur_node
, idx
, real_nmatch
);
1302 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1307 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1308 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1310 if (reg_idx
== nmatch
)
1312 re_node_set_free (&eps_via_nodes
);
1313 return free_fail_stack_return (fs
);
1315 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1320 re_node_set_free (&eps_via_nodes
);
1325 /* Proceed to next node. */
1326 cur_node
= proceed_next_node (preg
, nmatch
, pmatch
, mctx
, &idx
, cur_node
,
1327 &eps_via_nodes
, fs
);
1329 if (BE (cur_node
< 0, 0))
1334 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1338 re_node_set_free (&eps_via_nodes
);
1343 re_node_set_free (&eps_via_nodes
);
1344 return free_fail_stack_return (fs
);
1347 static reg_errcode_t
1348 free_fail_stack_return (fs
)
1349 struct re_fail_stack_t
*fs
;
1354 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1356 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1357 re_free (fs
->stack
[fs_idx
].regs
);
1359 re_free (fs
->stack
);
1365 update_regs (dfa
, pmatch
, cur_node
, cur_idx
, nmatch
)
1368 int cur_node
, cur_idx
, nmatch
;
1370 int type
= dfa
->nodes
[cur_node
].type
;
1372 if (type
!= OP_OPEN_SUBEXP
&& type
!= OP_CLOSE_SUBEXP
)
1374 reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1375 if (reg_num
>= nmatch
)
1377 if (type
== OP_OPEN_SUBEXP
)
1379 /* We are at the first node of this sub expression. */
1380 pmatch
[reg_num
].rm_so
= cur_idx
;
1381 pmatch
[reg_num
].rm_eo
= -1;
1383 else if (type
== OP_CLOSE_SUBEXP
)
1384 /* We are at the first node of this sub expression. */
1385 pmatch
[reg_num
].rm_eo
= cur_idx
;
1388 #define NUMBER_OF_STATE 1
1390 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1391 and sift the nodes in each states according to the following rules.
1392 Updated state_log will be wrote to STATE_LOG.
1394 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1395 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1396 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1397 the LAST_NODE, we throw away the node `a'.
1398 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1399 string `s' and transit to `b':
1400 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1402 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1403 thrown away, we throw away the node `a'.
1404 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1405 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1407 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1408 we throw away the node `a'. */
1410 #define STATE_NODE_CONTAINS(state,node) \
1411 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1413 static reg_errcode_t
1414 sift_states_backward (preg
, mctx
, sctx
)
1415 const regex_t
*preg
;
1416 re_match_context_t
*mctx
;
1417 re_sift_context_t
*sctx
;
1420 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1422 int str_idx
= sctx
->last_str_idx
;
1423 re_node_set cur_dest
;
1424 re_node_set
*cur_src
; /* Points the state_log[str_idx]->nodes */
1427 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1429 cur_src
= &mctx
->state_log
[str_idx
]->nodes
;
1431 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1432 transit to the last_node and the last_node itself. */
1433 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1434 if (BE (err
!= REG_NOERROR
, 0))
1436 err
= update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, &cur_dest
);
1437 if (BE (err
!= REG_NOERROR
, 0))
1440 /* Then check each states in the state_log. */
1444 /* Update counters. */
1445 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1446 if (null_cnt
> mctx
->max_mb_elem_len
)
1448 memset (sctx
->sifted_states
, '\0',
1449 sizeof (re_dfastate_t
*) * str_idx
);
1450 re_node_set_free (&cur_dest
);
1453 re_node_set_empty (&cur_dest
);
1455 cur_src
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1456 : &mctx
->state_log
[str_idx
]->nodes
);
1458 /* Then build the next sifted state.
1459 We build the next sifted state on `cur_dest', and update
1460 `sifted_states[str_idx]' with `cur_dest'.
1462 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1463 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1464 for (i
= 0; i
< cur_src
->nelem
; i
++)
1466 int prev_node
= cur_src
->elems
[i
];
1468 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1470 if (IS_EPSILON_NODE (type
))
1472 #ifdef RE_ENABLE_I18N
1473 /* If the node may accept `multi byte'. */
1474 if (ACCEPT_MB_NODE (type
))
1475 naccepted
= sift_states_iter_mb (preg
, mctx
, sctx
, prev_node
,
1476 str_idx
, sctx
->last_str_idx
);
1478 #endif /* RE_ENABLE_I18N */
1479 /* We don't check backreferences here.
1480 See update_cur_sifted_state(). */
1483 && check_node_accept (preg
, dfa
->nodes
+ prev_node
, mctx
,
1485 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1486 dfa
->nexts
[prev_node
]))
1492 if (sctx
->limits
.nelem
)
1494 int to_idx
= str_idx
+ naccepted
;
1495 if (check_dst_limits (dfa
, &sctx
->limits
, mctx
,
1496 dfa
->nexts
[prev_node
], to_idx
,
1497 prev_node
, str_idx
))
1500 ret
= re_node_set_insert (&cur_dest
, prev_node
);
1501 if (BE (ret
== -1, 0))
1508 /* Add all the nodes which satisfy the following conditions:
1509 - It can epsilon transit to a node in CUR_DEST.
1511 And update state_log. */
1512 err
= update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, &cur_dest
);
1513 if (BE (err
!= REG_NOERROR
, 0))
1518 re_node_set_free (&cur_dest
);
1522 /* Helper functions. */
1524 static reg_errcode_t
1525 clean_state_log_if_need (mctx
, next_state_log_idx
)
1526 re_match_context_t
*mctx
;
1527 int next_state_log_idx
;
1529 int top
= mctx
->state_log_top
;
1531 if (next_state_log_idx
>= mctx
->input
->bufs_len
1532 || (next_state_log_idx
>= mctx
->input
->valid_len
1533 && mctx
->input
->valid_len
< mctx
->input
->len
))
1536 err
= extend_buffers (mctx
);
1537 if (BE (err
!= REG_NOERROR
, 0))
1541 if (top
< next_state_log_idx
)
1543 memset (mctx
->state_log
+ top
+ 1, '\0',
1544 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1545 mctx
->state_log_top
= next_state_log_idx
;
1550 static reg_errcode_t
1551 merge_state_array (dfa
, dst
, src
, num
)
1553 re_dfastate_t
**dst
;
1554 re_dfastate_t
**src
;
1559 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1561 if (dst
[st_idx
] == NULL
)
1562 dst
[st_idx
] = src
[st_idx
];
1563 else if (src
[st_idx
] != NULL
)
1565 re_node_set merged_set
;
1566 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1567 &src
[st_idx
]->nodes
);
1568 if (BE (err
!= REG_NOERROR
, 0))
1570 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1571 re_node_set_free (&merged_set
);
1572 if (BE (err
!= REG_NOERROR
, 0))
1579 static reg_errcode_t
1580 update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1581 const regex_t
*preg
;
1582 re_match_context_t
*mctx
;
1583 re_sift_context_t
*sctx
;
1585 re_node_set
*dest_nodes
;
1588 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1589 const re_node_set
*candidates
;
1590 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1591 : &mctx
->state_log
[str_idx
]->nodes
);
1593 /* At first, add the nodes which can epsilon transit to a node in
1595 if (dest_nodes
->nelem
)
1597 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1598 if (BE (err
!= REG_NOERROR
, 0))
1602 /* Then, check the limitations in the current sift_context. */
1603 if (dest_nodes
->nelem
&& sctx
->limits
.nelem
)
1605 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1606 mctx
->bkref_ents
, str_idx
);
1607 if (BE (err
!= REG_NOERROR
, 0))
1611 /* Update state_log. */
1612 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1613 if (BE (sctx
->sifted_states
[str_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1616 if ((mctx
->state_log
[str_idx
] != NULL
1617 && mctx
->state_log
[str_idx
]->has_backref
))
1619 err
= sift_states_bkref (preg
, mctx
, sctx
, str_idx
, dest_nodes
);
1620 if (BE (err
!= REG_NOERROR
, 0))
1626 static reg_errcode_t
1627 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1629 re_node_set
*dest_nodes
;
1630 const re_node_set
*candidates
;
1634 re_node_set src_copy
;
1636 err
= re_node_set_init_copy (&src_copy
, dest_nodes
);
1637 if (BE (err
!= REG_NOERROR
, 0))
1639 for (src_idx
= 0; src_idx
< src_copy
.nelem
; ++src_idx
)
1641 err
= re_node_set_add_intersect (dest_nodes
, candidates
,
1643 + src_copy
.elems
[src_idx
]);
1644 if (BE (err
!= REG_NOERROR
, 0))
1646 re_node_set_free (&src_copy
);
1650 re_node_set_free (&src_copy
);
1654 static reg_errcode_t
1655 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1658 re_node_set
*dest_nodes
;
1659 const re_node_set
*candidates
;
1663 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1664 re_node_set except_nodes
;
1665 re_node_set_init_empty (&except_nodes
);
1666 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1668 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1669 if (cur_node
== node
)
1671 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1673 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1674 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1675 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1676 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1677 && re_node_set_contains (dest_nodes
, edst1
))
1679 && !re_node_set_contains (inv_eclosure
, edst2
)
1680 && re_node_set_contains (dest_nodes
, edst2
)))
1682 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1683 dfa
->inveclosures
+ cur_node
);
1684 if (BE (err
!= REG_NOERROR
, 0))
1686 re_node_set_free (&except_nodes
);
1692 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1694 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1695 if (!re_node_set_contains (&except_nodes
, cur_node
))
1697 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1698 re_node_set_remove_at (dest_nodes
, idx
);
1701 re_node_set_free (&except_nodes
);
1706 check_dst_limits (dfa
, limits
, mctx
, dst_node
, dst_idx
, src_node
, src_idx
)
1708 re_node_set
*limits
;
1709 re_match_context_t
*mctx
;
1710 int dst_node
, dst_idx
, src_node
, src_idx
;
1712 int lim_idx
, src_pos
, dst_pos
;
1714 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1717 struct re_backref_cache_entry
*ent
;
1718 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1719 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1721 dst_pos
= check_dst_limits_calc_pos (dfa
, mctx
, limits
->elems
[lim_idx
],
1722 dfa
->eclosures
+ dst_node
,
1723 subexp_idx
, dst_node
, dst_idx
);
1724 src_pos
= check_dst_limits_calc_pos (dfa
, mctx
, limits
->elems
[lim_idx
],
1725 dfa
->eclosures
+ src_node
,
1726 subexp_idx
, src_node
, src_idx
);
1729 <src> <dst> ( <subexp> )
1730 ( <subexp> ) <src> <dst>
1731 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1732 if (src_pos
== dst_pos
)
1733 continue; /* This is unrelated limitation. */
1741 check_dst_limits_calc_pos (dfa
, mctx
, limit
, eclosures
, subexp_idx
, from_node
,
1744 re_match_context_t
*mctx
;
1745 re_node_set
*eclosures
;
1746 int limit
, subexp_idx
, from_node
, str_idx
;
1748 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1751 /* If we are outside the range of the subexpression, return -1 or 1. */
1752 if (str_idx
< lim
->subexp_from
)
1755 if (lim
->subexp_to
< str_idx
)
1758 /* If we are within the subexpression, return 0. */
1759 if (str_idx
!= lim
->subexp_from
&& str_idx
!= lim
->subexp_to
)
1762 /* Else, we are on the boundary: examine the nodes on the epsilon
1764 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1766 int node
= eclosures
->elems
[node_idx
];
1767 switch (dfa
->nodes
[node
].type
)
1771 int bi
= search_cur_bkref_entry (mctx
, str_idx
);
1772 for (; bi
< mctx
->nbkref_ents
; ++bi
)
1774 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bi
;
1777 /* If this backreference goes beyond the point we're
1778 examining, don't go any further. */
1779 if (ent
->str_idx
> str_idx
)
1782 if (ent
->node
!= node
|| ent
->subexp_from
!= ent
->subexp_to
)
1785 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1786 OP_CLOSE_SUBEXP cases below. But, if the
1787 destination node is the same node as the source
1788 node, don't recurse because it would cause an
1789 infinite loop: a regex that exhibits this behavior
1791 dst
= dfa
->edests
[node
].elems
[0];
1792 if (dst
== from_node
)
1794 if (str_idx
== lim
->subexp_from
)
1796 else /* if (str_idx == lim->subexp_to) */
1800 cpos
= check_dst_limits_calc_pos (dfa
, mctx
, limit
,
1801 dfa
->eclosures
+ dst
,
1805 if (cpos
== -1 && str_idx
== lim
->subexp_from
)
1808 if (cpos
== 0 /* && str_idx == lim->lim->subexp_to */)
1814 case OP_OPEN_SUBEXP
:
1815 if (str_idx
== lim
->subexp_from
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1819 case OP_CLOSE_SUBEXP
:
1820 if (str_idx
== lim
->subexp_to
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1829 if (str_idx
== lim
->subexp_to
)
1835 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1836 which are against limitations from DEST_NODES. */
1838 static reg_errcode_t
1839 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
1841 re_node_set
*dest_nodes
;
1842 const re_node_set
*candidates
;
1843 re_node_set
*limits
;
1844 struct re_backref_cache_entry
*bkref_ents
;
1848 int node_idx
, lim_idx
;
1850 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1853 struct re_backref_cache_entry
*ent
;
1854 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
1856 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
1857 continue; /* This is unrelated limitation. */
1859 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1860 if (ent
->subexp_to
== str_idx
)
1864 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1866 int node
= dest_nodes
->elems
[node_idx
];
1867 re_token_type_t type
= dfa
->nodes
[node
].type
;
1868 if (type
== OP_OPEN_SUBEXP
1869 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1871 else if (type
== OP_CLOSE_SUBEXP
1872 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1876 /* Check the limitation of the open subexpression. */
1877 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1880 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
1882 if (BE (err
!= REG_NOERROR
, 0))
1886 /* Check the limitation of the close subexpression. */
1888 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1890 int node
= dest_nodes
->elems
[node_idx
];
1891 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
1893 && !re_node_set_contains (dfa
->eclosures
+ node
,
1896 /* It is against this limitation.
1897 Remove it form the current sifted state. */
1898 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
1900 if (BE (err
!= REG_NOERROR
, 0))
1906 else /* (ent->subexp_to != str_idx) */
1908 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1910 int node
= dest_nodes
->elems
[node_idx
];
1911 re_token_type_t type
= dfa
->nodes
[node
].type
;
1912 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
1914 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
1916 if ((type
== OP_CLOSE_SUBEXP
&& ent
->subexp_to
!= str_idx
)
1917 || (type
== OP_OPEN_SUBEXP
))
1919 /* It is against this limitation.
1920 Remove it form the current sifted state. */
1921 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
1923 if (BE (err
!= REG_NOERROR
, 0))
1933 static reg_errcode_t
1934 sift_states_bkref (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1935 const regex_t
*preg
;
1936 re_match_context_t
*mctx
;
1937 re_sift_context_t
*sctx
;
1939 re_node_set
*dest_nodes
;
1942 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1944 re_sift_context_t local_sctx
;
1945 const re_node_set
*candidates
;
1946 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1947 : &mctx
->state_log
[str_idx
]->nodes
);
1948 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
1950 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
1952 int cur_bkref_idx
= re_string_cur_idx (mctx
->input
);
1953 re_token_type_t type
;
1954 node
= candidates
->elems
[node_idx
];
1955 type
= dfa
->nodes
[node
].type
;
1956 if (node
== sctx
->cur_bkref
&& str_idx
== cur_bkref_idx
)
1958 /* Avoid infinite loop for the REs like "()\1+". */
1959 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
1961 if (type
== OP_BACK_REF
)
1963 int enabled_idx
= search_cur_bkref_entry (mctx
, str_idx
);
1964 for (; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
1966 int disabled_idx
, subexp_len
, to_idx
, dst_node
;
1967 struct re_backref_cache_entry
*entry
;
1968 entry
= mctx
->bkref_ents
+ enabled_idx
;
1969 if (entry
->str_idx
> str_idx
)
1971 if (entry
->node
!= node
)
1973 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
1974 to_idx
= str_idx
+ subexp_len
;
1975 dst_node
= (subexp_len
? dfa
->nexts
[node
]
1976 : dfa
->edests
[node
].elems
[0]);
1978 if (to_idx
> sctx
->last_str_idx
1979 || sctx
->sifted_states
[to_idx
] == NULL
1980 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
],
1982 || check_dst_limits (dfa
, &sctx
->limits
, mctx
, node
,
1983 str_idx
, dst_node
, to_idx
))
1986 re_dfastate_t
*cur_state
;
1988 for (disabled_idx
= enabled_idx
+ 1;
1989 disabled_idx
< mctx
->nbkref_ents
; ++disabled_idx
)
1991 struct re_backref_cache_entry
*entry2
;
1992 entry2
= mctx
->bkref_ents
+ disabled_idx
;
1993 if (entry2
->str_idx
> str_idx
)
1995 entry2
->flag
= (entry2
->node
== node
) ? 1 : entry2
->flag
;
1998 if (local_sctx
.sifted_states
== NULL
)
2001 err
= re_node_set_init_copy (&local_sctx
.limits
,
2003 if (BE (err
!= REG_NOERROR
, 0))
2006 local_sctx
.last_node
= node
;
2007 local_sctx
.last_str_idx
= str_idx
;
2008 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2009 if (BE (err
< 0, 0))
2014 cur_state
= local_sctx
.sifted_states
[str_idx
];
2015 err
= sift_states_backward (preg
, mctx
, &local_sctx
);
2016 if (BE (err
!= REG_NOERROR
, 0))
2018 if (sctx
->limited_states
!= NULL
)
2020 err
= merge_state_array (dfa
, sctx
->limited_states
,
2021 local_sctx
.sifted_states
,
2023 if (BE (err
!= REG_NOERROR
, 0))
2026 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2027 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2028 /* We must not use the variable entry here, since
2029 mctx->bkref_ents might be realloced. */
2030 mctx
->bkref_ents
[enabled_idx
].flag
= 1;
2033 enabled_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2034 for (; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
2036 struct re_backref_cache_entry
*entry
;
2037 entry
= mctx
->bkref_ents
+ enabled_idx
;
2038 if (entry
->str_idx
> str_idx
)
2040 if (entry
->node
== node
)
2047 if (local_sctx
.sifted_states
!= NULL
)
2049 re_node_set_free (&local_sctx
.limits
);
2056 #ifdef RE_ENABLE_I18N
2058 sift_states_iter_mb (preg
, mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2059 const regex_t
*preg
;
2060 const re_match_context_t
*mctx
;
2061 re_sift_context_t
*sctx
;
2062 int node_idx
, str_idx
, max_str_idx
;
2064 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2066 /* Check the node can accept `multi byte'. */
2067 naccepted
= check_node_accept_bytes (preg
, node_idx
, mctx
->input
, str_idx
);
2068 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2069 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2070 dfa
->nexts
[node_idx
]))
2071 /* The node can't accept the `multi byte', or the
2072 destination was already thrown away, then the node
2073 could't accept the current input `multi byte'. */
2075 /* Otherwise, it is sure that the node could accept
2076 `naccepted' bytes input. */
2079 #endif /* RE_ENABLE_I18N */
2082 /* Functions for state transition. */
2084 /* Return the next state to which the current state STATE will transit by
2085 accepting the current input byte, and update STATE_LOG if necessary.
2086 If STATE can accept a multibyte char/collating element/back reference
2087 update the destination of STATE_LOG. */
2089 static re_dfastate_t
*
2090 transit_state (err
, preg
, mctx
, state
)
2092 const regex_t
*preg
;
2093 re_match_context_t
*mctx
;
2094 re_dfastate_t
*state
;
2096 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2097 re_dfastate_t
**trtable
, *next_state
;
2101 if (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->bufs_len
2102 || (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->valid_len
2103 && mctx
->input
->valid_len
< mctx
->input
->len
))
2105 *err
= extend_buffers (mctx
);
2106 if (BE (*err
!= REG_NOERROR
, 0))
2114 re_string_skip_bytes (mctx
->input
, 1);
2118 #ifdef RE_ENABLE_I18N
2119 /* If the current state can accept multibyte. */
2120 if (state
->accept_mb
)
2122 *err
= transit_state_mb (preg
, state
, mctx
);
2123 if (BE (*err
!= REG_NOERROR
, 0))
2126 #endif /* RE_ENABLE_I18N */
2128 /* Then decide the next state with the single byte. */
2131 /* Use transition table */
2132 ch
= re_string_fetch_byte (mctx
->input
);
2133 trtable
= state
->trtable
;
2134 if (trtable
== NULL
)
2136 trtable
= build_trtable (preg
, state
);
2137 if (trtable
== NULL
)
2143 if (BE (state
->word_trtable
, 0))
2145 unsigned int context
;
2147 = re_string_context_at (mctx
->input
,
2148 re_string_cur_idx (mctx
->input
) - 1,
2149 mctx
->eflags
, preg
->newline_anchor
);
2150 if (IS_WORD_CONTEXT (context
))
2151 next_state
= trtable
[ch
+ SBC_MAX
];
2153 next_state
= trtable
[ch
];
2156 next_state
= trtable
[ch
];
2161 /* don't use transition table */
2162 next_state
= transit_state_sb (err
, preg
, state
, mctx
);
2163 if (BE (next_state
== NULL
&& err
!= REG_NOERROR
, 0))
2169 cur_idx
= re_string_cur_idx (mctx
->input
);
2170 /* Update the state_log if we need. */
2171 if (mctx
->state_log
!= NULL
)
2173 if (cur_idx
> mctx
->state_log_top
)
2175 mctx
->state_log
[cur_idx
] = next_state
;
2176 mctx
->state_log_top
= cur_idx
;
2178 else if (mctx
->state_log
[cur_idx
] == 0)
2180 mctx
->state_log
[cur_idx
] = next_state
;
2184 re_dfastate_t
*pstate
;
2185 unsigned int context
;
2186 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2187 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2188 the destination of a multibyte char/collating element/
2189 back reference. Then the next state is the union set of
2190 these destinations and the results of the transition table. */
2191 pstate
= mctx
->state_log
[cur_idx
];
2192 log_nodes
= pstate
->entrance_nodes
;
2193 if (next_state
!= NULL
)
2195 table_nodes
= next_state
->entrance_nodes
;
2196 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2198 if (BE (*err
!= REG_NOERROR
, 0))
2202 next_nodes
= *log_nodes
;
2203 /* Note: We already add the nodes of the initial state,
2204 then we don't need to add them here. */
2206 context
= re_string_context_at (mctx
->input
,
2207 re_string_cur_idx (mctx
->input
) - 1,
2208 mctx
->eflags
, preg
->newline_anchor
);
2209 next_state
= mctx
->state_log
[cur_idx
]
2210 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2211 /* We don't need to check errors here, since the return value of
2212 this function is next_state and ERR is already set. */
2214 if (table_nodes
!= NULL
)
2215 re_node_set_free (&next_nodes
);
2219 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2221 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2222 later. We must check them here, since the back references in the
2223 next state might use them. */
2224 *err
= check_subexp_matching_top (dfa
, mctx
, &next_state
->nodes
,
2226 if (BE (*err
!= REG_NOERROR
, 0))
2229 /* If the next state has back references. */
2230 if (next_state
->has_backref
)
2232 *err
= transit_state_bkref (preg
, &next_state
->nodes
, mctx
);
2233 if (BE (*err
!= REG_NOERROR
, 0))
2235 next_state
= mctx
->state_log
[cur_idx
];
2241 /* Helper functions for transit_state. */
2243 /* From the node set CUR_NODES, pick up the nodes whose types are
2244 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2245 expression. And register them to use them later for evaluating the
2246 correspoding back references. */
2248 static reg_errcode_t
2249 check_subexp_matching_top (dfa
, mctx
, cur_nodes
, str_idx
)
2251 re_match_context_t
*mctx
;
2252 re_node_set
*cur_nodes
;
2258 /* TODO: This isn't efficient.
2259 Because there might be more than one nodes whose types are
2260 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2263 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2265 int node
= cur_nodes
->elems
[node_idx
];
2266 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2267 && dfa
->nodes
[node
].opr
.idx
< (8 * sizeof (dfa
->used_bkref_map
))
2268 && dfa
->used_bkref_map
& (1 << dfa
->nodes
[node
].opr
.idx
))
2270 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2271 if (BE (err
!= REG_NOERROR
, 0))
2279 /* Return the next state to which the current state STATE will transit by
2280 accepting the current input byte. */
2282 static re_dfastate_t
*
2283 transit_state_sb (err
, preg
, state
, mctx
)
2285 const regex_t
*preg
;
2286 re_dfastate_t
*state
;
2287 re_match_context_t
*mctx
;
2289 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2290 re_node_set next_nodes
;
2291 re_dfastate_t
*next_state
;
2292 int node_cnt
, cur_str_idx
= re_string_cur_idx (mctx
->input
);
2293 unsigned int context
;
2295 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2296 if (BE (*err
!= REG_NOERROR
, 0))
2298 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2300 int cur_node
= state
->nodes
.elems
[node_cnt
];
2301 if (check_node_accept (preg
, dfa
->nodes
+ cur_node
, mctx
, cur_str_idx
))
2303 *err
= re_node_set_merge (&next_nodes
,
2304 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2305 if (BE (*err
!= REG_NOERROR
, 0))
2307 re_node_set_free (&next_nodes
);
2312 context
= re_string_context_at (mctx
->input
, cur_str_idx
, mctx
->eflags
,
2313 preg
->newline_anchor
);
2314 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2315 /* We don't need to check errors here, since the return value of
2316 this function is next_state and ERR is already set. */
2318 re_node_set_free (&next_nodes
);
2319 re_string_skip_bytes (mctx
->input
, 1);
2324 #ifdef RE_ENABLE_I18N
2325 static reg_errcode_t
2326 transit_state_mb (preg
, pstate
, mctx
)
2327 const regex_t
*preg
;
2328 re_dfastate_t
*pstate
;
2329 re_match_context_t
*mctx
;
2332 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2335 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2337 re_node_set dest_nodes
, *new_nodes
;
2338 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2339 int naccepted
= 0, dest_idx
;
2340 unsigned int context
;
2341 re_dfastate_t
*dest_state
;
2343 if (dfa
->nodes
[cur_node_idx
].constraint
)
2345 context
= re_string_context_at (mctx
->input
,
2346 re_string_cur_idx (mctx
->input
),
2347 mctx
->eflags
, preg
->newline_anchor
);
2348 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2353 /* How many bytes the node can accept? */
2354 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
2355 naccepted
= check_node_accept_bytes (preg
, cur_node_idx
, mctx
->input
,
2356 re_string_cur_idx (mctx
->input
));
2360 /* The node can accepts `naccepted' bytes. */
2361 dest_idx
= re_string_cur_idx (mctx
->input
) + naccepted
;
2362 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2363 : mctx
->max_mb_elem_len
);
2364 err
= clean_state_log_if_need (mctx
, dest_idx
);
2365 if (BE (err
!= REG_NOERROR
, 0))
2368 assert (dfa
->nexts
[cur_node_idx
] != -1);
2370 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2371 then we use pstate->nodes.elems[i] instead. */
2372 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
2374 dest_state
= mctx
->state_log
[dest_idx
];
2375 if (dest_state
== NULL
)
2376 dest_nodes
= *new_nodes
;
2379 err
= re_node_set_init_union (&dest_nodes
,
2380 dest_state
->entrance_nodes
, new_nodes
);
2381 if (BE (err
!= REG_NOERROR
, 0))
2384 context
= re_string_context_at (mctx
->input
, dest_idx
- 1, mctx
->eflags
,
2385 preg
->newline_anchor
);
2386 mctx
->state_log
[dest_idx
]
2387 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2388 if (dest_state
!= NULL
)
2389 re_node_set_free (&dest_nodes
);
2390 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2395 #endif /* RE_ENABLE_I18N */
2397 static reg_errcode_t
2398 transit_state_bkref (preg
, nodes
, mctx
)
2399 const regex_t
*preg
;
2400 const re_node_set
*nodes
;
2401 re_match_context_t
*mctx
;
2404 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2406 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
2408 for (i
= 0; i
< nodes
->nelem
; ++i
)
2410 int dest_str_idx
, prev_nelem
, bkc_idx
;
2411 int node_idx
= nodes
->elems
[i
];
2412 unsigned int context
;
2413 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2414 re_node_set
*new_dest_nodes
;
2416 /* Check whether `node' is a backreference or not. */
2417 if (node
->type
!= OP_BACK_REF
)
2420 if (node
->constraint
)
2422 context
= re_string_context_at (mctx
->input
, cur_str_idx
,
2423 mctx
->eflags
, preg
->newline_anchor
);
2424 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2428 /* `node' is a backreference.
2429 Check the substring which the substring matched. */
2430 bkc_idx
= mctx
->nbkref_ents
;
2431 err
= get_subexp (preg
, mctx
, node_idx
, cur_str_idx
);
2432 if (BE (err
!= REG_NOERROR
, 0))
2435 /* And add the epsilon closures (which is `new_dest_nodes') of
2436 the backreference to appropriate state_log. */
2438 assert (dfa
->nexts
[node_idx
] != -1);
2440 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2443 re_dfastate_t
*dest_state
;
2444 struct re_backref_cache_entry
*bkref_ent
;
2445 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2446 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2448 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2449 new_dest_nodes
= (subexp_len
== 0
2450 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2451 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2452 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2453 - bkref_ent
->subexp_from
);
2454 context
= re_string_context_at (mctx
->input
, dest_str_idx
- 1,
2455 mctx
->eflags
, preg
->newline_anchor
);
2456 dest_state
= mctx
->state_log
[dest_str_idx
];
2457 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2458 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2459 /* Add `new_dest_node' to state_log. */
2460 if (dest_state
== NULL
)
2462 mctx
->state_log
[dest_str_idx
]
2463 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2465 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2466 && err
!= REG_NOERROR
, 0))
2471 re_node_set dest_nodes
;
2472 err
= re_node_set_init_union (&dest_nodes
,
2473 dest_state
->entrance_nodes
,
2475 if (BE (err
!= REG_NOERROR
, 0))
2477 re_node_set_free (&dest_nodes
);
2480 mctx
->state_log
[dest_str_idx
]
2481 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2482 re_node_set_free (&dest_nodes
);
2483 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2484 && err
!= REG_NOERROR
, 0))
2487 /* We need to check recursively if the backreference can epsilon
2490 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2492 err
= check_subexp_matching_top (dfa
, mctx
, new_dest_nodes
,
2494 if (BE (err
!= REG_NOERROR
, 0))
2496 err
= transit_state_bkref (preg
, new_dest_nodes
, mctx
);
2497 if (BE (err
!= REG_NOERROR
, 0))
2507 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2508 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2509 Note that we might collect inappropriate candidates here.
2510 However, the cost of checking them strictly here is too high, then we
2511 delay these checking for prune_impossible_nodes(). */
2513 static reg_errcode_t
2514 get_subexp (preg
, mctx
, bkref_node
, bkref_str_idx
)
2515 const regex_t
*preg
;
2516 re_match_context_t
*mctx
;
2517 int bkref_node
, bkref_str_idx
;
2519 int subexp_num
, sub_top_idx
;
2520 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2521 const char *buf
= (const char *) re_string_get_buffer (mctx
->input
);
2522 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2523 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2524 for (; cache_idx
< mctx
->nbkref_ents
; ++cache_idx
)
2526 const struct re_backref_cache_entry
*entry
2527 = &mctx
->bkref_ents
[cache_idx
];
2528 if (entry
->str_idx
> bkref_str_idx
)
2530 if (entry
->node
== bkref_node
)
2531 return REG_NOERROR
; /* We already checked it. */
2533 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
- 1;
2535 /* For each sub expression */
2536 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2539 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2540 re_sub_match_last_t
*sub_last
;
2541 int sub_last_idx
, sl_str
;
2542 const char *bkref_str
;
2544 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2545 continue; /* It isn't related. */
2547 sl_str
= sub_top
->str_idx
;
2548 bkref_str
= buf
+ bkref_str_idx
;
2549 /* At first, check the last node of sub expressions we already
2551 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2554 sub_last
= sub_top
->lasts
[sub_last_idx
];
2555 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2556 /* The matched string by the sub expression match with the substring
2557 at the back reference? */
2559 && memcmp (bkref_str
, buf
+ sl_str
, sl_str_diff
) != 0)
2560 break; /* We don't need to search this sub expression any more. */
2561 bkref_str
+= sl_str_diff
;
2562 sl_str
+= sl_str_diff
;
2563 err
= get_subexp_sub (preg
, mctx
, sub_top
, sub_last
, bkref_node
,
2566 /* Reload buf and bkref_str, since the preceding call might
2567 have reallocated the buffer. */
2568 buf
= (const char *) re_string_get_buffer (mctx
->input
);
2569 bkref_str
= buf
+ bkref_str_idx
;
2571 if (err
== REG_NOMATCH
)
2573 if (BE (err
!= REG_NOERROR
, 0))
2576 if (sub_last_idx
< sub_top
->nlasts
)
2578 if (sub_last_idx
> 0)
2580 /* Then, search for the other last nodes of the sub expression. */
2581 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2583 int cls_node
, sl_str_off
;
2584 const re_node_set
*nodes
;
2585 sl_str_off
= sl_str
- sub_top
->str_idx
;
2586 /* The matched string by the sub expression match with the substring
2587 at the back reference? */
2588 if (sl_str_off
> 0 && *bkref_str
++ != buf
[sl_str
- 1])
2589 break; /* We don't need to search this sub expression any more. */
2590 if (mctx
->state_log
[sl_str
] == NULL
)
2592 /* Does this state have a ')' of the sub expression? */
2593 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2594 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
, OP_CLOSE_SUBEXP
);
2597 if (sub_top
->path
== NULL
)
2599 sub_top
->path
= calloc (sizeof (state_array_t
),
2600 sl_str
- sub_top
->str_idx
+ 1);
2601 if (sub_top
->path
== NULL
)
2604 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2605 in the current context? */
2606 err
= check_arrival (preg
, mctx
, sub_top
->path
, sub_top
->node
,
2607 sub_top
->str_idx
, cls_node
, sl_str
, OP_CLOSE_SUBEXP
);
2608 if (err
== REG_NOMATCH
)
2610 if (BE (err
!= REG_NOERROR
, 0))
2612 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2613 if (BE (sub_last
== NULL
, 0))
2615 err
= get_subexp_sub (preg
, mctx
, sub_top
, sub_last
, bkref_node
,
2617 if (err
== REG_NOMATCH
)
2624 /* Helper functions for get_subexp(). */
2626 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2627 If it can arrive, register the sub expression expressed with SUB_TOP
2630 static reg_errcode_t
2631 get_subexp_sub (preg
, mctx
, sub_top
, sub_last
, bkref_node
, bkref_str
)
2632 const regex_t
*preg
;
2633 re_match_context_t
*mctx
;
2634 const re_sub_match_top_t
*sub_top
;
2635 re_sub_match_last_t
*sub_last
;
2636 int bkref_node
, bkref_str
;
2640 /* Can the subexpression arrive the back reference? */
2641 err
= check_arrival (preg
, mctx
, &sub_last
->path
, sub_last
->node
,
2642 sub_last
->str_idx
, bkref_node
, bkref_str
, OP_OPEN_SUBEXP
);
2643 if (err
!= REG_NOERROR
)
2645 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2647 if (BE (err
!= REG_NOERROR
, 0))
2649 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2650 clean_state_log_if_need (mctx
, to_idx
);
2654 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2655 Search '(' if FL_OPEN, or search ')' otherwise.
2656 TODO: This function isn't efficient...
2657 Because there might be more than one nodes whose types are
2658 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2663 find_subexp_node (dfa
, nodes
, subexp_idx
, type
)
2664 const re_dfa_t
*dfa
;
2665 const re_node_set
*nodes
;
2666 int subexp_idx
, type
;
2669 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2671 int cls_node
= nodes
->elems
[cls_idx
];
2672 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2673 if (node
->type
== type
2674 && node
->opr
.idx
== subexp_idx
)
2680 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2681 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2683 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2685 static reg_errcode_t
2686 check_arrival (preg
, mctx
, path
, top_node
, top_str
, last_node
, last_str
,
2688 const regex_t
*preg
;
2689 re_match_context_t
*mctx
;
2690 state_array_t
*path
;
2691 int top_node
, top_str
, last_node
, last_str
, type
;
2693 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2695 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2696 re_dfastate_t
*cur_state
= NULL
;
2697 re_node_set
*cur_nodes
, next_nodes
;
2698 re_dfastate_t
**backup_state_log
;
2699 unsigned int context
;
2701 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2702 /* Extend the buffer if we need. */
2703 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2705 re_dfastate_t
**new_array
;
2706 int old_alloc
= path
->alloc
;
2707 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2708 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2709 if (new_array
== NULL
)
2711 path
->alloc
= old_alloc
;
2714 path
->array
= new_array
;
2715 memset (new_array
+ old_alloc
, '\0',
2716 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2719 str_idx
= path
->next_idx
== 0 ? top_str
: path
->next_idx
;
2721 /* Temporary modify MCTX. */
2722 backup_state_log
= mctx
->state_log
;
2723 backup_cur_idx
= mctx
->input
->cur_idx
;
2724 mctx
->state_log
= path
->array
;
2725 mctx
->input
->cur_idx
= str_idx
;
2727 /* Setup initial node set. */
2728 context
= re_string_context_at (mctx
->input
, str_idx
- 1, mctx
->eflags
,
2729 preg
->newline_anchor
);
2730 if (str_idx
== top_str
)
2732 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2733 if (BE (err
!= REG_NOERROR
, 0))
2735 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2736 if (BE (err
!= REG_NOERROR
, 0))
2738 re_node_set_free (&next_nodes
);
2744 cur_state
= mctx
->state_log
[str_idx
];
2745 if (cur_state
&& cur_state
->has_backref
)
2747 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2748 if (BE ( err
!= REG_NOERROR
, 0))
2752 re_node_set_init_empty (&next_nodes
);
2754 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2756 if (next_nodes
.nelem
)
2758 err
= expand_bkref_cache (preg
, mctx
, &next_nodes
, str_idx
, last_str
,
2760 if (BE ( err
!= REG_NOERROR
, 0))
2762 re_node_set_free (&next_nodes
);
2766 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2767 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2769 re_node_set_free (&next_nodes
);
2772 mctx
->state_log
[str_idx
] = cur_state
;
2775 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2777 re_node_set_empty (&next_nodes
);
2778 if (mctx
->state_log
[str_idx
+ 1])
2780 err
= re_node_set_merge (&next_nodes
,
2781 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2782 if (BE (err
!= REG_NOERROR
, 0))
2784 re_node_set_free (&next_nodes
);
2790 err
= check_arrival_add_next_nodes (preg
, dfa
, mctx
, str_idx
,
2791 &cur_state
->nodes
, &next_nodes
);
2792 if (BE (err
!= REG_NOERROR
, 0))
2794 re_node_set_free (&next_nodes
);
2799 if (next_nodes
.nelem
)
2801 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2802 if (BE (err
!= REG_NOERROR
, 0))
2804 re_node_set_free (&next_nodes
);
2807 err
= expand_bkref_cache (preg
, mctx
, &next_nodes
, str_idx
, last_str
,
2809 if (BE ( err
!= REG_NOERROR
, 0))
2811 re_node_set_free (&next_nodes
);
2815 context
= re_string_context_at (mctx
->input
, str_idx
- 1, mctx
->eflags
,
2816 preg
->newline_anchor
);
2817 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2818 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2820 re_node_set_free (&next_nodes
);
2823 mctx
->state_log
[str_idx
] = cur_state
;
2824 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
2826 re_node_set_free (&next_nodes
);
2827 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
2828 : &mctx
->state_log
[last_str
]->nodes
);
2829 path
->next_idx
= str_idx
;
2832 mctx
->state_log
= backup_state_log
;
2833 mctx
->input
->cur_idx
= backup_cur_idx
;
2835 /* Then check the current node set has the node LAST_NODE. */
2836 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
2842 /* Helper functions for check_arrival. */
2844 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2846 TODO: This function is similar to the functions transit_state*(),
2847 however this function has many additional works.
2848 Can't we unify them? */
2850 static reg_errcode_t
2851 check_arrival_add_next_nodes (preg
, dfa
, mctx
, str_idx
, cur_nodes
, next_nodes
)
2852 const regex_t
*preg
;
2854 re_match_context_t
*mctx
;
2856 re_node_set
*cur_nodes
, *next_nodes
;
2860 re_node_set union_set
;
2861 re_node_set_init_empty (&union_set
);
2862 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
2865 int cur_node
= cur_nodes
->elems
[cur_idx
];
2866 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
2867 if (IS_EPSILON_NODE (type
))
2869 #ifdef RE_ENABLE_I18N
2870 /* If the node may accept `multi byte'. */
2871 if (ACCEPT_MB_NODE (type
))
2873 naccepted
= check_node_accept_bytes (preg
, cur_node
, mctx
->input
,
2877 re_dfastate_t
*dest_state
;
2878 int next_node
= dfa
->nexts
[cur_node
];
2879 int next_idx
= str_idx
+ naccepted
;
2880 dest_state
= mctx
->state_log
[next_idx
];
2881 re_node_set_empty (&union_set
);
2884 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
2885 if (BE (err
!= REG_NOERROR
, 0))
2887 re_node_set_free (&union_set
);
2890 err
= re_node_set_insert (&union_set
, next_node
);
2891 if (BE (err
< 0, 0))
2893 re_node_set_free (&union_set
);
2899 err
= re_node_set_insert (&union_set
, next_node
);
2900 if (BE (err
< 0, 0))
2902 re_node_set_free (&union_set
);
2906 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
2908 if (BE (mctx
->state_log
[next_idx
] == NULL
2909 && err
!= REG_NOERROR
, 0))
2911 re_node_set_free (&union_set
);
2916 #endif /* RE_ENABLE_I18N */
2918 || check_node_accept (preg
, dfa
->nodes
+ cur_node
, mctx
,
2921 err
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
2922 if (BE (err
< 0, 0))
2924 re_node_set_free (&union_set
);
2929 re_node_set_free (&union_set
);
2933 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
2934 CUR_NODES, however exclude the nodes which are:
2935 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
2936 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
2939 static reg_errcode_t
2940 check_arrival_expand_ecl (dfa
, cur_nodes
, ex_subexp
, type
)
2942 re_node_set
*cur_nodes
;
2943 int ex_subexp
, type
;
2946 int idx
, outside_node
;
2947 re_node_set new_nodes
;
2949 assert (cur_nodes
->nelem
);
2951 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
2952 if (BE (err
!= REG_NOERROR
, 0))
2954 /* Create a new node set NEW_NODES with the nodes which are epsilon
2955 closures of the node in CUR_NODES. */
2957 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
2959 int cur_node
= cur_nodes
->elems
[idx
];
2960 re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
2961 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
2962 if (outside_node
== -1)
2964 /* There are no problematic nodes, just merge them. */
2965 err
= re_node_set_merge (&new_nodes
, eclosure
);
2966 if (BE (err
!= REG_NOERROR
, 0))
2968 re_node_set_free (&new_nodes
);
2974 /* There are problematic nodes, re-calculate incrementally. */
2975 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
2977 if (BE (err
!= REG_NOERROR
, 0))
2979 re_node_set_free (&new_nodes
);
2984 re_node_set_free (cur_nodes
);
2985 *cur_nodes
= new_nodes
;
2989 /* Helper function for check_arrival_expand_ecl.
2990 Check incrementally the epsilon closure of TARGET, and if it isn't
2991 problematic append it to DST_NODES. */
2993 static reg_errcode_t
2994 check_arrival_expand_ecl_sub (dfa
, dst_nodes
, target
, ex_subexp
, type
)
2996 int target
, ex_subexp
, type
;
2997 re_node_set
*dst_nodes
;
3000 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3004 if (dfa
->nodes
[cur_node
].type
== type
3005 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3007 if (type
== OP_CLOSE_SUBEXP
)
3009 err
= re_node_set_insert (dst_nodes
, cur_node
);
3010 if (BE (err
== -1, 0))
3015 err
= re_node_set_insert (dst_nodes
, cur_node
);
3016 if (BE (err
== -1, 0))
3018 if (dfa
->edests
[cur_node
].nelem
== 0)
3020 if (dfa
->edests
[cur_node
].nelem
== 2)
3022 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3023 dfa
->edests
[cur_node
].elems
[1],
3025 if (BE (err
!= REG_NOERROR
, 0))
3028 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3034 /* For all the back references in the current state, calculate the
3035 destination of the back references by the appropriate entry
3036 in MCTX->BKREF_ENTS. */
3038 static reg_errcode_t
3039 expand_bkref_cache (preg
, mctx
, cur_nodes
, cur_str
, last_str
, subexp_num
,
3041 const regex_t
*preg
;
3042 re_match_context_t
*mctx
;
3043 int cur_str
, last_str
, subexp_num
, type
;
3044 re_node_set
*cur_nodes
;
3047 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
3048 int cache_idx
, cache_idx_start
;
3049 /* The current state. */
3051 cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3052 for (cache_idx
= cache_idx_start
; cache_idx
< mctx
->nbkref_ents
; ++cache_idx
)
3054 int to_idx
, next_node
;
3055 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ cache_idx
;
3056 if (ent
->str_idx
> cur_str
)
3058 /* Is this entry ENT is appropriate? */
3059 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3062 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3063 /* Calculate the destination of the back reference, and append it
3064 to MCTX->STATE_LOG. */
3065 if (to_idx
== cur_str
)
3067 /* The backreference did epsilon transit, we must re-check all the
3068 node in the current state. */
3069 re_node_set new_dests
;
3070 reg_errcode_t err2
, err3
;
3071 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3072 if (re_node_set_contains (cur_nodes
, next_node
))
3074 err
= re_node_set_init_1 (&new_dests
, next_node
);
3075 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3076 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3077 re_node_set_free (&new_dests
);
3078 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3079 || err3
!= REG_NOERROR
, 0))
3081 err
= (err
!= REG_NOERROR
? err
3082 : (err2
!= REG_NOERROR
? err2
: err3
));
3085 /* TODO: It is still inefficient... */
3086 cache_idx
= cache_idx_start
- 1;
3091 re_node_set union_set
;
3092 next_node
= dfa
->nexts
[ent
->node
];
3093 if (mctx
->state_log
[to_idx
])
3096 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3099 err
= re_node_set_init_copy (&union_set
,
3100 &mctx
->state_log
[to_idx
]->nodes
);
3101 ret
= re_node_set_insert (&union_set
, next_node
);
3102 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3104 re_node_set_free (&union_set
);
3105 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3111 err
= re_node_set_init_1 (&union_set
, next_node
);
3112 if (BE (err
!= REG_NOERROR
, 0))
3115 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3116 re_node_set_free (&union_set
);
3117 if (BE (mctx
->state_log
[to_idx
] == NULL
3118 && err
!= REG_NOERROR
, 0))
3125 /* Build transition table for the state.
3126 Return the new table if succeeded, otherwise return NULL. */
3128 static re_dfastate_t
**
3129 build_trtable (preg
, state
)
3130 const regex_t
*preg
;
3131 re_dfastate_t
*state
;
3134 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
3136 int dests_node_malloced
= 0, dest_states_malloced
= 0;
3137 int ndests
; /* Number of the destination states from `state'. */
3138 re_dfastate_t
**trtable
;
3139 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3140 re_node_set follows
, *dests_node
;
3144 /* We build DFA states which corresponds to the destination nodes
3145 from `state'. `dests_node[i]' represents the nodes which i-th
3146 destination state contains, and `dests_ch[i]' represents the
3147 characters which i-th destination state accepts. */
3149 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
))
3150 dests_node
= (re_node_set
*)
3151 alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3155 dests_node
= (re_node_set
*)
3156 malloc ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3157 if (BE (dests_node
== NULL
, 0))
3159 dests_node_malloced
= 1;
3161 dests_ch
= (bitset
*) (dests_node
+ SBC_MAX
);
3163 /* Initialize transiton table. */
3164 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3165 state
->word_trtable
= 0;
3166 if (BE (trtable
== NULL
, 0))
3168 if (dests_node_malloced
)
3173 /* At first, group all nodes belonging to `state' into several
3175 ndests
= group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
);
3176 if (BE (ndests
<= 0, 0))
3178 if (dests_node_malloced
)
3180 /* Return NULL in case of an error, trtable otherwise. */
3183 state
->trtable
= trtable
;
3190 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3191 if (BE (err
!= REG_NOERROR
, 0))
3195 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
3196 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3197 dest_states
= (re_dfastate_t
**)
3198 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3202 dest_states
= (re_dfastate_t
**)
3203 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3204 if (BE (dest_states
== NULL
, 0))
3207 if (dest_states_malloced
)
3209 re_node_set_free (&follows
);
3210 for (i
= 0; i
< ndests
; ++i
)
3211 re_node_set_free (dests_node
+ i
);
3213 if (dests_node_malloced
)
3217 dest_states_malloced
= 1;
3219 dest_states_word
= dest_states
+ ndests
;
3220 dest_states_nl
= dest_states_word
+ ndests
;
3221 bitset_empty (acceptable
);
3223 /* Then build the states for all destinations. */
3224 for (i
= 0; i
< ndests
; ++i
)
3227 re_node_set_empty (&follows
);
3228 /* Merge the follows of this destination states. */
3229 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3231 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3232 if (next_node
!= -1)
3234 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3235 if (BE (err
!= REG_NOERROR
, 0))
3239 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3240 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3242 /* If the new state has context constraint,
3243 build appropriate states for these contexts. */
3244 if (dest_states
[i
]->has_constraint
)
3246 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3248 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3250 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3252 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3257 dest_states_word
[i
] = dest_states
[i
];
3258 dest_states_nl
[i
] = dest_states
[i
];
3260 bitset_merge (acceptable
, dests_ch
[i
]);
3263 /* Update the transition table. */
3264 /* For all characters ch...: */
3265 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
3266 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
3267 if ((acceptable
[i
] >> j
) & 1)
3269 for (k
= 0; k
< ndests
; ++k
)
3270 if ((dests_ch
[k
][i
] >> j
) & 1)
3272 /* k-th destination accepts the word character ch. */
3273 if (state
->word_trtable
)
3275 trtable
[ch
] = dest_states
[k
];
3276 trtable
[ch
+ SBC_MAX
] = dest_states_word
[k
];
3278 else if (dfa
->mb_cur_max
> 1
3279 && dest_states
[k
] != dest_states_word
[k
])
3281 re_dfastate_t
**new_trtable
;
3283 new_trtable
= (re_dfastate_t
**)
3285 sizeof (re_dfastate_t
*)
3287 if (BE (new_trtable
== NULL
, 0))
3289 memcpy (new_trtable
+ SBC_MAX
, new_trtable
,
3290 sizeof (re_dfastate_t
*) * SBC_MAX
);
3291 trtable
= new_trtable
;
3292 state
->word_trtable
= 1;
3293 trtable
[ch
] = dest_states
[k
];
3294 trtable
[ch
+ SBC_MAX
] = dest_states_word
[k
];
3296 else if (IS_WORD_CHAR (ch
))
3297 trtable
[ch
] = dest_states_word
[k
];
3299 trtable
[ch
] = dest_states
[k
];
3300 /* There must be only one destination which accepts
3301 character ch. See group_nodes_into_DFAstates. */
3306 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3308 /* The current state accepts newline character. */
3309 for (k
= 0; k
< ndests
; ++k
)
3310 if (bitset_contain (dests_ch
[k
], NEWLINE_CHAR
))
3312 /* k-th destination accepts newline character. */
3313 trtable
[NEWLINE_CHAR
] = dest_states_nl
[k
];
3314 if (state
->word_trtable
)
3315 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[k
];
3316 /* There must be only one destination which accepts
3317 newline. See group_nodes_into_DFAstates. */
3322 if (dest_states_malloced
)
3325 re_node_set_free (&follows
);
3326 for (i
= 0; i
< ndests
; ++i
)
3327 re_node_set_free (dests_node
+ i
);
3329 if (dests_node_malloced
)
3332 state
->trtable
= trtable
;
3336 /* Group all nodes belonging to STATE into several destinations.
3337 Then for all destinations, set the nodes belonging to the destination
3338 to DESTS_NODE[i] and set the characters accepted by the destination
3339 to DEST_CH[i]. This function return the number of destinations. */
3342 group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
)
3343 const regex_t
*preg
;
3344 const re_dfastate_t
*state
;
3345 re_node_set
*dests_node
;
3349 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
3351 int ndests
; /* Number of the destinations from `state'. */
3352 bitset accepts
; /* Characters a node can accept. */
3353 const re_node_set
*cur_nodes
= &state
->nodes
;
3354 bitset_empty (accepts
);
3357 /* For all the nodes belonging to `state', */
3358 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3360 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3361 re_token_type_t type
= node
->type
;
3362 unsigned int constraint
= node
->constraint
;
3364 /* Enumerate all single byte character this node can accept. */
3365 if (type
== CHARACTER
)
3366 bitset_set (accepts
, node
->opr
.c
);
3367 else if (type
== SIMPLE_BRACKET
)
3369 bitset_merge (accepts
, node
->opr
.sbcset
);
3371 else if (type
== OP_PERIOD
)
3373 #ifdef RE_ENABLE_I18N
3374 if (dfa
->mb_cur_max
> 1)
3375 bitset_merge (accepts
, dfa
->sb_char
);
3378 bitset_set_all (accepts
);
3379 if (!(preg
->syntax
& RE_DOT_NEWLINE
))
3380 bitset_clear (accepts
, '\n');
3381 if (preg
->syntax
& RE_DOT_NOT_NULL
)
3382 bitset_clear (accepts
, '\0');
3384 #ifdef RE_ENABLE_I18N
3385 else if (type
== OP_UTF8_PERIOD
)
3387 memset (accepts
, 255, sizeof (unsigned int) * BITSET_UINTS
/ 2);
3388 if (!(preg
->syntax
& RE_DOT_NEWLINE
))
3389 bitset_clear (accepts
, '\n');
3390 if (preg
->syntax
& RE_DOT_NOT_NULL
)
3391 bitset_clear (accepts
, '\0');
3397 /* Check the `accepts' and sift the characters which are not
3398 match it the context. */
3401 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3403 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3404 bitset_empty (accepts
);
3405 if (accepts_newline
)
3406 bitset_set (accepts
, NEWLINE_CHAR
);
3410 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3412 bitset_empty (accepts
);
3416 if (constraint
& NEXT_WORD_CONSTRAINT
)
3418 unsigned int any_set
= 0;
3419 if (type
== CHARACTER
&& !node
->word_char
)
3421 bitset_empty (accepts
);
3424 #ifdef RE_ENABLE_I18N
3425 if (dfa
->mb_cur_max
> 1)
3426 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3427 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3430 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3431 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3435 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3437 unsigned int any_set
= 0;
3438 if (type
== CHARACTER
&& node
->word_char
)
3440 bitset_empty (accepts
);
3443 #ifdef RE_ENABLE_I18N
3444 if (dfa
->mb_cur_max
> 1)
3445 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3446 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3449 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3450 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3456 /* Then divide `accepts' into DFA states, or create a new
3457 state. Above, we make sure that accepts is not empty. */
3458 for (j
= 0; j
< ndests
; ++j
)
3460 bitset intersec
; /* Intersection sets, see below. */
3462 /* Flags, see below. */
3463 int has_intersec
, not_subset
, not_consumed
;
3465 /* Optimization, skip if this state doesn't accept the character. */
3466 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3469 /* Enumerate the intersection set of this state and `accepts'. */
3471 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3472 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3473 /* And skip if the intersection set is empty. */
3477 /* Then check if this state is a subset of `accepts'. */
3478 not_subset
= not_consumed
= 0;
3479 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3481 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3482 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3485 /* If this state isn't a subset of `accepts', create a
3486 new group state, which has the `remains'. */
3489 bitset_copy (dests_ch
[ndests
], remains
);
3490 bitset_copy (dests_ch
[j
], intersec
);
3491 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3492 if (BE (err
!= REG_NOERROR
, 0))
3497 /* Put the position in the current group. */
3498 err
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3499 if (BE (err
< 0, 0))
3502 /* If all characters are consumed, go to next node. */
3506 /* Some characters remain, create a new group. */
3509 bitset_copy (dests_ch
[ndests
], accepts
);
3510 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3511 if (BE (err
!= REG_NOERROR
, 0))
3514 bitset_empty (accepts
);
3519 for (j
= 0; j
< ndests
; ++j
)
3520 re_node_set_free (dests_node
+ j
);
3524 #ifdef RE_ENABLE_I18N
3525 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3526 Return the number of the bytes the node accepts.
3527 STR_IDX is the current index of the input string.
3529 This function handles the nodes which can accept one character, or
3530 one collating element like '.', '[a-z]', opposite to the other nodes
3531 can only accept one byte. */
3534 check_node_accept_bytes (preg
, node_idx
, input
, str_idx
)
3535 const regex_t
*preg
;
3536 int node_idx
, str_idx
;
3537 const re_string_t
*input
;
3539 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
3540 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3541 int char_len
, elem_len
;
3544 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3546 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3547 if (BE (c
< 0xc2, 1))
3550 if (str_idx
+ 2 > input
->len
)
3553 d
= re_string_byte_at (input
, str_idx
+ 1);
3555 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3559 if (c
== 0xe0 && d
< 0xa0)
3565 if (c
== 0xf0 && d
< 0x90)
3571 if (c
== 0xf8 && d
< 0x88)
3577 if (c
== 0xfc && d
< 0x84)
3583 if (str_idx
+ char_len
> input
->len
)
3586 for (i
= 1; i
< char_len
; ++i
)
3588 d
= re_string_byte_at (input
, str_idx
+ i
);
3589 if (d
< 0x80 || d
> 0xbf)
3595 char_len
= re_string_char_size_at (input
, str_idx
);
3596 if (node
->type
== OP_PERIOD
)
3600 /* FIXME: I don't think this if is needed, as both '\n'
3601 and '\0' are char_len == 1. */
3602 /* '.' accepts any one character except the following two cases. */
3603 if ((!(preg
->syntax
& RE_DOT_NEWLINE
) &&
3604 re_string_byte_at (input
, str_idx
) == '\n') ||
3605 ((preg
->syntax
& RE_DOT_NOT_NULL
) &&
3606 re_string_byte_at (input
, str_idx
) == '\0'))
3611 elem_len
= re_string_elem_size_at (input
, str_idx
);
3612 if (elem_len
<= 1 && char_len
<= 1)
3615 if (node
->type
== COMPLEX_BRACKET
)
3617 const re_charset_t
*cset
= node
->opr
.mbcset
;
3619 const unsigned char *pin
= ((char *) re_string_get_buffer (input
)
3625 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3626 ? re_string_wchar_at (input
, str_idx
) : 0);
3628 /* match with multibyte character? */
3629 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3630 if (wc
== cset
->mbchars
[i
])
3632 match_len
= char_len
;
3633 goto check_node_accept_bytes_match
;
3635 /* match with character_class? */
3636 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3638 wctype_t wt
= cset
->char_classes
[i
];
3639 if (__iswctype (wc
, wt
))
3641 match_len
= char_len
;
3642 goto check_node_accept_bytes_match
;
3647 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3650 unsigned int in_collseq
= 0;
3651 const int32_t *table
, *indirect
;
3652 const unsigned char *weights
, *extra
;
3653 const char *collseqwc
;
3655 /* This #include defines a local function! */
3656 # include <locale/weight.h>
3658 /* match with collating_symbol? */
3659 if (cset
->ncoll_syms
)
3660 extra
= (const unsigned char *)
3661 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3662 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3664 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3665 /* Compare the length of input collating element and
3666 the length of current collating element. */
3667 if (*coll_sym
!= elem_len
)
3669 /* Compare each bytes. */
3670 for (j
= 0; j
< *coll_sym
; j
++)
3671 if (pin
[j
] != coll_sym
[1 + j
])
3675 /* Match if every bytes is equal. */
3677 goto check_node_accept_bytes_match
;
3683 if (elem_len
<= char_len
)
3685 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3686 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3689 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3691 /* match with range expression? */
3692 for (i
= 0; i
< cset
->nranges
; ++i
)
3693 if (cset
->range_starts
[i
] <= in_collseq
3694 && in_collseq
<= cset
->range_ends
[i
])
3696 match_len
= elem_len
;
3697 goto check_node_accept_bytes_match
;
3700 /* match with equivalence_class? */
3701 if (cset
->nequiv_classes
)
3703 const unsigned char *cp
= pin
;
3704 table
= (const int32_t *)
3705 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3706 weights
= (const unsigned char *)
3707 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3708 extra
= (const unsigned char *)
3709 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3710 indirect
= (const int32_t *)
3711 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3712 idx
= findidx (&cp
);
3714 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3716 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3717 size_t weight_len
= weights
[idx
];
3718 if (weight_len
== weights
[equiv_class_idx
])
3721 while (cnt
<= weight_len
3722 && (weights
[equiv_class_idx
+ 1 + cnt
]
3723 == weights
[idx
+ 1 + cnt
]))
3725 if (cnt
> weight_len
)
3727 match_len
= elem_len
;
3728 goto check_node_accept_bytes_match
;
3737 /* match with range expression? */
3739 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3741 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3744 for (i
= 0; i
< cset
->nranges
; ++i
)
3746 cmp_buf
[0] = cset
->range_starts
[i
];
3747 cmp_buf
[4] = cset
->range_ends
[i
];
3748 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3749 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3751 match_len
= char_len
;
3752 goto check_node_accept_bytes_match
;
3756 check_node_accept_bytes_match
:
3757 if (!cset
->non_match
)
3764 return (elem_len
> char_len
) ? elem_len
: char_len
;
3772 find_collation_sequence_value (mbs
, mbs_len
)
3773 const unsigned char *mbs
;
3776 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3781 /* No valid character. Match it as a single byte character. */
3782 const unsigned char *collseq
= (const unsigned char *)
3783 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3784 return collseq
[mbs
[0]];
3791 const unsigned char *extra
= (const unsigned char *)
3792 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3796 int mbs_cnt
, found
= 0;
3797 int32_t elem_mbs_len
;
3798 /* Skip the name of collating element name. */
3799 idx
= idx
+ extra
[idx
] + 1;
3800 elem_mbs_len
= extra
[idx
++];
3801 if (mbs_len
== elem_mbs_len
)
3803 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3804 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3806 if (mbs_cnt
== elem_mbs_len
)
3807 /* Found the entry. */
3810 /* Skip the byte sequence of the collating element. */
3811 idx
+= elem_mbs_len
;
3812 /* Adjust for the alignment. */
3813 idx
= (idx
+ 3) & ~3;
3814 /* Skip the collation sequence value. */
3815 idx
+= sizeof (uint32_t);
3816 /* Skip the wide char sequence of the collating element. */
3817 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3818 /* If we found the entry, return the sequence value. */
3820 return *(uint32_t *) (extra
+ idx
);
3821 /* Skip the collation sequence value. */
3822 idx
+= sizeof (uint32_t);
3827 #endif /* RE_ENABLE_I18N */
3829 /* Check whether the node accepts the byte which is IDX-th
3830 byte of the INPUT. */
3833 check_node_accept (preg
, node
, mctx
, idx
)
3834 const regex_t
*preg
;
3835 const re_token_t
*node
;
3836 const re_match_context_t
*mctx
;
3840 if (node
->constraint
)
3842 /* The node has constraints. Check whether the current context
3843 satisfies the constraints. */
3844 unsigned int context
= re_string_context_at (mctx
->input
, idx
,
3846 preg
->newline_anchor
);
3847 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
3850 ch
= re_string_byte_at (mctx
->input
, idx
);
3854 return node
->opr
.c
== ch
;
3855 case SIMPLE_BRACKET
:
3856 return bitset_contain (node
->opr
.sbcset
, ch
);
3857 #ifdef RE_ENABLE_I18N
3858 case OP_UTF8_PERIOD
:
3864 return !((ch
== '\n' && !(preg
->syntax
& RE_DOT_NEWLINE
))
3865 || (ch
== '\0' && (preg
->syntax
& RE_DOT_NOT_NULL
)));
3871 /* Extend the buffers, if the buffers have run out. */
3873 static reg_errcode_t
3874 extend_buffers (mctx
)
3875 re_match_context_t
*mctx
;
3878 re_string_t
*pstr
= mctx
->input
;
3880 /* Double the lengthes of the buffers. */
3881 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
3882 if (BE (ret
!= REG_NOERROR
, 0))
3885 if (mctx
->state_log
!= NULL
)
3887 /* And double the length of state_log. */
3888 /* XXX We have no indication of the size of this buffer. If this
3889 allocation fail we have no indication that the state_log array
3890 does not have the right size. */
3891 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
3892 pstr
->bufs_len
+ 1);
3893 if (BE (new_array
== NULL
, 0))
3895 mctx
->state_log
= new_array
;
3898 /* Then reconstruct the buffers. */
3901 #ifdef RE_ENABLE_I18N
3902 if (pstr
->mb_cur_max
> 1)
3904 ret
= build_wcs_upper_buffer (pstr
);
3905 if (BE (ret
!= REG_NOERROR
, 0))
3909 #endif /* RE_ENABLE_I18N */
3910 build_upper_buffer (pstr
);
3914 #ifdef RE_ENABLE_I18N
3915 if (pstr
->mb_cur_max
> 1)
3916 build_wcs_buffer (pstr
);
3918 #endif /* RE_ENABLE_I18N */
3920 if (pstr
->trans
!= NULL
)
3921 re_string_translate_buffer (pstr
);
3928 /* Functions for matching context. */
3930 /* Initialize MCTX. */
3932 static reg_errcode_t
3933 match_ctx_init (mctx
, eflags
, input
, n
)
3934 re_match_context_t
*mctx
;
3938 mctx
->eflags
= eflags
;
3939 mctx
->input
= input
;
3940 mctx
->match_last
= -1;
3943 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
3944 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
3945 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
3949 mctx
->bkref_ents
= NULL
;
3950 mctx
->nbkref_ents
= 0;
3951 mctx
->abkref_ents
= n
;
3952 mctx
->max_mb_elem_len
= 1;
3953 mctx
->nsub_tops
= 0;
3954 mctx
->asub_tops
= n
;
3958 /* Clean the entries which depend on the current input in MCTX.
3959 This function must be invoked when the matcher changes the start index
3960 of the input, or changes the input string. */
3963 match_ctx_clean (mctx
)
3964 re_match_context_t
*mctx
;
3966 match_ctx_free_subtops (mctx
);
3967 mctx
->nsub_tops
= 0;
3968 mctx
->nbkref_ents
= 0;
3971 /* Free all the memory associated with MCTX. */
3974 match_ctx_free (mctx
)
3975 re_match_context_t
*mctx
;
3977 match_ctx_free_subtops (mctx
);
3978 re_free (mctx
->sub_tops
);
3979 re_free (mctx
->bkref_ents
);
3982 /* Free all the memory associated with MCTX->SUB_TOPS. */
3985 match_ctx_free_subtops (mctx
)
3986 re_match_context_t
*mctx
;
3989 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
3992 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
3993 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
3995 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
3996 re_free (last
->path
.array
);
3999 re_free (top
->lasts
);
4002 re_free (top
->path
->array
);
4003 re_free (top
->path
);
4009 /* Add a new backreference entry to MCTX.
4010 Note that we assume that caller never call this function with duplicate
4011 entry, and call with STR_IDX which isn't smaller than any existing entry.
4014 static reg_errcode_t
4015 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
4016 re_match_context_t
*mctx
;
4017 int node
, str_idx
, from
, to
;
4019 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4021 struct re_backref_cache_entry
* new_entry
;
4022 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4023 mctx
->abkref_ents
* 2);
4024 if (BE (new_entry
== NULL
, 0))
4026 re_free (mctx
->bkref_ents
);
4029 mctx
->bkref_ents
= new_entry
;
4030 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4031 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4032 mctx
->abkref_ents
*= 2;
4034 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4035 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4036 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4037 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4038 mctx
->bkref_ents
[mctx
->nbkref_ents
++].flag
= 0;
4039 if (mctx
->max_mb_elem_len
< to
- from
)
4040 mctx
->max_mb_elem_len
= to
- from
;
4044 /* Search for the first entry which has the same str_idx.
4045 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4048 search_cur_bkref_entry (mctx
, str_idx
)
4049 re_match_context_t
*mctx
;
4052 int left
, right
, mid
;
4053 right
= mctx
->nbkref_ents
;
4054 for (left
= 0; left
< right
;)
4056 mid
= (left
+ right
) / 2;
4057 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4066 match_ctx_clear_flag (mctx
)
4067 re_match_context_t
*mctx
;
4070 for (i
= 0; i
< mctx
->nbkref_ents
; ++i
)
4071 mctx
->bkref_ents
[i
].flag
= 0;
4074 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4077 static reg_errcode_t
4078 match_ctx_add_subtop (mctx
, node
, str_idx
)
4079 re_match_context_t
*mctx
;
4083 assert (mctx
->sub_tops
!= NULL
);
4084 assert (mctx
->asub_tops
> 0);
4086 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4088 int new_asub_tops
= mctx
->asub_tops
* 2;
4089 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4090 re_sub_match_top_t
*,
4092 if (BE (new_array
== NULL
, 0))
4094 mctx
->sub_tops
= new_array
;
4095 mctx
->asub_tops
= new_asub_tops
;
4097 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4098 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4100 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4101 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4105 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4106 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4108 static re_sub_match_last_t
*
4109 match_ctx_add_sublast (subtop
, node
, str_idx
)
4110 re_sub_match_top_t
*subtop
;
4113 re_sub_match_last_t
*new_entry
;
4114 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4116 int new_alasts
= 2 * subtop
->alasts
+ 1;
4117 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4118 re_sub_match_last_t
*,
4120 if (BE (new_array
== NULL
, 0))
4122 subtop
->lasts
= new_array
;
4123 subtop
->alasts
= new_alasts
;
4125 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4126 if (BE (new_entry
!= NULL
, 1))
4128 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4129 new_entry
->node
= node
;
4130 new_entry
->str_idx
= str_idx
;
4137 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
,
4139 re_sift_context_t
*sctx
;
4140 re_dfastate_t
**sifted_sts
, **limited_sts
;
4141 int last_node
, last_str_idx
, check_subexp
;
4143 sctx
->sifted_states
= sifted_sts
;
4144 sctx
->limited_states
= limited_sts
;
4145 sctx
->last_node
= last_node
;
4146 sctx
->last_str_idx
= last_str_idx
;
4147 sctx
->check_subexp
= check_subexp
;
4148 sctx
->cur_bkref
= -1;
4149 sctx
->cls_subexp_idx
= -1;
4150 re_node_set_init_empty (&sctx
->limits
);