1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 re_string_t
*input
, int n
);
23 static void match_ctx_clean (re_match_context_t
*mctx
);
24 static void match_ctx_free (re_match_context_t
*cache
);
25 static void match_ctx_free_subtops (re_match_context_t
*mctx
);
26 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
27 int str_idx
, int from
, int to
);
28 static int search_cur_bkref_entry (re_match_context_t
*mctx
, int str_idx
);
29 static void match_ctx_clear_flag (re_match_context_t
*mctx
);
30 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
32 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
33 int node
, int str_idx
);
34 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
35 re_dfastate_t
**limited_sts
, int last_node
,
36 int last_str_idx
, int check_subexp
);
37 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
38 const char *string
, int length
,
39 int start
, int range
, int stop
,
40 size_t nmatch
, regmatch_t pmatch
[],
42 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
43 const char *string1
, int length1
,
44 const char *string2
, int length2
,
45 int start
, int range
, struct re_registers
*regs
,
46 int stop
, int ret_len
);
47 static int re_search_stub (struct re_pattern_buffer
*bufp
,
48 const char *string
, int length
, int start
,
49 int range
, int stop
, struct re_registers
*regs
,
51 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
52 int nregs
, int regs_allocated
);
53 static re_dfastate_t
*acquire_init_state_context (reg_errcode_t
*err
,
55 const re_match_context_t
*mctx
,
57 static reg_errcode_t
prune_impossible_nodes (const regex_t
*preg
,
58 re_match_context_t
*mctx
);
59 static int check_matching (const regex_t
*preg
, re_match_context_t
*mctx
,
60 int fl_search
, int fl_longest_match
);
61 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
62 unsigned int context
);
63 static int check_halt_state_context (const regex_t
*preg
,
64 const re_dfastate_t
*state
,
65 const re_match_context_t
*mctx
, int idx
);
66 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
, int cur_node
,
67 int cur_idx
, int nmatch
);
68 static int proceed_next_node (const regex_t
*preg
, int nregs
, regmatch_t
*regs
,
69 const re_match_context_t
*mctx
,
70 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
71 struct re_fail_stack_t
*fs
);
72 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
73 int str_idx
, int *dests
, int nregs
,
75 re_node_set
*eps_via_nodes
);
76 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
77 regmatch_t
*regs
, re_node_set
*eps_via_nodes
);
78 static reg_errcode_t
set_regs (const regex_t
*preg
,
79 const re_match_context_t
*mctx
,
80 size_t nmatch
, regmatch_t
*pmatch
,
82 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
);
85 static int sift_states_iter_mb (const regex_t
*preg
,
86 const re_match_context_t
*mctx
,
87 re_sift_context_t
*sctx
,
88 int node_idx
, int str_idx
, int max_str_idx
);
89 #endif /* RE_ENABLE_I18N */
90 static reg_errcode_t
sift_states_backward (const regex_t
*preg
,
91 re_match_context_t
*mctx
,
92 re_sift_context_t
*sctx
);
93 static reg_errcode_t
update_cur_sifted_state (const regex_t
*preg
,
94 re_match_context_t
*mctx
,
95 re_sift_context_t
*sctx
,
97 re_node_set
*dest_nodes
);
98 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
99 re_node_set
*dest_nodes
,
100 const re_node_set
*candidates
);
101 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
102 re_node_set
*dest_nodes
,
103 const re_node_set
*and_nodes
);
104 static int check_dst_limits (re_dfa_t
*dfa
, re_node_set
*limits
,
105 re_match_context_t
*mctx
, int dst_node
,
106 int dst_idx
, int src_node
, int src_idx
);
107 static int check_dst_limits_calc_pos (re_dfa_t
*dfa
, re_match_context_t
*mctx
,
108 int limit
, re_node_set
*eclosures
,
109 int subexp_idx
, int node
, int str_idx
);
110 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
111 re_node_set
*dest_nodes
,
112 const re_node_set
*candidates
,
114 struct re_backref_cache_entry
*bkref_ents
,
116 static reg_errcode_t
sift_states_bkref (const regex_t
*preg
,
117 re_match_context_t
*mctx
,
118 re_sift_context_t
*sctx
,
119 int str_idx
, re_node_set
*dest_nodes
);
120 static reg_errcode_t
clean_state_log_if_need (re_match_context_t
*mctx
,
121 int next_state_log_idx
);
122 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
123 re_dfastate_t
**src
, int num
);
124 static re_dfastate_t
*transit_state (reg_errcode_t
*err
, const regex_t
*preg
,
125 re_match_context_t
*mctx
,
126 re_dfastate_t
*state
, int fl_search
);
127 static reg_errcode_t
check_subexp_matching_top (re_dfa_t
*dfa
,
128 re_match_context_t
*mctx
,
129 re_node_set
*cur_nodes
,
131 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
, const regex_t
*preg
,
132 re_dfastate_t
*pstate
,
134 re_match_context_t
*mctx
);
135 #ifdef RE_ENABLE_I18N
136 static reg_errcode_t
transit_state_mb (const regex_t
*preg
,
137 re_dfastate_t
*pstate
,
138 re_match_context_t
*mctx
);
139 #endif /* RE_ENABLE_I18N */
140 static reg_errcode_t
transit_state_bkref (const regex_t
*preg
,
142 re_match_context_t
*mctx
);
143 static reg_errcode_t
get_subexp (const regex_t
*preg
, re_match_context_t
*mctx
,
144 int bkref_node
, int bkref_str_idx
);
145 static reg_errcode_t
get_subexp_sub (const regex_t
*preg
,
146 re_match_context_t
*mctx
,
147 re_sub_match_top_t
*sub_top
,
148 re_sub_match_last_t
*sub_last
,
149 int bkref_node
, int bkref_str
);
150 static int find_subexp_node (re_dfa_t
*dfa
, re_node_set
*nodes
,
151 int subexp_idx
, int fl_open
);
152 static reg_errcode_t
check_arrival (const regex_t
*preg
,
153 re_match_context_t
*mctx
,
154 state_array_t
*path
, int top_node
,
155 int top_str
, int last_node
, int last_str
,
157 static reg_errcode_t
check_arrival_add_next_nodes (const regex_t
*preg
,
159 re_match_context_t
*mctx
,
161 re_node_set
*cur_nodes
,
162 re_node_set
*next_nodes
);
163 static reg_errcode_t
check_arrival_expand_ecl (re_dfa_t
*dfa
,
164 re_node_set
*cur_nodes
,
165 int ex_subexp
, int fl_open
);
166 static reg_errcode_t
check_arrival_expand_ecl_sub (re_dfa_t
*dfa
,
167 re_node_set
*dst_nodes
,
168 int target
, int ex_subexp
,
170 static reg_errcode_t
expand_bkref_cache (const regex_t
*preg
,
171 re_match_context_t
*mctx
,
172 re_node_set
*cur_nodes
, int cur_str
,
173 int last_str
, int subexp_num
,
175 static re_dfastate_t
**build_trtable (const regex_t
*dfa
,
176 const re_dfastate_t
*state
,
178 #ifdef RE_ENABLE_I18N
179 static int check_node_accept_bytes (const regex_t
*preg
, int node_idx
,
180 const re_string_t
*input
, int idx
);
182 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
185 #endif /* RE_ENABLE_I18N */
186 static int group_nodes_into_DFAstates (const regex_t
*dfa
,
187 const re_dfastate_t
*state
,
188 re_node_set
*states_node
,
190 static int check_node_accept (const regex_t
*preg
, const re_token_t
*node
,
191 const re_match_context_t
*mctx
, int idx
);
192 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
);
194 /* Entry point for POSIX code. */
196 /* regexec searches for a given pattern, specified by PREG, in the
199 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
200 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
201 least NMATCH elements, and we set them to the offsets of the
202 corresponding matched substrings.
204 EFLAGS specifies `execution flags' which affect matching: if
205 REG_NOTBOL is set, then ^ does not match at the beginning of the
206 string; if REG_NOTEOL is set, then $ does not match at the end.
208 We return 0 if we find a match and REG_NOMATCH if not. */
211 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
212 const regex_t
*__restrict preg
;
213 const char *__restrict string
;
219 int length
= strlen (string
);
221 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, 0,
224 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, nmatch
,
226 return err
!= REG_NOERROR
;
229 weak_alias (__regexec
, regexec
)
232 /* Entry points for GNU code. */
234 /* re_match, re_search, re_match_2, re_search_2
236 The former two functions operate on STRING with length LENGTH,
237 while the later two operate on concatenation of STRING1 and STRING2
238 with lengths LENGTH1 and LENGTH2, respectively.
240 re_match() matches the compiled pattern in BUFP against the string,
241 starting at index START.
243 re_search() first tries matching at index START, then it tries to match
244 starting from index START + 1, and so on. The last start position tried
245 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
248 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
249 the first STOP characters of the concatenation of the strings should be
252 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
253 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
254 computed relative to the concatenation, not relative to the individual
257 On success, re_match* functions return the length of the match, re_search*
258 return the position of the start of the match. Return value -1 means no
259 match was found and -2 indicates an internal error. */
262 re_match (bufp
, string
, length
, start
, regs
)
263 struct re_pattern_buffer
*bufp
;
266 struct re_registers
*regs
;
268 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
271 weak_alias (__re_match
, re_match
)
275 re_search (bufp
, string
, length
, start
, range
, regs
)
276 struct re_pattern_buffer
*bufp
;
278 int length
, start
, range
;
279 struct re_registers
*regs
;
281 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
284 weak_alias (__re_search
, re_search
)
288 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
289 struct re_pattern_buffer
*bufp
;
290 const char *string1
, *string2
;
291 int length1
, length2
, start
, stop
;
292 struct re_registers
*regs
;
294 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
295 start
, 0, regs
, stop
, 1);
298 weak_alias (__re_match_2
, re_match_2
)
302 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
303 struct re_pattern_buffer
*bufp
;
304 const char *string1
, *string2
;
305 int length1
, length2
, start
, range
, stop
;
306 struct re_registers
*regs
;
308 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
309 start
, range
, regs
, stop
, 0);
312 weak_alias (__re_search_2
, re_search_2
)
316 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
318 struct re_pattern_buffer
*bufp
;
319 const char *string1
, *string2
;
320 int length1
, length2
, start
, range
, stop
, ret_len
;
321 struct re_registers
*regs
;
325 int len
= length1
+ length2
;
328 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
331 /* Concatenate the strings. */
335 char *s
= re_malloc (char, len
);
337 if (BE (s
== NULL
, 0))
339 memcpy (s
, string1
, length1
);
340 memcpy (s
+ length1
, string2
, length2
);
349 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
352 re_free ((char *) str
);
356 /* The parameters have the same meaning as those of re_search.
357 Additional parameters:
358 If RET_LEN is nonzero the length of the match is returned (re_match style);
359 otherwise the position of the match is returned. */
362 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
363 struct re_pattern_buffer
*bufp
;
365 int length
, start
, range
, stop
, ret_len
;
366 struct re_registers
*regs
;
368 reg_errcode_t result
;
373 /* Check for out-of-range. */
374 if (BE (start
< 0 || start
> length
, 0))
376 if (BE (start
+ range
> length
, 0))
377 range
= length
- start
;
378 else if (BE (start
+ range
< 0, 0))
381 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
382 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
384 /* Compile fastmap if we haven't yet. */
385 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
386 re_compile_fastmap (bufp
);
388 if (BE (bufp
->no_sub
, 0))
391 /* We need at least 1 register. */
394 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
395 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
397 nregs
= regs
->num_regs
;
398 if (BE (nregs
< 1, 0))
400 /* Nothing can be copied to regs. */
406 nregs
= bufp
->re_nsub
+ 1;
407 pmatch
= re_malloc (regmatch_t
, nregs
);
408 if (BE (pmatch
== NULL
, 0))
411 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
412 nregs
, pmatch
, eflags
);
416 /* I hope we needn't fill ther regs with -1's when no match was found. */
417 if (result
!= REG_NOERROR
)
419 else if (regs
!= NULL
)
421 /* If caller wants register contents data back, copy them. */
422 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
423 bufp
->regs_allocated
);
424 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
428 if (BE (rval
== 0, 1))
432 assert (pmatch
[0].rm_so
== start
);
433 rval
= pmatch
[0].rm_eo
- start
;
436 rval
= pmatch
[0].rm_so
;
443 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
444 struct re_registers
*regs
;
446 int nregs
, regs_allocated
;
448 int rval
= REGS_REALLOCATE
;
450 int need_regs
= nregs
+ 1;
451 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
454 /* Have the register data arrays been allocated? */
455 if (regs_allocated
== REGS_UNALLOCATED
)
456 { /* No. So allocate them with malloc. */
457 regs
->start
= re_malloc (regoff_t
, need_regs
);
458 if (BE (regs
->start
== NULL
, 0))
459 return REGS_UNALLOCATED
;
460 regs
->end
= re_malloc (regoff_t
, need_regs
);
461 if (BE (regs
->end
== NULL
, 0))
463 re_free (regs
->start
);
464 return REGS_UNALLOCATED
;
466 regs
->num_regs
= need_regs
;
468 else if (regs_allocated
== REGS_REALLOCATE
)
469 { /* Yes. If we need more elements than were already
470 allocated, reallocate them. If we need fewer, just
472 if (need_regs
> regs
->num_regs
)
474 regs
->start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
475 if (BE (regs
->start
== NULL
, 0))
477 if (regs
->end
!= NULL
)
479 return REGS_UNALLOCATED
;
481 regs
->end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
482 if (BE (regs
->end
== NULL
, 0))
484 re_free (regs
->start
);
485 return REGS_UNALLOCATED
;
487 regs
->num_regs
= need_regs
;
492 assert (regs_allocated
== REGS_FIXED
);
493 /* This function may not be called with REGS_FIXED and nregs too big. */
494 assert (regs
->num_regs
>= nregs
);
499 for (i
= 0; i
< nregs
; ++i
)
501 regs
->start
[i
] = pmatch
[i
].rm_so
;
502 regs
->end
[i
] = pmatch
[i
].rm_eo
;
504 for ( ; i
< regs
->num_regs
; ++i
)
505 regs
->start
[i
] = regs
->end
[i
] = -1;
510 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
511 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
512 this memory for recording register information. STARTS and ENDS
513 must be allocated using the malloc library routine, and must each
514 be at least NUM_REGS * sizeof (regoff_t) bytes long.
516 If NUM_REGS == 0, then subsequent matches should allocate their own
519 Unless this function is called, the first search or match using
520 PATTERN_BUFFER will allocate its own register data, without
521 freeing the old data. */
524 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
525 struct re_pattern_buffer
*bufp
;
526 struct re_registers
*regs
;
528 regoff_t
*starts
, *ends
;
532 bufp
->regs_allocated
= REGS_REALLOCATE
;
533 regs
->num_regs
= num_regs
;
534 regs
->start
= starts
;
539 bufp
->regs_allocated
= REGS_UNALLOCATED
;
541 regs
->start
= regs
->end
= (regoff_t
*) 0;
545 weak_alias (__re_set_registers
, re_set_registers
)
548 /* Entry points compatible with 4.2 BSD regex library. We don't define
549 them unless specifically requested. */
551 #if defined _REGEX_RE_COMP || defined _LIBC
559 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
561 #endif /* _REGEX_RE_COMP */
563 static re_node_set empty_set
;
565 /* Internal entry point. */
567 /* Searches for a compiled pattern PREG in the string STRING, whose
568 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
569 mingings with regexec. START, and RANGE have the same meanings
571 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
572 otherwise return the error code.
573 Note: We assume front end functions already check ranges.
574 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
577 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
581 int length
, start
, range
, stop
, eflags
;
586 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
588 int left_lim
, right_lim
, incr
;
589 int fl_longest_match
, match_first
, match_last
= -1;
590 int fast_translate
, sb
;
591 re_match_context_t mctx
;
592 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
593 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
);
595 /* Check if the DFA haven't been compiled. */
596 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
597 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
598 || dfa
->init_state_begbuf
== NULL
, 0))
601 re_node_set_init_empty (&empty_set
);
602 memset (&mctx
, '\0', sizeof (re_match_context_t
));
604 /* We must check the longest matching, if nmatch > 0. */
605 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
607 err
= re_string_allocate (&input
, string
, length
, dfa
->nodes_len
+ 1,
608 preg
->translate
, preg
->syntax
& RE_ICASE
);
609 if (BE (err
!= REG_NOERROR
, 0))
613 err
= match_ctx_init (&mctx
, eflags
, &input
, dfa
->nbackref
* 2);
614 if (BE (err
!= REG_NOERROR
, 0))
617 /* We will log all the DFA states through which the dfa pass,
618 if nmatch > 1, or this dfa has "multibyte node", which is a
619 back-reference or a node which can accept multibyte character or
620 multi character collating element. */
621 if (nmatch
> 1 || dfa
->has_mb_node
)
623 mctx
.state_log
= re_malloc (re_dfastate_t
*, dfa
->nodes_len
+ 1);
624 if (BE (mctx
.state_log
== NULL
, 0))
631 mctx
.state_log
= NULL
;
634 /* We assume front-end functions already check them. */
635 assert (start
+ range
>= 0 && start
+ range
<= length
);
639 input
.tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
640 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
642 /* Check incrementally whether of not the input string match. */
643 incr
= (range
< 0) ? -1 : 1;
644 left_lim
= (range
< 0) ? start
+ range
: start
;
645 right_lim
= (range
< 0) ? start
: start
+ range
;
646 sb
= MB_CUR_MAX
== 1;
647 fast_translate
= sb
|| !(preg
->syntax
& RE_ICASE
|| preg
->translate
);
651 /* At first get the current byte from input string. */
654 if (BE (fast_translate
, 1))
656 unsigned RE_TRANSLATE_TYPE t
657 = (unsigned RE_TRANSLATE_TYPE
) preg
->translate
;
658 if (BE (range
>= 0, 1))
660 if (BE (t
!= NULL
, 0))
662 while (BE (match_first
< right_lim
, 1)
663 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
668 while (BE (match_first
< right_lim
, 1)
669 && !fastmap
[(unsigned char) string
[match_first
]])
672 if (BE (match_first
== right_lim
, 0))
674 int ch
= match_first
>= length
675 ? 0 : (unsigned char) string
[match_first
];
676 if (!fastmap
[t
? t
[ch
] : ch
])
682 while (match_first
>= left_lim
)
684 int ch
= match_first
>= length
685 ? 0 : (unsigned char) string
[match_first
];
686 if (fastmap
[t
? t
[ch
] : ch
])
690 if (match_first
< left_lim
)
700 /* In this case, we can't determine easily the current byte,
701 since it might be a component byte of a multibyte
702 character. Then we use the constructed buffer
704 /* If MATCH_FIRST is out of the valid range, reconstruct the
706 if (input
.raw_mbs_idx
+ input
.valid_len
<= match_first
707 || match_first
< input
.raw_mbs_idx
)
709 err
= re_string_reconstruct (&input
, match_first
, eflags
,
710 preg
->newline_anchor
);
711 if (BE (err
!= REG_NOERROR
, 0))
714 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
715 Note that MATCH_FIRST must not be smaller than 0. */
716 ch
= ((match_first
>= length
) ? 0
717 : re_string_byte_at (&input
,
718 match_first
- input
.raw_mbs_idx
));
723 while (match_first
>= left_lim
&& match_first
<= right_lim
);
729 /* Reconstruct the buffers so that the matcher can assume that
730 the matching starts from the begining of the buffer. */
731 err
= re_string_reconstruct (&input
, match_first
, eflags
,
732 preg
->newline_anchor
);
733 if (BE (err
!= REG_NOERROR
, 0))
735 #ifdef RE_ENABLE_I18N
736 /* Eliminate it when it is a component of a multibyte character
737 and isn't the head of a multibyte character. */
738 if (sb
|| re_string_first_byte (&input
, 0))
741 /* It seems to be appropriate one, then use the matcher. */
742 /* We assume that the matching starts from 0. */
743 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
744 match_last
= check_matching (preg
, &mctx
, 0, fl_longest_match
);
745 if (match_last
!= -1)
747 if (BE (match_last
== -2, 0))
754 mctx
.match_last
= match_last
;
755 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
757 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
758 mctx
.last_node
= check_halt_state_context (preg
, pstate
,
761 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
764 err
= prune_impossible_nodes (preg
, &mctx
);
765 if (err
== REG_NOERROR
)
767 if (BE (err
!= REG_NOMATCH
, 0))
771 break; /* We found a matching. */
774 match_ctx_clean (&mctx
);
776 /* Update counter. */
778 if (match_first
< left_lim
|| right_lim
< match_first
)
782 /* Set pmatch[] if we need. */
783 if (match_last
!= -1 && nmatch
> 0)
787 /* Initialize registers. */
788 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
789 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
791 /* Set the points where matching start/end. */
793 pmatch
[0].rm_eo
= mctx
.match_last
;
795 if (!preg
->no_sub
&& nmatch
> 1)
797 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
798 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
799 if (BE (err
!= REG_NOERROR
, 0))
803 /* At last, add the offset to the each registers, since we slided
804 the buffers so that We can assume that the matching starts from 0. */
805 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
806 if (pmatch
[reg_idx
].rm_so
!= -1)
808 pmatch
[reg_idx
].rm_so
+= match_first
;
809 pmatch
[reg_idx
].rm_eo
+= match_first
;
812 err
= (match_last
== -1) ? REG_NOMATCH
: REG_NOERROR
;
814 re_free (mctx
.state_log
);
816 match_ctx_free (&mctx
);
817 re_string_destruct (&input
);
822 prune_impossible_nodes (preg
, mctx
)
824 re_match_context_t
*mctx
;
826 int halt_node
, match_last
;
828 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
829 re_dfastate_t
**sifted_states
;
830 re_dfastate_t
**lim_states
= NULL
;
831 re_sift_context_t sctx
;
833 assert (mctx
->state_log
!= NULL
);
835 match_last
= mctx
->match_last
;
836 halt_node
= mctx
->last_node
;
837 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
838 if (BE (sifted_states
== NULL
, 0))
845 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
846 if (BE (lim_states
== NULL
, 0))
853 memset (lim_states
, '\0',
854 sizeof (re_dfastate_t
*) * (match_last
+ 1));
855 match_ctx_clear_flag (mctx
);
856 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
858 ret
= sift_states_backward (preg
, mctx
, &sctx
);
859 re_node_set_free (&sctx
.limits
);
860 if (BE (ret
!= REG_NOERROR
, 0))
862 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
872 } while (!mctx
->state_log
[match_last
]->halt
);
873 halt_node
= check_halt_state_context (preg
,
874 mctx
->state_log
[match_last
],
877 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
879 re_free (lim_states
);
881 if (BE (ret
!= REG_NOERROR
, 0))
886 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
888 ret
= sift_states_backward (preg
, mctx
, &sctx
);
889 re_node_set_free (&sctx
.limits
);
890 if (BE (ret
!= REG_NOERROR
, 0))
893 re_free (mctx
->state_log
);
894 mctx
->state_log
= sifted_states
;
895 sifted_states
= NULL
;
896 mctx
->last_node
= halt_node
;
897 mctx
->match_last
= match_last
;
900 re_free (sifted_states
);
901 re_free (lim_states
);
905 /* Acquire an initial state and return it.
906 We must select appropriate initial state depending on the context,
907 since initial states may have constraints like "\<", "^", etc.. */
909 static re_dfastate_t
*
910 acquire_init_state_context (err
, preg
, mctx
, idx
)
913 const re_match_context_t
*mctx
;
916 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
919 if (dfa
->init_state
->has_constraint
)
921 unsigned int context
;
922 context
= re_string_context_at (mctx
->input
, idx
- 1, mctx
->eflags
,
923 preg
->newline_anchor
);
924 if (IS_WORD_CONTEXT (context
))
925 return dfa
->init_state_word
;
926 else if (IS_ORDINARY_CONTEXT (context
))
927 return dfa
->init_state
;
928 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
929 return dfa
->init_state_begbuf
;
930 else if (IS_NEWLINE_CONTEXT (context
))
931 return dfa
->init_state_nl
;
932 else if (IS_BEGBUF_CONTEXT (context
))
934 /* It is relatively rare case, then calculate on demand. */
935 return re_acquire_state_context (err
, dfa
,
936 dfa
->init_state
->entrance_nodes
,
940 /* Must not happen? */
941 return dfa
->init_state
;
944 return dfa
->init_state
;
947 /* Check whether the regular expression match input string INPUT or not,
948 and return the index where the matching end, return -1 if not match,
949 or return -2 in case of an error.
950 FL_SEARCH means we must search where the matching starts,
951 FL_LONGEST_MATCH means we want the POSIX longest matching.
952 Note that the matcher assume that the maching starts from the current
953 index of the buffer. */
956 check_matching (preg
, mctx
, fl_search
, fl_longest_match
)
958 re_match_context_t
*mctx
;
959 int fl_search
, fl_longest_match
;
961 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
965 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
966 re_dfastate_t
*cur_state
;
968 cur_state
= acquire_init_state_context (&err
, preg
, mctx
, cur_str_idx
);
969 /* An initial state must not be NULL(invalid state). */
970 if (BE (cur_state
== NULL
, 0))
972 if (mctx
->state_log
!= NULL
)
973 mctx
->state_log
[cur_str_idx
] = cur_state
;
975 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
976 later. E.g. Processing back references. */
979 err
= check_subexp_matching_top (dfa
, mctx
, &cur_state
->nodes
, 0);
980 if (BE (err
!= REG_NOERROR
, 0))
984 if (cur_state
->has_backref
)
986 err
= transit_state_bkref (preg
, &cur_state
->nodes
, mctx
);
987 if (BE (err
!= REG_NOERROR
, 0))
991 /* If the RE accepts NULL string. */
994 if (!cur_state
->has_constraint
995 || check_halt_state_context (preg
, cur_state
, mctx
, cur_str_idx
))
997 if (!fl_longest_match
)
1001 match_last
= cur_str_idx
;
1007 while (!re_string_eoi (mctx
->input
))
1009 cur_state
= transit_state (&err
, preg
, mctx
, cur_state
,
1010 fl_search
&& !match
);
1011 if (cur_state
== NULL
) /* Reached at the invalid state or an error. */
1013 cur_str_idx
= re_string_cur_idx (mctx
->input
);
1014 if (BE (err
!= REG_NOERROR
, 0))
1016 if (fl_search
&& !match
)
1018 /* Restart from initial state, since we are searching
1019 the point from where matching start. */
1020 #ifdef RE_ENABLE_I18N
1022 || re_string_first_byte (mctx
->input
, cur_str_idx
))
1023 #endif /* RE_ENABLE_I18N */
1024 cur_state
= acquire_init_state_context (&err
, preg
, mctx
,
1026 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
1028 if (mctx
->state_log
!= NULL
)
1029 mctx
->state_log
[cur_str_idx
] = cur_state
;
1031 else if (!fl_longest_match
&& match
)
1033 else /* (fl_longest_match && match) || (!fl_search && !match) */
1035 if (mctx
->state_log
== NULL
)
1039 int max
= mctx
->state_log_top
;
1040 for (; cur_str_idx
<= max
; ++cur_str_idx
)
1041 if (mctx
->state_log
[cur_str_idx
] != NULL
)
1043 if (cur_str_idx
> max
)
1049 if (cur_state
!= NULL
&& cur_state
->halt
)
1051 /* Reached at a halt state.
1052 Check the halt state can satisfy the current context. */
1053 if (!cur_state
->has_constraint
1054 || check_halt_state_context (preg
, cur_state
, mctx
,
1055 re_string_cur_idx (mctx
->input
)))
1057 /* We found an appropriate halt state. */
1058 match_last
= re_string_cur_idx (mctx
->input
);
1060 if (!fl_longest_match
)
1068 /* Check NODE match the current context. */
1070 static int check_halt_node_context (dfa
, node
, context
)
1071 const re_dfa_t
*dfa
;
1073 unsigned int context
;
1075 re_token_type_t type
= dfa
->nodes
[node
].type
;
1076 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1077 if (type
!= END_OF_RE
)
1081 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1086 /* Check the halt state STATE match the current context.
1087 Return 0 if not match, if the node, STATE has, is a halt node and
1088 match the context, return the node. */
1091 check_halt_state_context (preg
, state
, mctx
, idx
)
1092 const regex_t
*preg
;
1093 const re_dfastate_t
*state
;
1094 const re_match_context_t
*mctx
;
1097 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1099 unsigned int context
;
1101 assert (state
->halt
);
1103 context
= re_string_context_at (mctx
->input
, idx
, mctx
->eflags
,
1104 preg
->newline_anchor
);
1105 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1106 if (check_halt_node_context (dfa
, state
->nodes
.elems
[i
], context
))
1107 return state
->nodes
.elems
[i
];
1111 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1112 corresponding to the DFA).
1113 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1117 proceed_next_node (preg
, nregs
, regs
, mctx
, pidx
, node
, eps_via_nodes
, fs
)
1118 const regex_t
*preg
;
1120 const re_match_context_t
*mctx
;
1121 int nregs
, *pidx
, node
;
1122 re_node_set
*eps_via_nodes
;
1123 struct re_fail_stack_t
*fs
;
1125 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1126 int i
, err
, dest_node
;
1128 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1130 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1131 int ndest
, dest_nodes
[2];
1132 err
= re_node_set_insert (eps_via_nodes
, node
);
1133 if (BE (err
< 0, 0))
1135 /* Pick up valid destinations. */
1136 for (ndest
= 0, i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1138 int candidate
= dfa
->edests
[node
].elems
[i
];
1139 if (!re_node_set_contains (cur_nodes
, candidate
))
1141 dest_nodes
[0] = (ndest
== 0) ? candidate
: dest_nodes
[0];
1142 dest_nodes
[1] = (ndest
== 1) ? candidate
: dest_nodes
[1];
1146 return ndest
== 0 ? -1 : (ndest
== 1 ? dest_nodes
[0] : 0);
1147 /* In order to avoid infinite loop like "(a*)*". */
1148 if (re_node_set_contains (eps_via_nodes
, dest_nodes
[0]))
1149 return dest_nodes
[1];
1151 push_fail_stack (fs
, *pidx
, dest_nodes
, nregs
, regs
, eps_via_nodes
);
1152 return dest_nodes
[0];
1157 re_token_type_t type
= dfa
->nodes
[node
].type
;
1159 #ifdef RE_ENABLE_I18N
1160 if (ACCEPT_MB_NODE (type
))
1161 naccepted
= check_node_accept_bytes (preg
, node
, mctx
->input
, *pidx
);
1163 #endif /* RE_ENABLE_I18N */
1164 if (type
== OP_BACK_REF
)
1166 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
;
1167 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1170 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1174 char *buf
= (char *) re_string_get_buffer (mctx
->input
);
1175 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1183 err
= re_node_set_insert (eps_via_nodes
, node
);
1184 if (BE (err
< 0, 0))
1186 dest_node
= dfa
->edests
[node
].elems
[0];
1187 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1194 || check_node_accept (preg
, dfa
->nodes
+ node
, mctx
, *pidx
))
1196 dest_node
= dfa
->nexts
[node
];
1197 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1198 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1199 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1202 re_node_set_empty (eps_via_nodes
);
1209 static reg_errcode_t
1210 push_fail_stack (fs
, str_idx
, dests
, nregs
, regs
, eps_via_nodes
)
1211 struct re_fail_stack_t
*fs
;
1212 int str_idx
, *dests
, nregs
;
1214 re_node_set
*eps_via_nodes
;
1217 int num
= fs
->num
++;
1218 if (fs
->num
== fs
->alloc
)
1220 struct re_fail_stack_ent_t
*new_array
;
1222 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1224 if (new_array
== NULL
)
1226 fs
->stack
= new_array
;
1228 fs
->stack
[num
].idx
= str_idx
;
1229 fs
->stack
[num
].node
= dests
[1];
1230 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1231 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1232 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1237 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1238 struct re_fail_stack_t
*fs
;
1241 re_node_set
*eps_via_nodes
;
1243 int num
= --fs
->num
;
1245 *pidx
= fs
->stack
[num
].idx
;
1246 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1247 re_node_set_free (eps_via_nodes
);
1248 re_free (fs
->stack
[num
].regs
);
1249 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1250 return fs
->stack
[num
].node
;
1253 /* Set the positions where the subexpressions are starts/ends to registers
1255 Note: We assume that pmatch[0] is already set, and
1256 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1258 static reg_errcode_t
1259 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1260 const regex_t
*preg
;
1261 const re_match_context_t
*mctx
;
1266 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1267 int idx
, cur_node
, real_nmatch
;
1268 re_node_set eps_via_nodes
;
1269 struct re_fail_stack_t
*fs
;
1270 struct re_fail_stack_t fs_body
= {0, 2, NULL
};
1272 assert (nmatch
> 1);
1273 assert (mctx
->state_log
!= NULL
);
1278 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1282 cur_node
= dfa
->init_node
;
1283 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1284 re_node_set_init_empty (&eps_via_nodes
);
1285 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1287 update_regs (dfa
, pmatch
, cur_node
, idx
, real_nmatch
);
1288 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1293 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1294 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1296 if (reg_idx
== nmatch
)
1298 re_node_set_free (&eps_via_nodes
);
1299 return free_fail_stack_return (fs
);
1301 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1306 re_node_set_free (&eps_via_nodes
);
1311 /* Proceed to next node. */
1312 cur_node
= proceed_next_node (preg
, nmatch
, pmatch
, mctx
, &idx
, cur_node
,
1313 &eps_via_nodes
, fs
);
1315 if (BE (cur_node
< 0, 0))
1320 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1324 re_node_set_free (&eps_via_nodes
);
1329 re_node_set_free (&eps_via_nodes
);
1330 return free_fail_stack_return (fs
);
1333 static reg_errcode_t
1334 free_fail_stack_return (fs
)
1335 struct re_fail_stack_t
*fs
;
1340 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1342 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1343 re_free (fs
->stack
[fs_idx
].regs
);
1345 re_free (fs
->stack
);
1351 update_regs (dfa
, pmatch
, cur_node
, cur_idx
, nmatch
)
1354 int cur_node
, cur_idx
, nmatch
;
1356 int type
= dfa
->nodes
[cur_node
].type
;
1358 if (type
!= OP_OPEN_SUBEXP
&& type
!= OP_CLOSE_SUBEXP
)
1360 reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1361 if (reg_num
>= nmatch
)
1363 if (type
== OP_OPEN_SUBEXP
)
1365 /* We are at the first node of this sub expression. */
1366 pmatch
[reg_num
].rm_so
= cur_idx
;
1367 pmatch
[reg_num
].rm_eo
= -1;
1369 else if (type
== OP_CLOSE_SUBEXP
)
1370 /* We are at the first node of this sub expression. */
1371 pmatch
[reg_num
].rm_eo
= cur_idx
;
1374 #define NUMBER_OF_STATE 1
1376 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1377 and sift the nodes in each states according to the following rules.
1378 Updated state_log will be wrote to STATE_LOG.
1380 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1381 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1382 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1383 the LAST_NODE, we throw away the node `a'.
1384 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1385 string `s' and transit to `b':
1386 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1388 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1389 throwed away, we throw away the node `a'.
1390 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1391 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1393 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1394 we throw away the node `a'. */
1396 #define STATE_NODE_CONTAINS(state,node) \
1397 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1399 static reg_errcode_t
1400 sift_states_backward (preg
, mctx
, sctx
)
1401 const regex_t
*preg
;
1402 re_match_context_t
*mctx
;
1403 re_sift_context_t
*sctx
;
1406 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1408 int str_idx
= sctx
->last_str_idx
;
1409 re_node_set cur_dest
;
1410 re_node_set
*cur_src
; /* Points the state_log[str_idx]->nodes */
1413 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1415 cur_src
= &mctx
->state_log
[str_idx
]->nodes
;
1417 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1418 transit to the last_node and the last_node itself. */
1419 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1420 if (BE (err
!= REG_NOERROR
, 0))
1422 err
= update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, &cur_dest
);
1423 if (BE (err
!= REG_NOERROR
, 0))
1426 /* Then check each states in the state_log. */
1430 /* Update counters. */
1431 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1432 if (null_cnt
> mctx
->max_mb_elem_len
)
1434 memset (sctx
->sifted_states
, '\0',
1435 sizeof (re_dfastate_t
*) * str_idx
);
1436 re_node_set_free (&cur_dest
);
1439 re_node_set_empty (&cur_dest
);
1441 cur_src
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1442 : &mctx
->state_log
[str_idx
]->nodes
);
1444 /* Then build the next sifted state.
1445 We build the next sifted state on `cur_dest', and update
1446 `sifted_states[str_idx]' with `cur_dest'.
1448 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1449 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1450 for (i
= 0; i
< cur_src
->nelem
; i
++)
1452 int prev_node
= cur_src
->elems
[i
];
1454 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1456 if (IS_EPSILON_NODE(type
))
1458 #ifdef RE_ENABLE_I18N
1459 /* If the node may accept `multi byte'. */
1460 if (ACCEPT_MB_NODE (type
))
1461 naccepted
= sift_states_iter_mb (preg
, mctx
, sctx
, prev_node
,
1462 str_idx
, sctx
->last_str_idx
);
1464 #endif /* RE_ENABLE_I18N */
1465 /* We don't check backreferences here.
1466 See update_cur_sifted_state(). */
1469 && check_node_accept (preg
, dfa
->nodes
+ prev_node
, mctx
,
1471 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1472 dfa
->nexts
[prev_node
]))
1478 if (sctx
->limits
.nelem
)
1480 int to_idx
= str_idx
+ naccepted
;
1481 if (check_dst_limits (dfa
, &sctx
->limits
, mctx
,
1482 dfa
->nexts
[prev_node
], to_idx
,
1483 prev_node
, str_idx
))
1486 ret
= re_node_set_insert (&cur_dest
, prev_node
);
1487 if (BE (ret
== -1, 0))
1494 /* Add all the nodes which satisfy the following conditions:
1495 - It can epsilon transit to a node in CUR_DEST.
1497 And update state_log. */
1498 err
= update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, &cur_dest
);
1499 if (BE (err
!= REG_NOERROR
, 0))
1504 re_node_set_free (&cur_dest
);
1508 /* Helper functions. */
1510 static reg_errcode_t
1511 clean_state_log_if_need (mctx
, next_state_log_idx
)
1512 re_match_context_t
*mctx
;
1513 int next_state_log_idx
;
1515 int top
= mctx
->state_log_top
;
1517 if (next_state_log_idx
>= mctx
->input
->bufs_len
1518 || (next_state_log_idx
>= mctx
->input
->valid_len
1519 && mctx
->input
->valid_len
< mctx
->input
->len
))
1522 err
= extend_buffers (mctx
);
1523 if (BE (err
!= REG_NOERROR
, 0))
1527 if (top
< next_state_log_idx
)
1529 memset (mctx
->state_log
+ top
+ 1, '\0',
1530 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1531 mctx
->state_log_top
= next_state_log_idx
;
1536 static reg_errcode_t
1537 merge_state_array (dfa
, dst
, src
, num
)
1539 re_dfastate_t
**dst
;
1540 re_dfastate_t
**src
;
1545 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1547 if (dst
[st_idx
] == NULL
)
1548 dst
[st_idx
] = src
[st_idx
];
1549 else if (src
[st_idx
] != NULL
)
1551 re_node_set merged_set
;
1552 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1553 &src
[st_idx
]->nodes
);
1554 if (BE (err
!= REG_NOERROR
, 0))
1556 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1557 re_node_set_free (&merged_set
);
1558 if (BE (err
!= REG_NOERROR
, 0))
1565 static reg_errcode_t
1566 update_cur_sifted_state (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1567 const regex_t
*preg
;
1568 re_match_context_t
*mctx
;
1569 re_sift_context_t
*sctx
;
1571 re_node_set
*dest_nodes
;
1574 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1575 const re_node_set
*candidates
;
1576 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1577 : &mctx
->state_log
[str_idx
]->nodes
);
1579 /* At first, add the nodes which can epsilon transit to a node in
1581 if (dest_nodes
->nelem
)
1583 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1584 if (BE (err
!= REG_NOERROR
, 0))
1588 /* Then, check the limitations in the current sift_context. */
1589 if (dest_nodes
->nelem
&& sctx
->limits
.nelem
)
1591 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1592 mctx
->bkref_ents
, str_idx
);
1593 if (BE (err
!= REG_NOERROR
, 0))
1597 /* Update state_log. */
1598 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1599 if (BE (sctx
->sifted_states
[str_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1602 if ((mctx
->state_log
[str_idx
] != NULL
1603 && mctx
->state_log
[str_idx
]->has_backref
))
1605 err
= sift_states_bkref (preg
, mctx
, sctx
, str_idx
, dest_nodes
);
1606 if (BE (err
!= REG_NOERROR
, 0))
1612 static reg_errcode_t
1613 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1615 re_node_set
*dest_nodes
;
1616 const re_node_set
*candidates
;
1620 re_node_set src_copy
;
1622 err
= re_node_set_init_copy (&src_copy
, dest_nodes
);
1623 if (BE (err
!= REG_NOERROR
, 0))
1625 for (src_idx
= 0; src_idx
< src_copy
.nelem
; ++src_idx
)
1627 err
= re_node_set_add_intersect (dest_nodes
, candidates
,
1629 + src_copy
.elems
[src_idx
]);
1630 if (BE (err
!= REG_NOERROR
, 0))
1632 re_node_set_free (&src_copy
);
1636 re_node_set_free (&src_copy
);
1640 static reg_errcode_t
1641 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1644 re_node_set
*dest_nodes
;
1645 const re_node_set
*candidates
;
1649 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1650 re_node_set except_nodes
;
1651 re_node_set_init_empty (&except_nodes
);
1652 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1654 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1655 if (cur_node
== node
)
1657 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1659 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1660 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1661 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1662 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1663 && re_node_set_contains (dest_nodes
, edst1
))
1665 && !re_node_set_contains (inv_eclosure
, edst2
)
1666 && re_node_set_contains (dest_nodes
, edst2
)))
1668 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1669 dfa
->inveclosures
+ cur_node
);
1670 if (BE (err
!= REG_NOERROR
, 0))
1672 re_node_set_free (&except_nodes
);
1678 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1680 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1681 if (!re_node_set_contains (&except_nodes
, cur_node
))
1683 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1684 re_node_set_remove_at (dest_nodes
, idx
);
1687 re_node_set_free (&except_nodes
);
1692 check_dst_limits (dfa
, limits
, mctx
, dst_node
, dst_idx
, src_node
, src_idx
)
1694 re_node_set
*limits
;
1695 re_match_context_t
*mctx
;
1696 int dst_node
, dst_idx
, src_node
, src_idx
;
1698 int lim_idx
, src_pos
, dst_pos
;
1700 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1703 struct re_backref_cache_entry
*ent
;
1704 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1705 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1707 dst_pos
= check_dst_limits_calc_pos (dfa
, mctx
, limits
->elems
[lim_idx
],
1708 dfa
->eclosures
+ dst_node
,
1709 subexp_idx
, dst_node
, dst_idx
);
1710 src_pos
= check_dst_limits_calc_pos (dfa
, mctx
, limits
->elems
[lim_idx
],
1711 dfa
->eclosures
+ src_node
,
1712 subexp_idx
, src_node
, src_idx
);
1715 <src> <dst> ( <subexp> )
1716 ( <subexp> ) <src> <dst>
1717 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1718 if (src_pos
== dst_pos
)
1719 continue; /* This is unrelated limitation. */
1727 check_dst_limits_calc_pos (dfa
, mctx
, limit
, eclosures
, subexp_idx
, node
,
1730 re_match_context_t
*mctx
;
1731 re_node_set
*eclosures
;
1732 int limit
, subexp_idx
, node
, str_idx
;
1734 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1735 int pos
= (str_idx
< lim
->subexp_from
? -1
1736 : (lim
->subexp_to
< str_idx
? 1 : 0));
1738 && (str_idx
== lim
->subexp_from
|| str_idx
== lim
->subexp_to
))
1741 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1743 int node
= eclosures
->elems
[node_idx
];
1744 re_token_type_t type
= dfa
->nodes
[node
].type
;
1745 if (type
== OP_BACK_REF
)
1747 int bi
= search_cur_bkref_entry (mctx
, str_idx
);
1748 for (; bi
< mctx
->nbkref_ents
; ++bi
)
1750 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bi
;
1751 if (ent
->str_idx
> str_idx
)
1753 if (ent
->node
== node
&& ent
->subexp_from
== ent
->subexp_to
)
1756 dst
= dfa
->edests
[node
].elems
[0];
1757 cpos
= check_dst_limits_calc_pos (dfa
, mctx
, limit
,
1758 dfa
->eclosures
+ dst
,
1761 if ((str_idx
== lim
->subexp_from
&& cpos
== -1)
1762 || (str_idx
== lim
->subexp_to
&& cpos
== 0))
1767 if (type
== OP_OPEN_SUBEXP
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
1768 && str_idx
== lim
->subexp_from
)
1773 if (type
== OP_CLOSE_SUBEXP
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
1774 && str_idx
== lim
->subexp_to
)
1777 if (node_idx
== eclosures
->nelem
&& str_idx
== lim
->subexp_to
)
1783 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1784 which are against limitations from DEST_NODES. */
1786 static reg_errcode_t
1787 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
1789 re_node_set
*dest_nodes
;
1790 const re_node_set
*candidates
;
1791 re_node_set
*limits
;
1792 struct re_backref_cache_entry
*bkref_ents
;
1796 int node_idx
, lim_idx
;
1798 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1801 struct re_backref_cache_entry
*ent
;
1802 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
1804 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
1805 continue; /* This is unrelated limitation. */
1807 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1808 if (ent
->subexp_to
== str_idx
)
1812 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1814 int node
= dest_nodes
->elems
[node_idx
];
1815 re_token_type_t type
= dfa
->nodes
[node
].type
;
1816 if (type
== OP_OPEN_SUBEXP
1817 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1819 else if (type
== OP_CLOSE_SUBEXP
1820 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1824 /* Check the limitation of the open subexpression. */
1825 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1828 err
= sub_epsilon_src_nodes(dfa
, ops_node
, dest_nodes
,
1830 if (BE (err
!= REG_NOERROR
, 0))
1833 /* Check the limitation of the close subexpression. */
1834 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1836 int node
= dest_nodes
->elems
[node_idx
];
1837 if (!re_node_set_contains (dfa
->inveclosures
+ node
, cls_node
)
1838 && !re_node_set_contains (dfa
->eclosures
+ node
, cls_node
))
1840 /* It is against this limitation.
1841 Remove it form the current sifted state. */
1842 err
= sub_epsilon_src_nodes(dfa
, node
, dest_nodes
,
1844 if (BE (err
!= REG_NOERROR
, 0))
1850 else /* (ent->subexp_to != str_idx) */
1852 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1854 int node
= dest_nodes
->elems
[node_idx
];
1855 re_token_type_t type
= dfa
->nodes
[node
].type
;
1856 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
1858 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
1860 if ((type
== OP_CLOSE_SUBEXP
&& ent
->subexp_to
!= str_idx
)
1861 || (type
== OP_OPEN_SUBEXP
))
1863 /* It is against this limitation.
1864 Remove it form the current sifted state. */
1865 err
= sub_epsilon_src_nodes(dfa
, node
, dest_nodes
,
1867 if (BE (err
!= REG_NOERROR
, 0))
1877 static reg_errcode_t
1878 sift_states_bkref (preg
, mctx
, sctx
, str_idx
, dest_nodes
)
1879 const regex_t
*preg
;
1880 re_match_context_t
*mctx
;
1881 re_sift_context_t
*sctx
;
1883 re_node_set
*dest_nodes
;
1886 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1888 re_sift_context_t local_sctx
;
1889 const re_node_set
*candidates
;
1890 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1891 : &mctx
->state_log
[str_idx
]->nodes
);
1892 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
1894 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
1896 int cur_bkref_idx
= re_string_cur_idx (mctx
->input
);
1897 re_token_type_t type
;
1898 node
= candidates
->elems
[node_idx
];
1899 type
= dfa
->nodes
[node
].type
;
1900 if (node
== sctx
->cur_bkref
&& str_idx
== cur_bkref_idx
)
1902 /* Avoid infinite loop for the REs like "()\1+". */
1903 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
1905 if (type
== OP_BACK_REF
)
1907 int enabled_idx
= search_cur_bkref_entry (mctx
, str_idx
);
1908 for (; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
1910 int disabled_idx
, subexp_len
, to_idx
, dst_node
;
1911 struct re_backref_cache_entry
*entry
;
1912 entry
= mctx
->bkref_ents
+ enabled_idx
;
1913 if (entry
->str_idx
> str_idx
)
1915 if (entry
->node
!= node
)
1917 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
1918 to_idx
= str_idx
+ subexp_len
;
1919 dst_node
= (subexp_len
? dfa
->nexts
[node
]
1920 : dfa
->edests
[node
].elems
[0]);
1922 if (to_idx
> sctx
->last_str_idx
1923 || sctx
->sifted_states
[to_idx
] == NULL
1924 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
],
1926 || check_dst_limits (dfa
, &sctx
->limits
, mctx
, node
,
1927 str_idx
, dst_node
, to_idx
))
1930 re_dfastate_t
*cur_state
;
1932 for (disabled_idx
= enabled_idx
+ 1;
1933 disabled_idx
< mctx
->nbkref_ents
; ++disabled_idx
)
1935 struct re_backref_cache_entry
*entry2
;
1936 entry2
= mctx
->bkref_ents
+ disabled_idx
;
1937 if (entry2
->str_idx
> str_idx
)
1939 entry2
->flag
= (entry2
->node
== node
) ? 1 : entry2
->flag
;
1942 if (local_sctx
.sifted_states
== NULL
)
1945 err
= re_node_set_init_copy (&local_sctx
.limits
,
1947 if (BE (err
!= REG_NOERROR
, 0))
1950 local_sctx
.last_node
= node
;
1951 local_sctx
.last_str_idx
= str_idx
;
1952 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
1953 if (BE (err
< 0, 0))
1958 cur_state
= local_sctx
.sifted_states
[str_idx
];
1959 err
= sift_states_backward (preg
, mctx
, &local_sctx
);
1960 if (BE (err
!= REG_NOERROR
, 0))
1962 if (sctx
->limited_states
!= NULL
)
1964 err
= merge_state_array (dfa
, sctx
->limited_states
,
1965 local_sctx
.sifted_states
,
1967 if (BE (err
!= REG_NOERROR
, 0))
1970 local_sctx
.sifted_states
[str_idx
] = cur_state
;
1971 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
1972 /* We must not use the variable entry here, since
1973 mctx->bkref_ents might be realloced. */
1974 mctx
->bkref_ents
[enabled_idx
].flag
= 1;
1977 enabled_idx
= search_cur_bkref_entry (mctx
, str_idx
);
1978 for (; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
1980 struct re_backref_cache_entry
*entry
;
1981 entry
= mctx
->bkref_ents
+ enabled_idx
;
1982 if (entry
->str_idx
> str_idx
)
1984 if (entry
->node
== node
)
1991 if (local_sctx
.sifted_states
!= NULL
)
1993 re_node_set_free (&local_sctx
.limits
);
2000 #ifdef RE_ENABLE_I18N
2002 sift_states_iter_mb (preg
, mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2003 const regex_t
*preg
;
2004 const re_match_context_t
*mctx
;
2005 re_sift_context_t
*sctx
;
2006 int node_idx
, str_idx
, max_str_idx
;
2008 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2010 /* Check the node can accept `multi byte'. */
2011 naccepted
= check_node_accept_bytes (preg
, node_idx
, mctx
->input
, str_idx
);
2012 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2013 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2014 dfa
->nexts
[node_idx
]))
2015 /* The node can't accept the `multi byte', or the
2016 destination was already throwed away, then the node
2017 could't accept the current input `multi byte'. */
2019 /* Otherwise, it is sure that the node could accept
2020 `naccepted' bytes input. */
2023 #endif /* RE_ENABLE_I18N */
2026 /* Functions for state transition. */
2028 /* Return the next state to which the current state STATE will transit by
2029 accepting the current input byte, and update STATE_LOG if necessary.
2030 If STATE can accept a multibyte char/collating element/back reference
2031 update the destination of STATE_LOG. */
2033 static re_dfastate_t
*
2034 transit_state (err
, preg
, mctx
, state
, fl_search
)
2036 const regex_t
*preg
;
2037 re_match_context_t
*mctx
;
2038 re_dfastate_t
*state
;
2041 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2042 re_dfastate_t
**trtable
, *next_state
;
2046 if (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->bufs_len
2047 || (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->valid_len
2048 && mctx
->input
->valid_len
< mctx
->input
->len
))
2050 *err
= extend_buffers (mctx
);
2051 if (BE (*err
!= REG_NOERROR
, 0))
2059 re_string_skip_bytes (mctx
->input
, 1);
2063 #ifdef RE_ENABLE_I18N
2064 /* If the current state can accept multibyte. */
2065 if (state
->accept_mb
)
2067 *err
= transit_state_mb (preg
, state
, mctx
);
2068 if (BE (*err
!= REG_NOERROR
, 0))
2071 #endif /* RE_ENABLE_I18N */
2073 /* Then decide the next state with the single byte. */
2076 /* Use transition table */
2077 ch
= re_string_fetch_byte (mctx
->input
);
2078 trtable
= fl_search
? state
->trtable_search
: state
->trtable
;
2079 if (trtable
== NULL
)
2081 trtable
= build_trtable (preg
, state
, fl_search
);
2083 state
->trtable_search
= trtable
;
2085 state
->trtable
= trtable
;
2087 next_state
= trtable
[ch
];
2091 /* don't use transition table */
2092 next_state
= transit_state_sb (err
, preg
, state
, fl_search
, mctx
);
2093 if (BE (next_state
== NULL
&& err
!= REG_NOERROR
, 0))
2098 cur_idx
= re_string_cur_idx (mctx
->input
);
2099 /* Update the state_log if we need. */
2100 if (mctx
->state_log
!= NULL
)
2102 if (cur_idx
> mctx
->state_log_top
)
2104 mctx
->state_log
[cur_idx
] = next_state
;
2105 mctx
->state_log_top
= cur_idx
;
2107 else if (mctx
->state_log
[cur_idx
] == 0)
2109 mctx
->state_log
[cur_idx
] = next_state
;
2113 re_dfastate_t
*pstate
;
2114 unsigned int context
;
2115 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2116 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2117 the destination of a multibyte char/collating element/
2118 back reference. Then the next state is the union set of
2119 these destinations and the results of the transition table. */
2120 pstate
= mctx
->state_log
[cur_idx
];
2121 log_nodes
= pstate
->entrance_nodes
;
2122 if (next_state
!= NULL
)
2124 table_nodes
= next_state
->entrance_nodes
;
2125 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2127 if (BE (*err
!= REG_NOERROR
, 0))
2131 next_nodes
= *log_nodes
;
2132 /* Note: We already add the nodes of the initial state,
2133 then we don't need to add them here. */
2135 context
= re_string_context_at (mctx
->input
,
2136 re_string_cur_idx (mctx
->input
) - 1,
2137 mctx
->eflags
, preg
->newline_anchor
);
2138 next_state
= mctx
->state_log
[cur_idx
]
2139 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2140 /* We don't need to check errors here, since the return value of
2141 this function is next_state and ERR is already set. */
2143 if (table_nodes
!= NULL
)
2144 re_node_set_free (&next_nodes
);
2148 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2149 later. We must check them here, since the back references in the
2150 next state might use them. */
2151 if (dfa
->nbackref
&& next_state
/* && fl_process_bkref */)
2153 *err
= check_subexp_matching_top (dfa
, mctx
, &next_state
->nodes
,
2155 if (BE (*err
!= REG_NOERROR
, 0))
2159 /* If the next state has back references. */
2160 if (next_state
!= NULL
&& next_state
->has_backref
)
2162 *err
= transit_state_bkref (preg
, &next_state
->nodes
, mctx
);
2163 if (BE (*err
!= REG_NOERROR
, 0))
2165 next_state
= mctx
->state_log
[cur_idx
];
2170 /* Helper functions for transit_state. */
2172 /* From the node set CUR_NODES, pick up the nodes whose types are
2173 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2174 expression. And register them to use them later for evaluating the
2175 correspoding back references. */
2177 static reg_errcode_t
2178 check_subexp_matching_top (dfa
, mctx
, cur_nodes
, str_idx
)
2180 re_match_context_t
*mctx
;
2181 re_node_set
*cur_nodes
;
2187 /* TODO: This isn't efficient.
2188 Because there might be more than one nodes whose types are
2189 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2192 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2194 int node
= cur_nodes
->elems
[node_idx
];
2195 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2196 && dfa
->nodes
[node
].opr
.idx
< (8 * sizeof (dfa
->used_bkref_map
))
2197 && dfa
->used_bkref_map
& (1 << dfa
->nodes
[node
].opr
.idx
))
2199 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2200 if (BE (err
!= REG_NOERROR
, 0))
2207 /* Return the next state to which the current state STATE will transit by
2208 accepting the current input byte. */
2210 static re_dfastate_t
*
2211 transit_state_sb (err
, preg
, state
, fl_search
, mctx
)
2213 const regex_t
*preg
;
2214 re_dfastate_t
*state
;
2216 re_match_context_t
*mctx
;
2218 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2219 re_node_set next_nodes
;
2220 re_dfastate_t
*next_state
;
2221 int node_cnt
, cur_str_idx
= re_string_cur_idx (mctx
->input
);
2222 unsigned int context
;
2224 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2225 if (BE (*err
!= REG_NOERROR
, 0))
2227 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2229 int cur_node
= state
->nodes
.elems
[node_cnt
];
2230 if (check_node_accept (preg
, dfa
->nodes
+ cur_node
, mctx
, cur_str_idx
))
2232 *err
= re_node_set_merge (&next_nodes
,
2233 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2234 if (BE (*err
!= REG_NOERROR
, 0))
2236 re_node_set_free (&next_nodes
);
2243 #ifdef RE_ENABLE_I18N
2244 int not_initial
= 0;
2246 for (node_cnt
= 0; node_cnt
< next_nodes
.nelem
; ++node_cnt
)
2247 if (dfa
->nodes
[next_nodes
.elems
[node_cnt
]].type
== CHARACTER
)
2249 not_initial
= dfa
->nodes
[next_nodes
.elems
[node_cnt
]].mb_partial
;
2255 *err
= re_node_set_merge (&next_nodes
,
2256 dfa
->init_state
->entrance_nodes
);
2257 if (BE (*err
!= REG_NOERROR
, 0))
2259 re_node_set_free (&next_nodes
);
2264 context
= re_string_context_at (mctx
->input
, cur_str_idx
, mctx
->eflags
,
2265 preg
->newline_anchor
);
2266 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2267 /* We don't need to check errors here, since the return value of
2268 this function is next_state and ERR is already set. */
2270 re_node_set_free (&next_nodes
);
2271 re_string_skip_bytes (mctx
->input
, 1);
2275 #ifdef RE_ENABLE_I18N
2276 static reg_errcode_t
2277 transit_state_mb (preg
, pstate
, mctx
)
2278 const regex_t
*preg
;
2279 re_dfastate_t
*pstate
;
2280 re_match_context_t
*mctx
;
2283 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2286 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2288 re_node_set dest_nodes
, *new_nodes
;
2289 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2290 int naccepted
= 0, dest_idx
;
2291 unsigned int context
;
2292 re_dfastate_t
*dest_state
;
2294 if (dfa
->nodes
[cur_node_idx
].constraint
)
2296 context
= re_string_context_at (mctx
->input
,
2297 re_string_cur_idx (mctx
->input
),
2298 mctx
->eflags
, preg
->newline_anchor
);
2299 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2304 /* How many bytes the node can accepts? */
2305 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
2306 naccepted
= check_node_accept_bytes (preg
, cur_node_idx
, mctx
->input
,
2307 re_string_cur_idx (mctx
->input
));
2311 /* The node can accepts `naccepted' bytes. */
2312 dest_idx
= re_string_cur_idx (mctx
->input
) + naccepted
;
2313 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2314 : mctx
->max_mb_elem_len
);
2315 err
= clean_state_log_if_need (mctx
, dest_idx
);
2316 if (BE (err
!= REG_NOERROR
, 0))
2319 assert (dfa
->nexts
[cur_node_idx
] != -1);
2321 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2322 then we use pstate->nodes.elems[i] instead. */
2323 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
2325 dest_state
= mctx
->state_log
[dest_idx
];
2326 if (dest_state
== NULL
)
2327 dest_nodes
= *new_nodes
;
2330 err
= re_node_set_init_union (&dest_nodes
,
2331 dest_state
->entrance_nodes
, new_nodes
);
2332 if (BE (err
!= REG_NOERROR
, 0))
2335 context
= re_string_context_at (mctx
->input
, dest_idx
- 1, mctx
->eflags
,
2336 preg
->newline_anchor
);
2337 mctx
->state_log
[dest_idx
]
2338 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2339 if (dest_state
!= NULL
)
2340 re_node_set_free (&dest_nodes
);
2341 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2346 #endif /* RE_ENABLE_I18N */
2348 static reg_errcode_t
2349 transit_state_bkref (preg
, nodes
, mctx
)
2350 const regex_t
*preg
;
2352 re_match_context_t
*mctx
;
2355 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2357 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
2359 for (i
= 0; i
< nodes
->nelem
; ++i
)
2361 int dest_str_idx
, prev_nelem
, bkc_idx
;
2362 int node_idx
= nodes
->elems
[i
];
2363 unsigned int context
;
2364 re_token_t
*node
= dfa
->nodes
+ node_idx
;
2365 re_node_set
*new_dest_nodes
;
2367 /* Check whether `node' is a backreference or not. */
2368 if (node
->type
!= OP_BACK_REF
)
2371 if (node
->constraint
)
2373 context
= re_string_context_at (mctx
->input
, cur_str_idx
,
2374 mctx
->eflags
, preg
->newline_anchor
);
2375 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2379 /* `node' is a backreference.
2380 Check the substring which the substring matched. */
2381 bkc_idx
= mctx
->nbkref_ents
;
2382 err
= get_subexp (preg
, mctx
, node_idx
, cur_str_idx
);
2383 if (BE (err
!= REG_NOERROR
, 0))
2386 /* And add the epsilon closures (which is `new_dest_nodes') of
2387 the backreference to appropriate state_log. */
2389 assert (dfa
->nexts
[node_idx
] != -1);
2391 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2394 re_dfastate_t
*dest_state
;
2395 struct re_backref_cache_entry
*bkref_ent
;
2396 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2397 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2399 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2400 new_dest_nodes
= (subexp_len
== 0
2401 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2402 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2403 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2404 - bkref_ent
->subexp_from
);
2405 context
= re_string_context_at (mctx
->input
, dest_str_idx
- 1,
2406 mctx
->eflags
, preg
->newline_anchor
);
2407 dest_state
= mctx
->state_log
[dest_str_idx
];
2408 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2409 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2410 /* Add `new_dest_node' to state_log. */
2411 if (dest_state
== NULL
)
2413 mctx
->state_log
[dest_str_idx
]
2414 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2416 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2417 && err
!= REG_NOERROR
, 0))
2422 re_node_set dest_nodes
;
2423 err
= re_node_set_init_union (&dest_nodes
,
2424 dest_state
->entrance_nodes
,
2426 if (BE (err
!= REG_NOERROR
, 0))
2428 re_node_set_free (&dest_nodes
);
2431 mctx
->state_log
[dest_str_idx
]
2432 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2433 re_node_set_free (&dest_nodes
);
2434 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2435 && err
!= REG_NOERROR
, 0))
2438 /* We need to check recursively if the backreference can epsilon
2441 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2443 err
= check_subexp_matching_top (dfa
, mctx
, new_dest_nodes
,
2445 if (BE (err
!= REG_NOERROR
, 0))
2447 err
= transit_state_bkref (preg
, new_dest_nodes
, mctx
);
2448 if (BE (err
!= REG_NOERROR
, 0))
2458 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2459 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2460 Note that we might collect inappropriate candidates here.
2461 However, the cost of checking them strictly here is too high, then we
2462 delay these checking for prune_impossible_nodes(). */
2464 static reg_errcode_t
2465 get_subexp (preg
, mctx
, bkref_node
, bkref_str_idx
)
2466 const regex_t
*preg
;
2467 re_match_context_t
*mctx
;
2468 int bkref_node
, bkref_str_idx
;
2470 int subexp_num
, sub_top_idx
;
2471 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2472 char *buf
= (char *) re_string_get_buffer (mctx
->input
);
2473 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2474 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2475 for (; cache_idx
< mctx
->nbkref_ents
; ++cache_idx
)
2477 struct re_backref_cache_entry
*entry
= mctx
->bkref_ents
+ cache_idx
;
2478 if (entry
->str_idx
> bkref_str_idx
)
2480 if (entry
->node
== bkref_node
)
2481 return REG_NOERROR
; /* We already checked it. */
2483 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
- 1;
2485 /* For each sub expression */
2486 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2489 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2490 re_sub_match_last_t
*sub_last
;
2491 int sub_last_idx
, sl_str
;
2494 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2495 continue; /* It isn't related. */
2497 sl_str
= sub_top
->str_idx
;
2498 bkref_str
= buf
+ bkref_str_idx
;
2499 /* At first, check the last node of sub expressions we already
2501 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2504 sub_last
= sub_top
->lasts
[sub_last_idx
];
2505 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2506 /* The matched string by the sub expression match with the substring
2507 at the back reference? */
2509 && memcmp (bkref_str
, buf
+ sl_str
, sl_str_diff
) != 0)
2510 break; /* We don't need to search this sub expression any more. */
2511 bkref_str
+= sl_str_diff
;
2512 sl_str
+= sl_str_diff
;
2513 err
= get_subexp_sub (preg
, mctx
, sub_top
, sub_last
, bkref_node
,
2515 if (err
== REG_NOMATCH
)
2517 if (BE (err
!= REG_NOERROR
, 0))
2520 if (sub_last_idx
< sub_top
->nlasts
)
2522 if (sub_last_idx
> 0)
2524 /* Then, search for the other last nodes of the sub expression. */
2525 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2527 int cls_node
, sl_str_off
;
2529 sl_str_off
= sl_str
- sub_top
->str_idx
;
2530 /* The matched string by the sub expression match with the substring
2531 at the back reference? */
2533 && memcmp (bkref_str
++, buf
+ sl_str
- 1, 1) != 0)
2534 break; /* We don't need to search this sub expression any more. */
2535 if (mctx
->state_log
[sl_str
] == NULL
)
2537 /* Does this state have a ')' of the sub expression? */
2538 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2539 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
, 0);
2542 if (sub_top
->path
== NULL
)
2544 sub_top
->path
= calloc (sizeof (state_array_t
),
2545 sl_str
- sub_top
->str_idx
+ 1);
2546 if (sub_top
->path
== NULL
)
2549 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2550 in the current context? */
2551 err
= check_arrival (preg
, mctx
, sub_top
->path
, sub_top
->node
,
2552 sub_top
->str_idx
, cls_node
, sl_str
, 0);
2553 if (err
== REG_NOMATCH
)
2555 if (BE (err
!= REG_NOERROR
, 0))
2557 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2558 if (BE (sub_last
== NULL
, 0))
2560 err
= get_subexp_sub (preg
, mctx
, sub_top
, sub_last
, bkref_node
,
2562 if (err
== REG_NOMATCH
)
2569 /* Helper functions for get_subexp(). */
2571 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2572 If it can arrive, register the sub expression expressed with SUB_TOP
2575 static reg_errcode_t
2576 get_subexp_sub (preg
, mctx
, sub_top
, sub_last
, bkref_node
, bkref_str
)
2577 const regex_t
*preg
;
2578 re_match_context_t
*mctx
;
2579 re_sub_match_top_t
*sub_top
;
2580 re_sub_match_last_t
*sub_last
;
2581 int bkref_node
, bkref_str
;
2585 /* Can the subexpression arrive the back reference? */
2586 err
= check_arrival (preg
, mctx
, &sub_last
->path
, sub_last
->node
,
2587 sub_last
->str_idx
, bkref_node
, bkref_str
, 1);
2588 if (err
!= REG_NOERROR
)
2590 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2592 if (BE (err
!= REG_NOERROR
, 0))
2594 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2595 clean_state_log_if_need (mctx
, to_idx
);
2599 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2600 Search '(' if FL_OPEN, or search ')' otherwise.
2601 TODO: This function isn't efficient...
2602 Because there might be more than one nodes whose types are
2603 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2608 find_subexp_node (dfa
, nodes
, subexp_idx
, fl_open
)
2611 int subexp_idx
, fl_open
;
2614 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2616 int cls_node
= nodes
->elems
[cls_idx
];
2617 re_token_t
*node
= dfa
->nodes
+ cls_node
;
2618 if (((fl_open
&& node
->type
== OP_OPEN_SUBEXP
)
2619 || (!fl_open
&& node
->type
== OP_CLOSE_SUBEXP
))
2620 && node
->opr
.idx
== subexp_idx
)
2626 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2627 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2629 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2631 static reg_errcode_t
2632 check_arrival (preg
, mctx
, path
, top_node
, top_str
, last_node
, last_str
,
2634 const regex_t
*preg
;
2635 re_match_context_t
*mctx
;
2636 state_array_t
*path
;
2637 int top_node
, top_str
, last_node
, last_str
, fl_open
;
2639 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2641 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2642 re_dfastate_t
*cur_state
= NULL
;
2643 re_node_set
*cur_nodes
, next_nodes
;
2644 re_dfastate_t
**backup_state_log
;
2645 unsigned int context
;
2647 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2648 /* Extend the buffer if we need. */
2649 if (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1)
2651 re_dfastate_t
**new_array
;
2652 int old_alloc
= path
->alloc
;
2653 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2654 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2655 if (new_array
== NULL
)
2657 path
->array
= new_array
;
2658 memset (new_array
+ old_alloc
, '\0',
2659 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2662 str_idx
= path
->next_idx
== 0 ? top_str
: path
->next_idx
;
2664 /* Temporary modify MCTX. */
2665 backup_state_log
= mctx
->state_log
;
2666 backup_cur_idx
= mctx
->input
->cur_idx
;
2667 mctx
->state_log
= path
->array
;
2668 mctx
->input
->cur_idx
= str_idx
;
2670 /* Setup initial node set. */
2671 context
= re_string_context_at (mctx
->input
, str_idx
- 1, mctx
->eflags
,
2672 preg
->newline_anchor
);
2673 if (str_idx
== top_str
)
2675 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2676 if (BE (err
!= REG_NOERROR
, 0))
2678 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, fl_open
);
2679 if (BE (err
!= REG_NOERROR
, 0))
2681 re_node_set_free (&next_nodes
);
2687 cur_state
= mctx
->state_log
[str_idx
];
2688 if (cur_state
&& cur_state
->has_backref
)
2690 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2691 if (BE ( err
!= REG_NOERROR
, 0))
2695 re_node_set_init_empty (&next_nodes
);
2697 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2699 if (next_nodes
.nelem
)
2701 err
= expand_bkref_cache (preg
, mctx
, &next_nodes
, str_idx
, last_str
,
2702 subexp_num
, fl_open
);
2703 if (BE ( err
!= REG_NOERROR
, 0))
2705 re_node_set_free (&next_nodes
);
2709 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2710 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2712 re_node_set_free (&next_nodes
);
2715 mctx
->state_log
[str_idx
] = cur_state
;
2718 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2720 re_node_set_empty (&next_nodes
);
2721 if (mctx
->state_log
[str_idx
+ 1])
2723 err
= re_node_set_merge (&next_nodes
,
2724 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2725 if (BE (err
!= REG_NOERROR
, 0))
2727 re_node_set_free (&next_nodes
);
2733 err
= check_arrival_add_next_nodes(preg
, dfa
, mctx
, str_idx
,
2734 &cur_state
->nodes
, &next_nodes
);
2735 if (BE (err
!= REG_NOERROR
, 0))
2737 re_node_set_free (&next_nodes
);
2742 if (next_nodes
.nelem
)
2744 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
,
2746 if (BE (err
!= REG_NOERROR
, 0))
2748 re_node_set_free (&next_nodes
);
2751 err
= expand_bkref_cache (preg
, mctx
, &next_nodes
, str_idx
, last_str
,
2752 subexp_num
, fl_open
);
2753 if (BE ( err
!= REG_NOERROR
, 0))
2755 re_node_set_free (&next_nodes
);
2759 context
= re_string_context_at (mctx
->input
, str_idx
- 1, mctx
->eflags
,
2760 preg
->newline_anchor
);
2761 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2762 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2764 re_node_set_free (&next_nodes
);
2767 mctx
->state_log
[str_idx
] = cur_state
;
2768 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
2770 re_node_set_free (&next_nodes
);
2771 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
2772 : &mctx
->state_log
[last_str
]->nodes
);
2773 path
->next_idx
= str_idx
;
2776 mctx
->state_log
= backup_state_log
;
2777 mctx
->input
->cur_idx
= backup_cur_idx
;
2779 if (cur_nodes
== NULL
)
2781 /* Then check the current node set has the node LAST_NODE. */
2782 return (re_node_set_contains (cur_nodes
, last_node
)
2783 || re_node_set_contains (cur_nodes
, last_node
) ? REG_NOERROR
2787 /* Helper functions for check_arrival. */
2789 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2791 TODO: This function is similar to the functions transit_state*(),
2792 however this function has many additional works.
2793 Can't we unify them? */
2795 static reg_errcode_t
2796 check_arrival_add_next_nodes (preg
, dfa
, mctx
, str_idx
, cur_nodes
, next_nodes
)
2797 const regex_t
*preg
;
2799 re_match_context_t
*mctx
;
2801 re_node_set
*cur_nodes
, *next_nodes
;
2805 re_node_set union_set
;
2806 re_node_set_init_empty (&union_set
);
2807 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
2810 int cur_node
= cur_nodes
->elems
[cur_idx
];
2811 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
2812 if (IS_EPSILON_NODE(type
))
2814 #ifdef RE_ENABLE_I18N
2815 /* If the node may accept `multi byte'. */
2816 if (ACCEPT_MB_NODE (type
))
2818 naccepted
= check_node_accept_bytes (preg
, cur_node
, mctx
->input
,
2822 re_dfastate_t
*dest_state
;
2823 int next_node
= dfa
->nexts
[cur_node
];
2824 int next_idx
= str_idx
+ naccepted
;
2825 dest_state
= mctx
->state_log
[next_idx
];
2826 re_node_set_empty (&union_set
);
2829 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
2830 if (BE (err
!= REG_NOERROR
, 0))
2832 re_node_set_free (&union_set
);
2835 err
= re_node_set_insert (&union_set
, next_node
);
2836 if (BE (err
< 0, 0))
2838 re_node_set_free (&union_set
);
2844 err
= re_node_set_insert (&union_set
, next_node
);
2845 if (BE (err
< 0, 0))
2847 re_node_set_free (&union_set
);
2851 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
2853 if (BE (mctx
->state_log
[next_idx
] == NULL
2854 && err
!= REG_NOERROR
, 0))
2856 re_node_set_free (&union_set
);
2861 #endif /* RE_ENABLE_I18N */
2863 || check_node_accept (preg
, dfa
->nodes
+ cur_node
, mctx
,
2866 err
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
2867 if (BE (err
< 0, 0))
2869 re_node_set_free (&union_set
);
2874 re_node_set_free (&union_set
);
2878 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
2879 CUR_NODES, however exclude the nodes which are:
2880 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
2881 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
2884 static reg_errcode_t
2885 check_arrival_expand_ecl (dfa
, cur_nodes
, ex_subexp
, fl_open
)
2887 re_node_set
*cur_nodes
;
2888 int ex_subexp
, fl_open
;
2891 int idx
, outside_node
;
2892 re_node_set new_nodes
;
2894 assert (cur_nodes
->nelem
);
2896 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
2897 if (BE (err
!= REG_NOERROR
, 0))
2899 /* Create a new node set NEW_NODES with the nodes which are epsilon
2900 closures of the node in CUR_NODES. */
2902 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
2904 int cur_node
= cur_nodes
->elems
[idx
];
2905 re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
2906 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, fl_open
);
2907 if (outside_node
== -1)
2909 /* There are no problematic nodes, just merge them. */
2910 err
= re_node_set_merge (&new_nodes
, eclosure
);
2911 if (BE (err
!= REG_NOERROR
, 0))
2913 re_node_set_free (&new_nodes
);
2919 /* There are problematic nodes, re-calculate incrementally. */
2920 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
2921 ex_subexp
, fl_open
);
2922 if (BE (err
!= REG_NOERROR
, 0))
2924 re_node_set_free (&new_nodes
);
2929 re_node_set_free (cur_nodes
);
2930 *cur_nodes
= new_nodes
;
2934 /* Helper function for check_arrival_expand_ecl.
2935 Check incrementally the epsilon closure of TARGET, and if it isn't
2936 problematic append it to DST_NODES. */
2938 static reg_errcode_t
2939 check_arrival_expand_ecl_sub (dfa
, dst_nodes
, target
, ex_subexp
, fl_open
)
2941 int target
, ex_subexp
, fl_open
;
2942 re_node_set
*dst_nodes
;
2945 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
2948 type
= dfa
->nodes
[cur_node
].type
;
2950 if (((type
== OP_OPEN_SUBEXP
&& fl_open
)
2951 || (type
== OP_CLOSE_SUBEXP
&& !fl_open
))
2952 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
2956 err
= re_node_set_insert (dst_nodes
, cur_node
);
2957 if (BE (err
== -1, 0))
2962 err
= re_node_set_insert (dst_nodes
, cur_node
);
2963 if (BE (err
== -1, 0))
2965 if (dfa
->edests
[cur_node
].nelem
== 0)
2967 if (dfa
->edests
[cur_node
].nelem
== 2)
2969 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
2970 dfa
->edests
[cur_node
].elems
[1],
2971 ex_subexp
, fl_open
);
2972 if (BE (err
!= REG_NOERROR
, 0))
2975 cur_node
= dfa
->edests
[cur_node
].elems
[0];
2981 /* For all the back references in the current state, calculate the
2982 destination of the back references by the appropriate entry
2983 in MCTX->BKREF_ENTS. */
2985 static reg_errcode_t
2986 expand_bkref_cache (preg
, mctx
, cur_nodes
, cur_str
, last_str
, subexp_num
,
2988 const regex_t
*preg
;
2989 re_match_context_t
*mctx
;
2990 int cur_str
, last_str
, subexp_num
, fl_open
;
2991 re_node_set
*cur_nodes
;
2994 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2995 int cache_idx
, cache_idx_start
;
2996 /* The current state. */
2998 cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
2999 for (cache_idx
= cache_idx_start
; cache_idx
< mctx
->nbkref_ents
; ++cache_idx
)
3001 int to_idx
, next_node
;
3002 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ cache_idx
;
3003 if (ent
->str_idx
> cur_str
)
3005 /* Is this entry ENT is appropriate? */
3006 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3009 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3010 /* Calculate the destination of the back reference, and append it
3011 to MCTX->STATE_LOG. */
3012 if (to_idx
== cur_str
)
3014 /* The backreference did epsilon transit, we must re-check all the
3015 node in the current state. */
3016 re_node_set new_dests
;
3017 reg_errcode_t err2
, err3
;
3018 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3019 if (re_node_set_contains (cur_nodes
, next_node
))
3021 err
= re_node_set_init_1 (&new_dests
, next_node
);
3022 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
,
3024 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3025 re_node_set_free (&new_dests
);
3026 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3027 || err3
!= REG_NOERROR
, 0))
3029 err
= (err
!= REG_NOERROR
? err
3030 : (err2
!= REG_NOERROR
? err2
: err3
));
3033 /* TODO: It is still inefficient... */
3034 cache_idx
= cache_idx_start
- 1;
3039 re_node_set union_set
;
3040 next_node
= dfa
->nexts
[ent
->node
];
3041 if (mctx
->state_log
[to_idx
])
3044 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3047 err
= re_node_set_init_copy (&union_set
,
3048 &mctx
->state_log
[to_idx
]->nodes
);
3049 ret
= re_node_set_insert (&union_set
, next_node
);
3050 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3052 re_node_set_free (&union_set
);
3053 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3059 err
= re_node_set_init_1 (&union_set
, next_node
);
3060 if (BE (err
!= REG_NOERROR
, 0))
3063 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3064 re_node_set_free (&union_set
);
3065 if (BE (mctx
->state_log
[to_idx
] == NULL
3066 && err
!= REG_NOERROR
, 0))
3073 /* Build transition table for the state.
3074 Return the new table if succeeded, otherwise return NULL. */
3076 static re_dfastate_t
**
3077 build_trtable (preg
, state
, fl_search
)
3078 const regex_t
*preg
;
3079 const re_dfastate_t
*state
;
3083 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
3085 int dests_node_malloced
= 0, dest_states_malloced
= 0;
3086 int ndests
; /* Number of the destination states from `state'. */
3087 re_dfastate_t
**trtable
;
3088 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3089 re_node_set follows
, *dests_node
;
3093 /* We build DFA states which corresponds to the destination nodes
3094 from `state'. `dests_node[i]' represents the nodes which i-th
3095 destination state contains, and `dests_ch[i]' represents the
3096 characters which i-th destination state accepts. */
3098 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
))
3099 dests_node
= (re_node_set
*)
3100 alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3104 dests_node
= (re_node_set
*)
3105 malloc ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3106 if (BE (dests_node
== NULL
, 0))
3108 dests_node_malloced
= 1;
3110 dests_ch
= (bitset
*) (dests_node
+ SBC_MAX
);
3112 /* Initialize transiton table. */
3113 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3114 if (BE (trtable
== NULL
, 0))
3116 if (dests_node_malloced
)
3121 /* At first, group all nodes belonging to `state' into several
3123 ndests
= group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
);
3124 if (BE (ndests
<= 0, 0))
3126 if (dests_node_malloced
)
3128 /* Return NULL in case of an error, trtable otherwise. */
3135 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3136 if (BE (err
!= REG_NOERROR
, 0))
3140 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
3141 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3142 dest_states
= (re_dfastate_t
**)
3143 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3147 dest_states
= (re_dfastate_t
**)
3148 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3149 if (BE (dest_states
== NULL
, 0))
3152 if (dest_states_malloced
)
3154 re_node_set_free (&follows
);
3155 for (i
= 0; i
< ndests
; ++i
)
3156 re_node_set_free (dests_node
+ i
);
3158 if (dests_node_malloced
)
3162 dest_states_malloced
= 1;
3164 dest_states_word
= dest_states
+ ndests
;
3165 dest_states_nl
= dest_states_word
+ ndests
;
3166 bitset_empty (acceptable
);
3168 /* Then build the states for all destinations. */
3169 for (i
= 0; i
< ndests
; ++i
)
3172 re_node_set_empty (&follows
);
3173 /* Merge the follows of this destination states. */
3174 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3176 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3177 if (next_node
!= -1)
3179 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3180 if (BE (err
!= REG_NOERROR
, 0))
3184 /* If search flag is set, merge the initial state. */
3187 #ifdef RE_ENABLE_I18N
3188 int not_initial
= 0;
3189 for (j
= 0; j
< follows
.nelem
; ++j
)
3190 if (dfa
->nodes
[follows
.elems
[j
]].type
== CHARACTER
)
3192 not_initial
= dfa
->nodes
[follows
.elems
[j
]].mb_partial
;
3198 err
= re_node_set_merge (&follows
,
3199 dfa
->init_state
->entrance_nodes
);
3200 if (BE (err
!= REG_NOERROR
, 0))
3204 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3205 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3207 /* If the new state has context constraint,
3208 build appropriate states for these contexts. */
3209 if (dest_states
[i
]->has_constraint
)
3211 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3213 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3215 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3217 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3222 dest_states_word
[i
] = dest_states
[i
];
3223 dest_states_nl
[i
] = dest_states
[i
];
3225 bitset_merge (acceptable
, dests_ch
[i
]);
3228 /* Update the transition table. */
3229 /* For all characters ch...: */
3230 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
3231 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
3232 if ((acceptable
[i
] >> j
) & 1)
3234 /* The current state accepts the character ch. */
3235 if (IS_WORD_CHAR (ch
))
3237 for (k
= 0; k
< ndests
; ++k
)
3238 if ((dests_ch
[k
][i
] >> j
) & 1)
3240 /* k-th destination accepts the word character ch. */
3241 trtable
[ch
] = dest_states_word
[k
];
3242 /* There must be only one destination which accepts
3243 character ch. See group_nodes_into_DFAstates. */
3247 else /* not WORD_CHAR */
3249 for (k
= 0; k
< ndests
; ++k
)
3250 if ((dests_ch
[k
][i
] >> j
) & 1)
3252 /* k-th destination accepts the non-word character ch. */
3253 trtable
[ch
] = dest_states
[k
];
3254 /* There must be only one destination which accepts
3255 character ch. See group_nodes_into_DFAstates. */
3261 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3263 /* The current state accepts newline character. */
3264 for (k
= 0; k
< ndests
; ++k
)
3265 if (bitset_contain (dests_ch
[k
], NEWLINE_CHAR
))
3267 /* k-th destination accepts newline character. */
3268 trtable
[NEWLINE_CHAR
] = dest_states_nl
[k
];
3269 /* There must be only one destination which accepts
3270 newline. See group_nodes_into_DFAstates. */
3275 if (dest_states_malloced
)
3278 re_node_set_free (&follows
);
3279 for (i
= 0; i
< ndests
; ++i
)
3280 re_node_set_free (dests_node
+ i
);
3282 if (dests_node_malloced
)
3288 /* Group all nodes belonging to STATE into several destinations.
3289 Then for all destinations, set the nodes belonging to the destination
3290 to DESTS_NODE[i] and set the characters accepted by the destination
3291 to DEST_CH[i]. This function return the number of destinations. */
3294 group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
)
3295 const regex_t
*preg
;
3296 const re_dfastate_t
*state
;
3297 re_node_set
*dests_node
;
3301 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
3303 int ndests
; /* Number of the destinations from `state'. */
3304 bitset accepts
; /* Characters a node can accept. */
3305 const re_node_set
*cur_nodes
= &state
->nodes
;
3306 bitset_empty (accepts
);
3309 /* For all the nodes belonging to `state', */
3310 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3312 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3313 re_token_type_t type
= node
->type
;
3314 unsigned int constraint
= node
->constraint
;
3316 /* Enumerate all single byte character this node can accept. */
3317 if (type
== CHARACTER
)
3318 bitset_set (accepts
, node
->opr
.c
);
3319 else if (type
== SIMPLE_BRACKET
)
3321 bitset_merge (accepts
, node
->opr
.sbcset
);
3323 else if (type
== OP_PERIOD
)
3325 bitset_set_all (accepts
);
3326 if (!(preg
->syntax
& RE_DOT_NEWLINE
))
3327 bitset_clear (accepts
, '\n');
3328 if (preg
->syntax
& RE_DOT_NOT_NULL
)
3329 bitset_clear (accepts
, '\0');
3334 /* Check the `accepts' and sift the characters which are not
3335 match it the context. */
3338 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3340 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3341 bitset_empty (accepts
);
3342 if (accepts_newline
)
3343 bitset_set (accepts
, NEWLINE_CHAR
);
3347 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3349 bitset_empty (accepts
);
3352 if (constraint
& NEXT_WORD_CONSTRAINT
)
3353 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3354 accepts
[j
] &= dfa
->word_char
[j
];
3355 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3356 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3357 accepts
[j
] &= ~dfa
->word_char
[j
];
3360 /* Then divide `accepts' into DFA states, or create a new
3362 for (j
= 0; j
< ndests
; ++j
)
3364 bitset intersec
; /* Intersection sets, see below. */
3366 /* Flags, see below. */
3367 int has_intersec
, not_subset
, not_consumed
;
3369 /* Optimization, skip if this state doesn't accept the character. */
3370 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3373 /* Enumerate the intersection set of this state and `accepts'. */
3375 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3376 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3377 /* And skip if the intersection set is empty. */
3381 /* Then check if this state is a subset of `accepts'. */
3382 not_subset
= not_consumed
= 0;
3383 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3385 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3386 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3389 /* If this state isn't a subset of `accepts', create a
3390 new group state, which has the `remains'. */
3393 bitset_copy (dests_ch
[ndests
], remains
);
3394 bitset_copy (dests_ch
[j
], intersec
);
3395 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3396 if (BE (err
!= REG_NOERROR
, 0))
3401 /* Put the position in the current group. */
3402 err
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3403 if (BE (err
< 0, 0))
3406 /* If all characters are consumed, go to next node. */
3410 /* Some characters remain, create a new group. */
3413 bitset_copy (dests_ch
[ndests
], accepts
);
3414 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3415 if (BE (err
!= REG_NOERROR
, 0))
3418 bitset_empty (accepts
);
3423 for (j
= 0; j
< ndests
; ++j
)
3424 re_node_set_free (dests_node
+ j
);
3428 #ifdef RE_ENABLE_I18N
3429 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3430 Return the number of the bytes the node accepts.
3431 STR_IDX is the current index of the input string.
3433 This function handles the nodes which can accept one character, or
3434 one collating element like '.', '[a-z]', opposite to the other nodes
3435 can only accept one byte. */
3438 check_node_accept_bytes (preg
, node_idx
, input
, str_idx
)
3439 const regex_t
*preg
;
3440 int node_idx
, str_idx
;
3441 const re_string_t
*input
;
3443 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
3444 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3445 int elem_len
= re_string_elem_size_at (input
, str_idx
);
3446 int char_len
= re_string_char_size_at (input
, str_idx
);
3450 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3452 if (elem_len
<= 1 && char_len
<= 1)
3454 if (node
->type
== OP_PERIOD
)
3456 /* '.' accepts any one character except the following two cases. */
3457 if ((!(preg
->syntax
& RE_DOT_NEWLINE
) &&
3458 re_string_byte_at (input
, str_idx
) == '\n') ||
3459 ((preg
->syntax
& RE_DOT_NOT_NULL
) &&
3460 re_string_byte_at (input
, str_idx
) == '\0'))
3464 else if (node
->type
== COMPLEX_BRACKET
)
3466 const re_charset_t
*cset
= node
->opr
.mbcset
;
3468 const unsigned char *pin
= ((char *) re_string_get_buffer (input
)
3472 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3473 ? re_string_wchar_at (input
, str_idx
) : 0);
3475 /* match with multibyte character? */
3476 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3477 if (wc
== cset
->mbchars
[i
])
3479 match_len
= char_len
;
3480 goto check_node_accept_bytes_match
;
3482 /* match with character_class? */
3483 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3485 wctype_t wt
= cset
->char_classes
[i
];
3486 if (__iswctype (wc
, wt
))
3488 match_len
= char_len
;
3489 goto check_node_accept_bytes_match
;
3496 unsigned int in_collseq
= 0;
3497 const int32_t *table
, *indirect
;
3498 const unsigned char *weights
, *extra
;
3499 const char *collseqwc
;
3501 /* This #include defines a local function! */
3502 # include <locale/weight.h>
3504 /* match with collating_symbol? */
3505 if (cset
->ncoll_syms
)
3506 extra
= (const unsigned char *)
3507 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3508 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3510 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3511 /* Compare the length of input collating element and
3512 the length of current collating element. */
3513 if (*coll_sym
!= elem_len
)
3515 /* Compare each bytes. */
3516 for (j
= 0; j
< *coll_sym
; j
++)
3517 if (pin
[j
] != coll_sym
[1 + j
])
3521 /* Match if every bytes is equal. */
3523 goto check_node_accept_bytes_match
;
3529 if (elem_len
<= char_len
)
3531 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3532 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3535 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3537 /* match with range expression? */
3538 for (i
= 0; i
< cset
->nranges
; ++i
)
3539 if (cset
->range_starts
[i
] <= in_collseq
3540 && in_collseq
<= cset
->range_ends
[i
])
3542 match_len
= elem_len
;
3543 goto check_node_accept_bytes_match
;
3546 /* match with equivalence_class? */
3547 if (cset
->nequiv_classes
)
3549 const unsigned char *cp
= pin
;
3550 table
= (const int32_t *)
3551 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3552 weights
= (const unsigned char *)
3553 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3554 extra
= (const unsigned char *)
3555 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3556 indirect
= (const int32_t *)
3557 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3558 idx
= findidx (&cp
);
3560 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3562 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3563 size_t weight_len
= weights
[idx
];
3564 if (weight_len
== weights
[equiv_class_idx
])
3567 while (cnt
<= weight_len
3568 && (weights
[equiv_class_idx
+ 1 + cnt
]
3569 == weights
[idx
+ 1 + cnt
]))
3571 if (cnt
> weight_len
)
3573 match_len
= elem_len
;
3574 goto check_node_accept_bytes_match
;
3583 /* match with range expression? */
3585 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3587 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3590 for (i
= 0; i
< cset
->nranges
; ++i
)
3592 cmp_buf
[0] = cset
->range_starts
[i
];
3593 cmp_buf
[4] = cset
->range_ends
[i
];
3594 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3595 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3597 match_len
= char_len
;
3598 goto check_node_accept_bytes_match
;
3602 check_node_accept_bytes_match
:
3603 if (!cset
->non_match
)
3610 return (elem_len
> char_len
) ? elem_len
: char_len
;
3618 find_collation_sequence_value (mbs
, mbs_len
)
3619 const unsigned char *mbs
;
3622 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3627 /* No valid character. Match it as a single byte character. */
3628 const unsigned char *collseq
= (const unsigned char *)
3629 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3630 return collseq
[mbs
[0]];
3637 const unsigned char *extra
= (const unsigned char *)
3638 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3642 int mbs_cnt
, found
= 0;
3643 int32_t elem_mbs_len
;
3644 /* Skip the name of collating element name. */
3645 idx
= idx
+ extra
[idx
] + 1;
3646 elem_mbs_len
= extra
[idx
++];
3647 if (mbs_len
== elem_mbs_len
)
3649 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3650 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3652 if (mbs_cnt
== elem_mbs_len
)
3653 /* Found the entry. */
3656 /* Skip the byte sequence of the collating element. */
3657 idx
+= elem_mbs_len
;
3658 /* Adjust for the alignment. */
3659 idx
= (idx
+ 3) & ~3;
3660 /* Skip the collation sequence value. */
3661 idx
+= sizeof (uint32_t);
3662 /* Skip the wide char sequence of the collating element. */
3663 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3664 /* If we found the entry, return the sequence value. */
3666 return *(uint32_t *) (extra
+ idx
);
3667 /* Skip the collation sequence value. */
3668 idx
+= sizeof (uint32_t);
3673 #endif /* RE_ENABLE_I18N */
3675 /* Check whether the node accepts the byte which is IDX-th
3676 byte of the INPUT. */
3679 check_node_accept (preg
, node
, mctx
, idx
)
3680 const regex_t
*preg
;
3681 const re_token_t
*node
;
3682 const re_match_context_t
*mctx
;
3686 if (node
->constraint
)
3688 /* The node has constraints. Check whether the current context
3689 satisfies the constraints. */
3690 unsigned int context
= re_string_context_at (mctx
->input
, idx
,
3692 preg
->newline_anchor
);
3693 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
3696 ch
= re_string_byte_at (mctx
->input
, idx
);
3697 if (node
->type
== CHARACTER
)
3698 return node
->opr
.c
== ch
;
3699 else if (node
->type
== SIMPLE_BRACKET
)
3700 return bitset_contain (node
->opr
.sbcset
, ch
);
3701 else if (node
->type
== OP_PERIOD
)
3702 return !((ch
== '\n' && !(preg
->syntax
& RE_DOT_NEWLINE
))
3703 || (ch
== '\0' && (preg
->syntax
& RE_DOT_NOT_NULL
)));
3708 /* Extend the buffers, if the buffers have run out. */
3710 static reg_errcode_t
3711 extend_buffers (mctx
)
3712 re_match_context_t
*mctx
;
3715 re_string_t
*pstr
= mctx
->input
;
3717 /* Double the lengthes of the buffers. */
3718 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
3719 if (BE (ret
!= REG_NOERROR
, 0))
3722 if (mctx
->state_log
!= NULL
)
3724 /* And double the length of state_log. */
3725 re_dfastate_t
**new_array
;
3726 new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
3727 pstr
->bufs_len
* 2);
3728 if (BE (new_array
== NULL
, 0))
3730 mctx
->state_log
= new_array
;
3733 /* Then reconstruct the buffers. */
3736 #ifdef RE_ENABLE_I18N
3738 build_wcs_upper_buffer (pstr
);
3740 #endif /* RE_ENABLE_I18N */
3741 build_upper_buffer (pstr
);
3745 #ifdef RE_ENABLE_I18N
3747 build_wcs_buffer (pstr
);
3749 #endif /* RE_ENABLE_I18N */
3751 if (pstr
->trans
!= NULL
)
3752 re_string_translate_buffer (pstr
);
3754 pstr
->valid_len
= pstr
->bufs_len
;
3761 /* Functions for matching context. */
3763 /* Initialize MCTX. */
3765 static reg_errcode_t
3766 match_ctx_init (mctx
, eflags
, input
, n
)
3767 re_match_context_t
*mctx
;
3771 mctx
->eflags
= eflags
;
3772 mctx
->input
= input
;
3773 mctx
->match_last
= -1;
3776 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
3777 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
3778 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
3782 mctx
->bkref_ents
= NULL
;
3783 mctx
->nbkref_ents
= 0;
3784 mctx
->abkref_ents
= n
;
3785 mctx
->max_mb_elem_len
= 1;
3786 mctx
->nsub_tops
= 0;
3787 mctx
->asub_tops
= n
;
3791 /* Clean the entries which depend on the current input in MCTX.
3792 This function must be invoked when the matcher changes the start index
3793 of the input, or changes the input string. */
3796 match_ctx_clean (mctx
)
3797 re_match_context_t
*mctx
;
3799 match_ctx_free_subtops (mctx
);
3800 mctx
->nsub_tops
= 0;
3801 mctx
->nbkref_ents
= 0;
3804 /* Free all the memory associated with MCTX. */
3807 match_ctx_free (mctx
)
3808 re_match_context_t
*mctx
;
3810 match_ctx_free_subtops (mctx
);
3811 re_free (mctx
->sub_tops
);
3812 re_free (mctx
->bkref_ents
);
3815 /* Free all the memory associated with MCTX->SUB_TOPS. */
3818 match_ctx_free_subtops (mctx
)
3819 re_match_context_t
*mctx
;
3822 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
3825 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
3826 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
3828 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
3829 re_free (last
->path
.array
);
3832 re_free (top
->lasts
);
3835 re_free (top
->path
->array
);
3836 re_free (top
->path
);
3842 /* Add a new backreference entry to MCTX.
3843 Note that we assume that caller never call this function with duplicate
3844 entry, and call with STR_IDX which isn't smaller than any existing entry.
3847 static reg_errcode_t
3848 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
3849 re_match_context_t
*mctx
;
3850 int node
, str_idx
, from
, to
;
3852 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
3854 struct re_backref_cache_entry
* new_entry
;
3855 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
3856 mctx
->abkref_ents
* 2);
3857 if (BE (new_entry
== NULL
, 0))
3859 re_free (mctx
->bkref_ents
);
3862 mctx
->bkref_ents
= new_entry
;
3863 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
3864 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
3865 mctx
->abkref_ents
*= 2;
3867 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
3868 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
3869 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
3870 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
3871 mctx
->bkref_ents
[mctx
->nbkref_ents
++].flag
= 0;
3872 if (mctx
->max_mb_elem_len
< to
- from
)
3873 mctx
->max_mb_elem_len
= to
- from
;
3877 /* Search for the first entry which has the same str_idx.
3878 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
3881 search_cur_bkref_entry (mctx
, str_idx
)
3882 re_match_context_t
*mctx
;
3885 int left
, right
, mid
;
3886 right
= mctx
->nbkref_ents
;
3887 for (left
= 0; left
< right
;)
3889 mid
= (left
+ right
) / 2;
3890 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
3899 match_ctx_clear_flag (mctx
)
3900 re_match_context_t
*mctx
;
3903 for (i
= 0; i
< mctx
->nbkref_ents
; ++i
)
3905 mctx
->bkref_ents
[i
].flag
= 0;
3909 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
3912 static reg_errcode_t
3913 match_ctx_add_subtop (mctx
, node
, str_idx
)
3914 re_match_context_t
*mctx
;
3918 assert (mctx
->sub_tops
!= NULL
);
3919 assert (mctx
->asub_tops
> 0);
3921 if (mctx
->nsub_tops
== mctx
->asub_tops
)
3923 re_sub_match_top_t
**new_array
;
3924 mctx
->asub_tops
*= 2;
3925 new_array
= re_realloc (mctx
->sub_tops
, re_sub_match_top_t
*,
3927 if (BE (new_array
== NULL
, 0))
3929 mctx
->sub_tops
= new_array
;
3931 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
3932 if (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
)
3934 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
3935 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
3939 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
3940 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
3942 static re_sub_match_last_t
*
3943 match_ctx_add_sublast (subtop
, node
, str_idx
)
3944 re_sub_match_top_t
*subtop
;
3947 re_sub_match_last_t
*new_entry
;
3948 if (subtop
->nlasts
== subtop
->alasts
)
3950 re_sub_match_last_t
**new_array
;
3951 subtop
->alasts
= 2 * subtop
->alasts
+ 1;
3952 new_array
= re_realloc (subtop
->lasts
, re_sub_match_last_t
*,
3954 if (BE (new_array
== NULL
, 0))
3956 subtop
->lasts
= new_array
;
3958 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
3959 if (BE (new_entry
== NULL
, 0))
3961 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
3962 new_entry
->node
= node
;
3963 new_entry
->str_idx
= str_idx
;
3969 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
,
3971 re_sift_context_t
*sctx
;
3972 re_dfastate_t
**sifted_sts
, **limited_sts
;
3973 int last_node
, last_str_idx
, check_subexp
;
3975 sctx
->sifted_states
= sifted_sts
;
3976 sctx
->limited_states
= limited_sts
;
3977 sctx
->last_node
= last_node
;
3978 sctx
->last_str_idx
= last_str_idx
;
3979 sctx
->check_subexp
= check_subexp
;
3980 sctx
->cur_bkref
= -1;
3981 sctx
->cls_subexp_idx
= -1;
3982 re_node_set_init_empty (&sctx
->limits
);