1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
22 int n
) internal_function
;
23 static void match_ctx_clean (re_match_context_t
*mctx
) internal_function
;
24 static void match_ctx_free (re_match_context_t
*cache
) internal_function
;
25 static void match_ctx_free_subtops (re_match_context_t
*mctx
)
27 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
28 int str_idx
, int from
, int to
)
30 static int search_cur_bkref_entry (re_match_context_t
*mctx
, int str_idx
)
32 static void match_ctx_clear_flag (re_match_context_t
*mctx
) internal_function
;
33 static reg_errcode_t
match_ctx_add_subtop (re_match_context_t
*mctx
, int node
,
34 int str_idx
) internal_function
;
35 static re_sub_match_last_t
* match_ctx_add_sublast (re_sub_match_top_t
*subtop
,
36 int node
, int str_idx
)
38 static void sift_ctx_init (re_sift_context_t
*sctx
, re_dfastate_t
**sifted_sts
,
39 re_dfastate_t
**limited_sts
, int last_node
,
40 int last_str_idx
, int check_subexp
)
42 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
43 const char *string
, int length
,
44 int start
, int range
, int stop
,
45 size_t nmatch
, regmatch_t pmatch
[],
46 int eflags
) internal_function
;
47 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
48 const char *string1
, int length1
,
49 const char *string2
, int length2
,
50 int start
, int range
, struct re_registers
*regs
,
51 int stop
, int ret_len
) internal_function
;
52 static int re_search_stub (struct re_pattern_buffer
*bufp
,
53 const char *string
, int length
, int start
,
54 int range
, int stop
, struct re_registers
*regs
,
55 int ret_len
) internal_function
;
56 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
57 int nregs
, int regs_allocated
) internal_function
;
58 static inline re_dfastate_t
*acquire_init_state_context
59 (reg_errcode_t
*err
, const re_match_context_t
*mctx
, int idx
)
60 __attribute ((always_inline
)) internal_function
;
61 static reg_errcode_t
prune_impossible_nodes (re_match_context_t
*mctx
)
63 static int check_matching (re_match_context_t
*mctx
, int fl_longest_match
,
66 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
67 unsigned int context
) internal_function
;
68 static int check_halt_state_context (const re_match_context_t
*mctx
,
69 const re_dfastate_t
*state
, int idx
)
71 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
,
72 regmatch_t
*prev_idx_match
, int cur_node
,
73 int cur_idx
, int nmatch
) internal_function
;
74 static int proceed_next_node (const re_match_context_t
*mctx
,
75 int nregs
, regmatch_t
*regs
,
76 int *pidx
, int node
, re_node_set
*eps_via_nodes
,
77 struct re_fail_stack_t
*fs
) internal_function
;
78 static reg_errcode_t
push_fail_stack (struct re_fail_stack_t
*fs
,
79 int str_idx
, int *dests
, int nregs
,
81 re_node_set
*eps_via_nodes
) internal_function
;
82 static int pop_fail_stack (struct re_fail_stack_t
*fs
, int *pidx
, int nregs
,
83 regmatch_t
*regs
, re_node_set
*eps_via_nodes
) internal_function
;
84 static reg_errcode_t
set_regs (const regex_t
*preg
,
85 const re_match_context_t
*mctx
,
86 size_t nmatch
, regmatch_t
*pmatch
,
87 int fl_backtrack
) internal_function
;
88 static reg_errcode_t
free_fail_stack_return (struct re_fail_stack_t
*fs
) internal_function
;
91 static int sift_states_iter_mb (const re_match_context_t
*mctx
,
92 re_sift_context_t
*sctx
,
93 int node_idx
, int str_idx
, int max_str_idx
) internal_function
;
94 #endif /* RE_ENABLE_I18N */
95 static reg_errcode_t
sift_states_backward (re_match_context_t
*mctx
,
96 re_sift_context_t
*sctx
) internal_function
;
97 static reg_errcode_t
update_cur_sifted_state (re_match_context_t
*mctx
,
98 re_sift_context_t
*sctx
,
100 re_node_set
*dest_nodes
) internal_function
;
101 static reg_errcode_t
add_epsilon_src_nodes (re_dfa_t
*dfa
,
102 re_node_set
*dest_nodes
,
103 const re_node_set
*candidates
) internal_function
;
104 static reg_errcode_t
sub_epsilon_src_nodes (re_dfa_t
*dfa
, int node
,
105 re_node_set
*dest_nodes
,
106 const re_node_set
*and_nodes
) internal_function
;
107 static int check_dst_limits (re_match_context_t
*mctx
, re_node_set
*limits
,
108 int dst_node
, int dst_idx
, int src_node
,
109 int src_idx
) internal_function
;
110 static int check_dst_limits_calc_pos (re_match_context_t
*mctx
,
111 int limit
, re_node_set
*eclosures
,
112 int subexp_idx
, int node
, int str_idx
) internal_function
;
113 static reg_errcode_t
check_subexp_limits (re_dfa_t
*dfa
,
114 re_node_set
*dest_nodes
,
115 const re_node_set
*candidates
,
117 struct re_backref_cache_entry
*bkref_ents
,
118 int str_idx
) internal_function
;
119 static reg_errcode_t
sift_states_bkref (re_match_context_t
*mctx
,
120 re_sift_context_t
*sctx
,
121 int str_idx
, re_node_set
*dest_nodes
) internal_function
;
122 static reg_errcode_t
clean_state_log_if_needed (re_match_context_t
*mctx
,
123 int next_state_log_idx
) internal_function
;
124 static reg_errcode_t
merge_state_array (re_dfa_t
*dfa
, re_dfastate_t
**dst
,
125 re_dfastate_t
**src
, int num
) internal_function
;
126 static re_dfastate_t
*find_recover_state (reg_errcode_t
*err
,
127 re_match_context_t
*mctx
) internal_function
;
128 static re_dfastate_t
*transit_state (reg_errcode_t
*err
,
129 re_match_context_t
*mctx
,
130 re_dfastate_t
*state
) internal_function
;
131 static re_dfastate_t
*merge_state_with_log (reg_errcode_t
*err
,
132 re_match_context_t
*mctx
,
133 re_dfastate_t
*next_state
) internal_function
;
134 static reg_errcode_t
check_subexp_matching_top (re_match_context_t
*mctx
,
135 re_node_set
*cur_nodes
,
136 int str_idx
) internal_function
;
138 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
,
139 re_match_context_t
*mctx
,
140 re_dfastate_t
*pstate
) internal_function
;
142 #ifdef RE_ENABLE_I18N
143 static reg_errcode_t
transit_state_mb (re_match_context_t
*mctx
,
144 re_dfastate_t
*pstate
) internal_function
;
145 #endif /* RE_ENABLE_I18N */
146 static reg_errcode_t
transit_state_bkref (re_match_context_t
*mctx
,
147 const re_node_set
*nodes
) internal_function
;
148 static reg_errcode_t
get_subexp (re_match_context_t
*mctx
,
149 int bkref_node
, int bkref_str_idx
) internal_function
;
150 static reg_errcode_t
get_subexp_sub (re_match_context_t
*mctx
,
151 const re_sub_match_top_t
*sub_top
,
152 re_sub_match_last_t
*sub_last
,
153 int bkref_node
, int bkref_str
) internal_function
;
154 static int find_subexp_node (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
155 int subexp_idx
, int type
) internal_function
;
156 static reg_errcode_t
check_arrival (re_match_context_t
*mctx
,
157 state_array_t
*path
, int top_node
,
158 int top_str
, int last_node
, int last_str
,
159 int type
) internal_function
;
160 static reg_errcode_t
check_arrival_add_next_nodes (re_match_context_t
*mctx
,
162 re_node_set
*cur_nodes
,
163 re_node_set
*next_nodes
) internal_function
;
164 static reg_errcode_t
check_arrival_expand_ecl (re_dfa_t
*dfa
,
165 re_node_set
*cur_nodes
,
166 int ex_subexp
, int type
) internal_function
;
167 static reg_errcode_t
check_arrival_expand_ecl_sub (re_dfa_t
*dfa
,
168 re_node_set
*dst_nodes
,
169 int target
, int ex_subexp
,
170 int type
) internal_function
;
171 static reg_errcode_t
expand_bkref_cache (re_match_context_t
*mctx
,
172 re_node_set
*cur_nodes
, int cur_str
,
173 int last_str
, int subexp_num
,
174 int type
) internal_function
;
175 static re_dfastate_t
**build_trtable (re_dfa_t
*dfa
,
176 re_dfastate_t
*state
) internal_function
;
177 #ifdef RE_ENABLE_I18N
178 static int check_node_accept_bytes (re_dfa_t
*dfa
, int node_idx
,
179 const re_string_t
*input
, int idx
) internal_function
;
181 static unsigned int find_collation_sequence_value (const unsigned char *mbs
,
182 size_t name_len
) internal_function
;
184 #endif /* RE_ENABLE_I18N */
185 static int group_nodes_into_DFAstates (re_dfa_t
*dfa
,
186 const re_dfastate_t
*state
,
187 re_node_set
*states_node
,
188 bitset
*states_ch
) internal_function
;
189 static int check_node_accept (const re_match_context_t
*mctx
,
190 const re_token_t
*node
, int idx
) internal_function
;
191 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
) internal_function
;
193 /* Entry point for POSIX code. */
195 /* regexec searches for a given pattern, specified by PREG, in the
198 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
199 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
200 least NMATCH elements, and we set them to the offsets of the
201 corresponding matched substrings.
203 EFLAGS specifies `execution flags' which affect matching: if
204 REG_NOTBOL is set, then ^ does not match at the beginning of the
205 string; if REG_NOTEOL is set, then $ does not match at the end.
207 We return 0 if we find a match and REG_NOMATCH if not. */
210 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
211 const regex_t
*__restrict preg
;
212 const char *__restrict string
;
219 if (eflags
& REG_STARTEND
)
221 start
= pmatch
[0].rm_so
;
222 length
= pmatch
[0].rm_eo
;
227 length
= strlen (string
);
230 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
231 length
, 0, NULL
, eflags
);
233 err
= re_search_internal (preg
, string
, length
, start
, length
- start
,
234 length
, nmatch
, pmatch
, eflags
);
235 return err
!= REG_NOERROR
;
238 weak_alias (__regexec
, regexec
)
241 /* Entry points for GNU code. */
243 /* re_match, re_search, re_match_2, re_search_2
245 The former two functions operate on STRING with length LENGTH,
246 while the later two operate on concatenation of STRING1 and STRING2
247 with lengths LENGTH1 and LENGTH2, respectively.
249 re_match() matches the compiled pattern in BUFP against the string,
250 starting at index START.
252 re_search() first tries matching at index START, then it tries to match
253 starting from index START + 1, and so on. The last start position tried
254 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
257 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
258 the first STOP characters of the concatenation of the strings should be
261 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
262 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
263 computed relative to the concatenation, not relative to the individual
266 On success, re_match* functions return the length of the match, re_search*
267 return the position of the start of the match. Return value -1 means no
268 match was found and -2 indicates an internal error. */
271 re_match (bufp
, string
, length
, start
, regs
)
272 struct re_pattern_buffer
*bufp
;
275 struct re_registers
*regs
;
277 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
280 weak_alias (__re_match
, re_match
)
284 re_search (bufp
, string
, length
, start
, range
, regs
)
285 struct re_pattern_buffer
*bufp
;
287 int length
, start
, range
;
288 struct re_registers
*regs
;
290 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
293 weak_alias (__re_search
, re_search
)
297 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
298 struct re_pattern_buffer
*bufp
;
299 const char *string1
, *string2
;
300 int length1
, length2
, start
, stop
;
301 struct re_registers
*regs
;
303 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
304 start
, 0, regs
, stop
, 1);
307 weak_alias (__re_match_2
, re_match_2
)
311 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
312 struct re_pattern_buffer
*bufp
;
313 const char *string1
, *string2
;
314 int length1
, length2
, start
, range
, stop
;
315 struct re_registers
*regs
;
317 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
318 start
, range
, regs
, stop
, 0);
321 weak_alias (__re_search_2
, re_search_2
)
325 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
327 struct re_pattern_buffer
*bufp
;
328 const char *string1
, *string2
;
329 int length1
, length2
, start
, range
, stop
, ret_len
;
330 struct re_registers
*regs
;
334 int len
= length1
+ length2
;
337 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
340 /* Concatenate the strings. */
344 char *s
= re_malloc (char, len
);
346 if (BE (s
== NULL
, 0))
348 memcpy (s
, string1
, length1
);
349 memcpy (s
+ length1
, string2
, length2
);
358 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
361 re_free ((char *) str
);
365 /* The parameters have the same meaning as those of re_search.
366 Additional parameters:
367 If RET_LEN is nonzero the length of the match is returned (re_match style);
368 otherwise the position of the match is returned. */
371 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
372 struct re_pattern_buffer
*bufp
;
374 int length
, start
, range
, stop
, ret_len
;
375 struct re_registers
*regs
;
377 reg_errcode_t result
;
382 /* Check for out-of-range. */
383 if (BE (start
< 0 || start
> length
, 0))
385 if (BE (start
+ range
> length
, 0))
386 range
= length
- start
;
387 else if (BE (start
+ range
< 0, 0))
390 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
391 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
393 /* Compile fastmap if we haven't yet. */
394 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
395 re_compile_fastmap (bufp
);
397 if (BE (bufp
->no_sub
, 0))
400 /* We need at least 1 register. */
403 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
404 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
406 nregs
= regs
->num_regs
;
407 if (BE (nregs
< 1, 0))
409 /* Nothing can be copied to regs. */
415 nregs
= bufp
->re_nsub
+ 1;
416 pmatch
= re_malloc (regmatch_t
, nregs
);
417 if (BE (pmatch
== NULL
, 0))
420 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
421 nregs
, pmatch
, eflags
);
425 /* I hope we needn't fill ther regs with -1's when no match was found. */
426 if (result
!= REG_NOERROR
)
428 else if (regs
!= NULL
)
430 /* If caller wants register contents data back, copy them. */
431 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
432 bufp
->regs_allocated
);
433 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
437 if (BE (rval
== 0, 1))
441 assert (pmatch
[0].rm_so
== start
);
442 rval
= pmatch
[0].rm_eo
- start
;
445 rval
= pmatch
[0].rm_so
;
452 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
453 struct re_registers
*regs
;
455 int nregs
, regs_allocated
;
457 int rval
= REGS_REALLOCATE
;
459 int need_regs
= nregs
+ 1;
460 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
463 /* Have the register data arrays been allocated? */
464 if (regs_allocated
== REGS_UNALLOCATED
)
465 { /* No. So allocate them with malloc. */
466 regs
->start
= re_malloc (regoff_t
, need_regs
);
467 regs
->end
= re_malloc (regoff_t
, need_regs
);
468 if (BE (regs
->start
== NULL
, 0) || BE (regs
->end
== NULL
, 0))
469 return REGS_UNALLOCATED
;
470 regs
->num_regs
= need_regs
;
472 else if (regs_allocated
== REGS_REALLOCATE
)
473 { /* Yes. If we need more elements than were already
474 allocated, reallocate them. If we need fewer, just
476 if (BE (need_regs
> regs
->num_regs
, 0))
478 regoff_t
*new_start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
479 regoff_t
*new_end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
480 if (BE (new_start
== NULL
, 0) || BE (new_end
== NULL
, 0))
481 return REGS_UNALLOCATED
;
482 regs
->start
= new_start
;
484 regs
->num_regs
= need_regs
;
489 assert (regs_allocated
== REGS_FIXED
);
490 /* This function may not be called with REGS_FIXED and nregs too big. */
491 assert (regs
->num_regs
>= nregs
);
496 for (i
= 0; i
< nregs
; ++i
)
498 regs
->start
[i
] = pmatch
[i
].rm_so
;
499 regs
->end
[i
] = pmatch
[i
].rm_eo
;
501 for ( ; i
< regs
->num_regs
; ++i
)
502 regs
->start
[i
] = regs
->end
[i
] = -1;
507 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
508 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
509 this memory for recording register information. STARTS and ENDS
510 must be allocated using the malloc library routine, and must each
511 be at least NUM_REGS * sizeof (regoff_t) bytes long.
513 If NUM_REGS == 0, then subsequent matches should allocate their own
516 Unless this function is called, the first search or match using
517 PATTERN_BUFFER will allocate its own register data, without
518 freeing the old data. */
521 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
522 struct re_pattern_buffer
*bufp
;
523 struct re_registers
*regs
;
525 regoff_t
*starts
, *ends
;
529 bufp
->regs_allocated
= REGS_REALLOCATE
;
530 regs
->num_regs
= num_regs
;
531 regs
->start
= starts
;
536 bufp
->regs_allocated
= REGS_UNALLOCATED
;
538 regs
->start
= regs
->end
= (regoff_t
*) 0;
542 weak_alias (__re_set_registers
, re_set_registers
)
545 /* Entry points compatible with 4.2 BSD regex library. We don't define
546 them unless specifically requested. */
548 #if defined _REGEX_RE_COMP || defined _LIBC
556 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
558 #endif /* _REGEX_RE_COMP */
560 static re_node_set empty_set
;
562 /* Internal entry point. */
564 /* Searches for a compiled pattern PREG in the string STRING, whose
565 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
566 mingings with regexec. START, and RANGE have the same meanings
568 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
569 otherwise return the error code.
570 Note: We assume front end functions already check ranges.
571 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
574 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
578 int length
, start
, range
, stop
, eflags
;
583 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
584 int left_lim
, right_lim
, incr
;
585 int fl_longest_match
, match_first
, match_last
= -1;
586 int fast_translate
, sb
;
587 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
588 re_match_context_t mctx
= { .dfa
= dfa
};
590 re_match_context_t mctx
;
592 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
593 && range
&& !preg
->can_be_null
) ? preg
->fastmap
: NULL
);
595 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
596 memset (&mctx
, '\0', sizeof (re_match_context_t
));
600 /* Check if the DFA haven't been compiled. */
601 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
602 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
603 || dfa
->init_state_begbuf
== NULL
, 0))
607 /* We assume front-end functions already check them. */
608 assert (start
+ range
>= 0 && start
+ range
<= length
);
611 /* If initial states with non-begbuf contexts have no elements,
612 the regex must be anchored. If preg->newline_anchor is set,
613 we'll never use init_state_nl, so do not check it. */
614 if (dfa
->init_state
->nodes
.nelem
== 0
615 && dfa
->init_state_word
->nodes
.nelem
== 0
616 && (dfa
->init_state_nl
->nodes
.nelem
== 0
617 || !preg
->newline_anchor
))
619 if (start
!= 0 && start
+ range
!= 0)
624 re_node_set_init_empty (&empty_set
);
626 /* We must check the longest matching, if nmatch > 0. */
627 fl_longest_match
= (nmatch
!= 0 || dfa
->nbackref
);
629 err
= re_string_allocate (&mctx
.input
, string
, length
, dfa
->nodes_len
+ 1,
630 preg
->translate
, preg
->syntax
& RE_ICASE
, dfa
);
631 if (BE (err
!= REG_NOERROR
, 0))
633 mctx
.input
.stop
= stop
;
634 mctx
.input
.raw_stop
= stop
;
635 mctx
.input
.newline_anchor
= preg
->newline_anchor
;
637 err
= match_ctx_init (&mctx
, eflags
, dfa
->nbackref
* 2);
638 if (BE (err
!= REG_NOERROR
, 0))
641 /* We will log all the DFA states through which the dfa pass,
642 if nmatch > 1, or this dfa has "multibyte node", which is a
643 back-reference or a node which can accept multibyte character or
644 multi character collating element. */
645 if (nmatch
> 1 || dfa
->has_mb_node
)
647 mctx
.state_log
= re_malloc (re_dfastate_t
*, mctx
.input
.bufs_len
+ 1);
648 if (BE (mctx
.state_log
== NULL
, 0))
655 mctx
.state_log
= NULL
;
658 mctx
.input
.tip_context
= (eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
659 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
;
661 /* Check incrementally whether of not the input string match. */
662 incr
= (range
< 0) ? -1 : 1;
663 left_lim
= (range
< 0) ? start
+ range
: start
;
664 right_lim
= (range
< 0) ? start
: start
+ range
;
665 sb
= dfa
->mb_cur_max
== 1;
666 fast_translate
= sb
|| !(preg
->syntax
& RE_ICASE
|| preg
->translate
);
670 /* At first get the current byte from input string. */
673 if (BE (fast_translate
, 1))
675 unsigned RE_TRANSLATE_TYPE t
676 = (unsigned RE_TRANSLATE_TYPE
) preg
->translate
;
677 if (BE (range
>= 0, 1))
679 if (BE (t
!= NULL
, 0))
681 while (BE (match_first
< right_lim
, 1)
682 && !fastmap
[t
[(unsigned char) string
[match_first
]]])
687 while (BE (match_first
< right_lim
, 1)
688 && !fastmap
[(unsigned char) string
[match_first
]])
691 if (BE (match_first
== right_lim
, 0))
693 int ch
= match_first
>= length
694 ? 0 : (unsigned char) string
[match_first
];
695 if (!fastmap
[t
? t
[ch
] : ch
])
701 while (match_first
>= left_lim
)
703 int ch
= match_first
>= length
704 ? 0 : (unsigned char) string
[match_first
];
705 if (fastmap
[t
? t
[ch
] : ch
])
709 if (match_first
< left_lim
)
719 /* In this case, we can't determine easily the current byte,
720 since it might be a component byte of a multibyte
721 character. Then we use the constructed buffer
723 /* If MATCH_FIRST is out of the valid range, reconstruct the
725 if (mctx
.input
.raw_mbs_idx
+ mctx
.input
.valid_raw_len
727 || match_first
< mctx
.input
.raw_mbs_idx
)
729 err
= re_string_reconstruct (&mctx
.input
, match_first
,
731 if (BE (err
!= REG_NOERROR
, 0))
734 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
735 Note that MATCH_FIRST must not be smaller than 0. */
736 ch
= ((match_first
>= length
) ? 0
737 : re_string_byte_at (&mctx
.input
,
739 - mctx
.input
.raw_mbs_idx
));
744 while (match_first
>= left_lim
&& match_first
<= right_lim
);
750 /* Reconstruct the buffers so that the matcher can assume that
751 the matching starts from the beginning of the buffer. */
752 err
= re_string_reconstruct (&mctx
.input
, match_first
, eflags
);
753 if (BE (err
!= REG_NOERROR
, 0))
755 #ifdef RE_ENABLE_I18N
756 /* Eliminate it when it is a component of a multibyte character
757 and isn't the head of a multibyte character. */
758 if (sb
|| re_string_first_byte (&mctx
.input
, 0))
761 /* It seems to be appropriate one, then use the matcher. */
762 /* We assume that the matching starts from 0. */
763 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_mb_elem_len
= 0;
764 match_last
= check_matching (&mctx
, fl_longest_match
,
765 range
>= 0 ? &match_first
: NULL
);
766 if (match_last
!= -1)
768 if (BE (match_last
== -2, 0))
775 mctx
.match_last
= match_last
;
776 if ((!preg
->no_sub
&& nmatch
> 1) || dfa
->nbackref
)
778 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
779 mctx
.last_node
= check_halt_state_context (&mctx
, pstate
,
782 if ((!preg
->no_sub
&& nmatch
> 1 && dfa
->has_plural_match
)
785 err
= prune_impossible_nodes (&mctx
);
786 if (err
== REG_NOERROR
)
788 if (BE (err
!= REG_NOMATCH
, 0))
793 break; /* We found a match. */
796 match_ctx_clean (&mctx
);
798 /* Update counter. */
800 if (match_first
< left_lim
|| right_lim
< match_first
)
804 /* Set pmatch[] if we need. */
805 if (match_last
!= -1 && nmatch
> 0)
809 /* Initialize registers. */
810 for (reg_idx
= 1; reg_idx
< nmatch
; ++reg_idx
)
811 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
813 /* Set the points where matching start/end. */
815 pmatch
[0].rm_eo
= mctx
.match_last
;
817 if (!preg
->no_sub
&& nmatch
> 1)
819 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
,
820 dfa
->has_plural_match
&& dfa
->nbackref
> 0);
821 if (BE (err
!= REG_NOERROR
, 0))
825 /* At last, add the offset to the each registers, since we slided
826 the buffers so that we could assume that the matching starts
828 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
829 if (pmatch
[reg_idx
].rm_so
!= -1)
831 #ifdef RE_ENABLE_I18N
832 if (BE (mctx
.input
.offsets_needed
!= 0, 0))
834 if (pmatch
[reg_idx
].rm_so
== mctx
.input
.valid_len
)
835 pmatch
[reg_idx
].rm_so
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
837 pmatch
[reg_idx
].rm_so
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_so
];
838 if (pmatch
[reg_idx
].rm_eo
== mctx
.input
.valid_len
)
839 pmatch
[reg_idx
].rm_eo
+= mctx
.input
.valid_raw_len
- mctx
.input
.valid_len
;
841 pmatch
[reg_idx
].rm_eo
= mctx
.input
.offsets
[pmatch
[reg_idx
].rm_eo
];
844 assert (mctx
.input
.offsets_needed
== 0);
846 pmatch
[reg_idx
].rm_so
+= match_first
;
847 pmatch
[reg_idx
].rm_eo
+= match_first
;
850 err
= (match_last
== -1) ? REG_NOMATCH
: REG_NOERROR
;
852 re_free (mctx
.state_log
);
854 match_ctx_free (&mctx
);
855 re_string_destruct (&mctx
.input
);
860 prune_impossible_nodes (mctx
)
861 re_match_context_t
*mctx
;
863 re_dfa_t
*const dfa
= mctx
->dfa
;
864 int halt_node
, match_last
;
866 re_dfastate_t
**sifted_states
;
867 re_dfastate_t
**lim_states
= NULL
;
868 re_sift_context_t sctx
;
870 assert (mctx
->state_log
!= NULL
);
872 match_last
= mctx
->match_last
;
873 halt_node
= mctx
->last_node
;
874 sifted_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
875 if (BE (sifted_states
== NULL
, 0))
882 lim_states
= re_malloc (re_dfastate_t
*, match_last
+ 1);
883 if (BE (lim_states
== NULL
, 0))
890 memset (lim_states
, '\0',
891 sizeof (re_dfastate_t
*) * (match_last
+ 1));
892 match_ctx_clear_flag (mctx
);
893 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
895 ret
= sift_states_backward (mctx
, &sctx
);
896 re_node_set_free (&sctx
.limits
);
897 if (BE (ret
!= REG_NOERROR
, 0))
899 if (sifted_states
[0] != NULL
|| lim_states
[0] != NULL
)
909 } while (mctx
->state_log
[match_last
] == NULL
910 || !mctx
->state_log
[match_last
]->halt
);
911 halt_node
= check_halt_state_context (mctx
,
912 mctx
->state_log
[match_last
],
915 ret
= merge_state_array (dfa
, sifted_states
, lim_states
,
917 re_free (lim_states
);
919 if (BE (ret
!= REG_NOERROR
, 0))
924 sift_ctx_init (&sctx
, sifted_states
, lim_states
, halt_node
,
926 ret
= sift_states_backward (mctx
, &sctx
);
927 re_node_set_free (&sctx
.limits
);
928 if (BE (ret
!= REG_NOERROR
, 0))
931 re_free (mctx
->state_log
);
932 mctx
->state_log
= sifted_states
;
933 sifted_states
= NULL
;
934 mctx
->last_node
= halt_node
;
935 mctx
->match_last
= match_last
;
938 re_free (sifted_states
);
939 re_free (lim_states
);
943 /* Acquire an initial state and return it.
944 We must select appropriate initial state depending on the context,
945 since initial states may have constraints like "\<", "^", etc.. */
947 static inline re_dfastate_t
*
948 acquire_init_state_context (err
, mctx
, idx
)
950 const re_match_context_t
*mctx
;
953 re_dfa_t
*const dfa
= mctx
->dfa
;
954 if (dfa
->init_state
->has_constraint
)
956 unsigned int context
;
957 context
= re_string_context_at (&mctx
->input
, idx
- 1, mctx
->eflags
);
958 if (IS_WORD_CONTEXT (context
))
959 return dfa
->init_state_word
;
960 else if (IS_ORDINARY_CONTEXT (context
))
961 return dfa
->init_state
;
962 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
963 return dfa
->init_state_begbuf
;
964 else if (IS_NEWLINE_CONTEXT (context
))
965 return dfa
->init_state_nl
;
966 else if (IS_BEGBUF_CONTEXT (context
))
968 /* It is relatively rare case, then calculate on demand. */
969 return re_acquire_state_context (err
, dfa
,
970 dfa
->init_state
->entrance_nodes
,
974 /* Must not happen? */
975 return dfa
->init_state
;
978 return dfa
->init_state
;
981 /* Check whether the regular expression match input string INPUT or not,
982 and return the index where the matching end, return -1 if not match,
983 or return -2 in case of an error.
984 FL_LONGEST_MATCH means we want the POSIX longest matching.
985 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
986 next place where we may want to try matching.
987 Note that the matcher assume that the maching starts from the current
988 index of the buffer. */
991 check_matching (mctx
, fl_longest_match
, p_match_first
)
992 re_match_context_t
*mctx
;
993 int fl_longest_match
;
996 re_dfa_t
*const dfa
= mctx
->dfa
;
1000 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
1001 re_dfastate_t
*cur_state
;
1002 int at_init_state
= p_match_first
!= NULL
;
1003 int next_start_idx
= cur_str_idx
;
1006 cur_state
= acquire_init_state_context (&err
, mctx
, cur_str_idx
);
1007 /* An initial state must not be NULL (invalid). */
1008 if (BE (cur_state
== NULL
, 0))
1010 assert (err
== REG_ESPACE
);
1014 if (mctx
->state_log
!= NULL
)
1016 mctx
->state_log
[cur_str_idx
] = cur_state
;
1018 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1019 later. E.g. Processing back references. */
1020 if (BE (dfa
->nbackref
, 0))
1023 err
= check_subexp_matching_top (mctx
, &cur_state
->nodes
, 0);
1024 if (BE (err
!= REG_NOERROR
, 0))
1027 if (cur_state
->has_backref
)
1029 err
= transit_state_bkref (mctx
, &cur_state
->nodes
);
1030 if (BE (err
!= REG_NOERROR
, 0))
1036 /* If the RE accepts NULL string. */
1037 if (BE (cur_state
->halt
, 0))
1039 if (!cur_state
->has_constraint
1040 || check_halt_state_context (mctx
, cur_state
, cur_str_idx
))
1042 if (!fl_longest_match
)
1046 match_last
= cur_str_idx
;
1052 while (!re_string_eoi (&mctx
->input
))
1054 re_dfastate_t
*old_state
= cur_state
;
1055 cur_state
= transit_state (&err
, mctx
, cur_state
);
1056 if (mctx
->state_log
!= NULL
)
1057 cur_state
= merge_state_with_log (&err
, mctx
, cur_state
);
1059 if (cur_state
== NULL
)
1061 /* Reached the invalid state or an error. Try to recover a valid
1062 state using the state log, if available and if we have not
1063 already found a valid (even if not the longest) match. */
1064 if (BE (err
!= REG_NOERROR
, 0))
1067 if (mctx
->state_log
== NULL
1068 || (match
&& !fl_longest_match
)
1069 || (cur_state
= find_recover_state (&err
, mctx
)) == NULL
)
1075 if (old_state
== cur_state
)
1076 next_start_idx
= re_string_cur_idx (&mctx
->input
);
1081 if (cur_state
->halt
)
1083 /* Reached a halt state.
1084 Check the halt state can satisfy the current context. */
1085 if (!cur_state
->has_constraint
1086 || check_halt_state_context (mctx
, cur_state
,
1087 re_string_cur_idx (&mctx
->input
)))
1089 /* We found an appropriate halt state. */
1090 match_last
= re_string_cur_idx (&mctx
->input
);
1092 if (!fl_longest_match
)
1098 if (match_last
== -1 && p_match_first
)
1099 *p_match_first
+= next_start_idx
;
1104 /* Check NODE match the current context. */
1106 static int check_halt_node_context (dfa
, node
, context
)
1107 const re_dfa_t
*dfa
;
1109 unsigned int context
;
1111 re_token_type_t type
= dfa
->nodes
[node
].type
;
1112 unsigned int constraint
= dfa
->nodes
[node
].constraint
;
1113 if (type
!= END_OF_RE
)
1117 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint
, context
))
1122 /* Check the halt state STATE match the current context.
1123 Return 0 if not match, if the node, STATE has, is a halt node and
1124 match the context, return the node. */
1127 check_halt_state_context (mctx
, state
, idx
)
1128 const re_match_context_t
*mctx
;
1129 const re_dfastate_t
*state
;
1133 unsigned int context
;
1135 assert (state
->halt
);
1137 context
= re_string_context_at (&mctx
->input
, idx
, mctx
->eflags
);
1138 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
1139 if (check_halt_node_context (mctx
->dfa
, state
->nodes
.elems
[i
], context
))
1140 return state
->nodes
.elems
[i
];
1144 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1145 corresponding to the DFA).
1146 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1150 proceed_next_node (mctx
, nregs
, regs
, pidx
, node
, eps_via_nodes
, fs
)
1151 const re_match_context_t
*mctx
;
1153 int nregs
, *pidx
, node
;
1154 re_node_set
*eps_via_nodes
;
1155 struct re_fail_stack_t
*fs
;
1157 re_dfa_t
*const dfa
= mctx
->dfa
;
1158 int i
, err
, dest_node
;
1160 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
1162 re_node_set
*cur_nodes
= &mctx
->state_log
[*pidx
]->nodes
;
1163 int ndest
, dest_nodes
[2];
1164 err
= re_node_set_insert (eps_via_nodes
, node
);
1165 if (BE (err
< 0, 0))
1167 /* Pick up valid destinations. */
1168 for (ndest
= 0, i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1170 int candidate
= dfa
->edests
[node
].elems
[i
];
1171 if (!re_node_set_contains (cur_nodes
, candidate
))
1173 dest_nodes
[0] = (ndest
== 0) ? candidate
: dest_nodes
[0];
1174 dest_nodes
[1] = (ndest
== 1) ? candidate
: dest_nodes
[1];
1178 return ndest
== 0 ? -1 : (ndest
== 1 ? dest_nodes
[0] : 0);
1179 /* In order to avoid infinite loop like "(a*)*". */
1180 if (re_node_set_contains (eps_via_nodes
, dest_nodes
[0]))
1181 return dest_nodes
[1];
1183 && push_fail_stack (fs
, *pidx
, dest_nodes
, nregs
, regs
,
1186 return dest_nodes
[0];
1191 re_token_type_t type
= dfa
->nodes
[node
].type
;
1193 #ifdef RE_ENABLE_I18N
1194 if (ACCEPT_MB_NODE (type
))
1195 naccepted
= check_node_accept_bytes (dfa
, node
, &mctx
->input
, *pidx
);
1197 #endif /* RE_ENABLE_I18N */
1198 if (type
== OP_BACK_REF
)
1200 int subexp_idx
= dfa
->nodes
[node
].opr
.idx
;
1201 naccepted
= regs
[subexp_idx
].rm_eo
- regs
[subexp_idx
].rm_so
;
1204 if (regs
[subexp_idx
].rm_so
== -1 || regs
[subexp_idx
].rm_eo
== -1)
1208 char *buf
= (char *) re_string_get_buffer (&mctx
->input
);
1209 if (memcmp (buf
+ regs
[subexp_idx
].rm_so
, buf
+ *pidx
,
1217 err
= re_node_set_insert (eps_via_nodes
, node
);
1218 if (BE (err
< 0, 0))
1220 dest_node
= dfa
->edests
[node
].elems
[0];
1221 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1228 || check_node_accept (mctx
, dfa
->nodes
+ node
, *pidx
))
1230 dest_node
= dfa
->nexts
[node
];
1231 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
1232 if (fs
&& (*pidx
> mctx
->match_last
|| mctx
->state_log
[*pidx
] == NULL
1233 || !re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
1236 re_node_set_empty (eps_via_nodes
);
1243 static reg_errcode_t
1244 push_fail_stack (fs
, str_idx
, dests
, nregs
, regs
, eps_via_nodes
)
1245 struct re_fail_stack_t
*fs
;
1246 int str_idx
, *dests
, nregs
;
1248 re_node_set
*eps_via_nodes
;
1251 int num
= fs
->num
++;
1252 if (fs
->num
== fs
->alloc
)
1254 struct re_fail_stack_ent_t
*new_array
;
1255 new_array
= realloc (fs
->stack
, (sizeof (struct re_fail_stack_ent_t
)
1257 if (new_array
== NULL
)
1260 fs
->stack
= new_array
;
1262 fs
->stack
[num
].idx
= str_idx
;
1263 fs
->stack
[num
].node
= dests
[1];
1264 fs
->stack
[num
].regs
= re_malloc (regmatch_t
, nregs
);
1265 if (fs
->stack
[num
].regs
== NULL
)
1267 memcpy (fs
->stack
[num
].regs
, regs
, sizeof (regmatch_t
) * nregs
);
1268 err
= re_node_set_init_copy (&fs
->stack
[num
].eps_via_nodes
, eps_via_nodes
);
1273 pop_fail_stack (fs
, pidx
, nregs
, regs
, eps_via_nodes
)
1274 struct re_fail_stack_t
*fs
;
1277 re_node_set
*eps_via_nodes
;
1279 int num
= --fs
->num
;
1281 *pidx
= fs
->stack
[num
].idx
;
1282 memcpy (regs
, fs
->stack
[num
].regs
, sizeof (regmatch_t
) * nregs
);
1283 re_node_set_free (eps_via_nodes
);
1284 re_free (fs
->stack
[num
].regs
);
1285 *eps_via_nodes
= fs
->stack
[num
].eps_via_nodes
;
1286 return fs
->stack
[num
].node
;
1289 /* Set the positions where the subexpressions are starts/ends to registers
1291 Note: We assume that pmatch[0] is already set, and
1292 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1294 static reg_errcode_t
1295 set_regs (preg
, mctx
, nmatch
, pmatch
, fl_backtrack
)
1296 const regex_t
*preg
;
1297 const re_match_context_t
*mctx
;
1302 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1303 int idx
, cur_node
, real_nmatch
;
1304 re_node_set eps_via_nodes
;
1305 struct re_fail_stack_t
*fs
;
1306 struct re_fail_stack_t fs_body
= { 0, 2, NULL
};
1307 regmatch_t
*prev_idx_match
;
1310 assert (nmatch
> 1);
1311 assert (mctx
->state_log
!= NULL
);
1316 fs
->stack
= re_malloc (struct re_fail_stack_ent_t
, fs
->alloc
);
1317 if (fs
->stack
== NULL
)
1323 cur_node
= dfa
->init_node
;
1324 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1325 re_node_set_init_empty (&eps_via_nodes
);
1327 prev_idx_match
= (regmatch_t
*) alloca (sizeof (regmatch_t
) * real_nmatch
);
1328 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * real_nmatch
);
1330 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1332 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, idx
, real_nmatch
);
1334 if (idx
== pmatch
[0].rm_eo
&& cur_node
== mctx
->last_node
)
1339 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
1340 if (pmatch
[reg_idx
].rm_so
> -1 && pmatch
[reg_idx
].rm_eo
== -1)
1342 if (reg_idx
== nmatch
)
1344 re_node_set_free (&eps_via_nodes
);
1345 return free_fail_stack_return (fs
);
1347 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1352 re_node_set_free (&eps_via_nodes
);
1357 /* Proceed to next node. */
1358 cur_node
= proceed_next_node (mctx
, nmatch
, pmatch
, &idx
, cur_node
,
1359 &eps_via_nodes
, fs
);
1361 if (BE (cur_node
< 0, 0))
1363 if (BE (cur_node
== -2, 0))
1365 re_node_set_free (&eps_via_nodes
);
1366 free_fail_stack_return (fs
);
1370 cur_node
= pop_fail_stack (fs
, &idx
, nmatch
, pmatch
,
1374 re_node_set_free (&eps_via_nodes
);
1379 re_node_set_free (&eps_via_nodes
);
1380 return free_fail_stack_return (fs
);
1383 static reg_errcode_t
1384 free_fail_stack_return (fs
)
1385 struct re_fail_stack_t
*fs
;
1390 for (fs_idx
= 0; fs_idx
< fs
->num
; ++fs_idx
)
1392 re_node_set_free (&fs
->stack
[fs_idx
].eps_via_nodes
);
1393 re_free (fs
->stack
[fs_idx
].regs
);
1395 re_free (fs
->stack
);
1401 update_regs (dfa
, pmatch
, prev_idx_match
, cur_node
, cur_idx
, nmatch
)
1403 regmatch_t
*pmatch
, *prev_idx_match
;
1404 int cur_node
, cur_idx
, nmatch
;
1406 int type
= dfa
->nodes
[cur_node
].type
;
1407 if (type
== OP_OPEN_SUBEXP
)
1409 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1411 /* We are at the first node of this sub expression. */
1412 if (reg_num
< nmatch
)
1414 pmatch
[reg_num
].rm_so
= cur_idx
;
1415 pmatch
[reg_num
].rm_eo
= -1;
1418 else if (type
== OP_CLOSE_SUBEXP
)
1420 int reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1421 if (reg_num
< nmatch
)
1423 /* We are at the last node of this sub expression. */
1424 if (pmatch
[reg_num
].rm_so
< cur_idx
)
1426 pmatch
[reg_num
].rm_eo
= cur_idx
;
1427 /* This is a non-empty match or we are not inside an optional
1428 subexpression. Accept this right away. */
1429 memcpy (prev_idx_match
, pmatch
, sizeof (regmatch_t
) * nmatch
);
1433 if (dfa
->nodes
[cur_node
].opt_subexp
1434 && prev_idx_match
[reg_num
].rm_so
!= -1)
1435 /* We transited through an empty match for an optional
1436 subexpression, like (a?)*, and this is not the subexp's
1437 first match. Copy back the old content of the registers
1438 so that matches of an inner subexpression are undone as
1439 well, like in ((a?))*. */
1440 memcpy (pmatch
, prev_idx_match
, sizeof (regmatch_t
) * nmatch
);
1442 /* We completed a subexpression, but it may be part of
1443 an optional one, so do not update PREV_IDX_MATCH. */
1444 pmatch
[reg_num
].rm_eo
= cur_idx
;
1450 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1451 and sift the nodes in each states according to the following rules.
1452 Updated state_log will be wrote to STATE_LOG.
1454 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1455 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1456 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1457 the LAST_NODE, we throw away the node `a'.
1458 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1459 string `s' and transit to `b':
1460 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1462 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1463 thrown away, we throw away the node `a'.
1464 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1465 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1467 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1468 we throw away the node `a'. */
1470 #define STATE_NODE_CONTAINS(state,node) \
1471 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1473 static reg_errcode_t
1474 sift_states_backward (mctx
, sctx
)
1475 re_match_context_t
*mctx
;
1476 re_sift_context_t
*sctx
;
1478 re_dfa_t
*const dfa
= mctx
->dfa
;
1481 int str_idx
= sctx
->last_str_idx
;
1482 re_node_set cur_dest
;
1483 re_node_set
*cur_src
; /* Points the state_log[str_idx]->nodes */
1486 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1488 cur_src
= &mctx
->state_log
[str_idx
]->nodes
;
1490 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1491 transit to the last_node and the last_node itself. */
1492 err
= re_node_set_init_1 (&cur_dest
, sctx
->last_node
);
1493 if (BE (err
!= REG_NOERROR
, 0))
1495 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1496 if (BE (err
!= REG_NOERROR
, 0))
1499 /* Then check each states in the state_log. */
1503 /* Update counters. */
1504 null_cnt
= (sctx
->sifted_states
[str_idx
] == NULL
) ? null_cnt
+ 1 : 0;
1505 if (null_cnt
> mctx
->max_mb_elem_len
)
1507 memset (sctx
->sifted_states
, '\0',
1508 sizeof (re_dfastate_t
*) * str_idx
);
1509 re_node_set_free (&cur_dest
);
1512 re_node_set_empty (&cur_dest
);
1514 cur_src
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1515 : &mctx
->state_log
[str_idx
]->nodes
);
1517 /* Then build the next sifted state.
1518 We build the next sifted state on `cur_dest', and update
1519 `sifted_states[str_idx]' with `cur_dest'.
1521 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1522 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1523 for (i
= 0; i
< cur_src
->nelem
; i
++)
1525 int prev_node
= cur_src
->elems
[i
];
1527 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1529 if (IS_EPSILON_NODE (type
))
1531 #ifdef RE_ENABLE_I18N
1532 /* If the node may accept `multi byte'. */
1533 if (ACCEPT_MB_NODE (type
))
1534 naccepted
= sift_states_iter_mb (mctx
, sctx
, prev_node
,
1535 str_idx
, sctx
->last_str_idx
);
1537 #endif /* RE_ENABLE_I18N */
1538 /* We don't check backreferences here.
1539 See update_cur_sifted_state(). */
1542 && check_node_accept (mctx
, dfa
->nodes
+ prev_node
, str_idx
)
1543 && STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ 1],
1544 dfa
->nexts
[prev_node
]))
1550 if (sctx
->limits
.nelem
)
1552 int to_idx
= str_idx
+ naccepted
;
1553 if (check_dst_limits (mctx
, &sctx
->limits
,
1554 dfa
->nexts
[prev_node
], to_idx
,
1555 prev_node
, str_idx
))
1558 ret
= re_node_set_insert (&cur_dest
, prev_node
);
1559 if (BE (ret
== -1, 0))
1566 /* Add all the nodes which satisfy the following conditions:
1567 - It can epsilon transit to a node in CUR_DEST.
1569 And update state_log. */
1570 err
= update_cur_sifted_state (mctx
, sctx
, str_idx
, &cur_dest
);
1571 if (BE (err
!= REG_NOERROR
, 0))
1576 re_node_set_free (&cur_dest
);
1580 /* Helper functions. */
1582 static reg_errcode_t
1583 clean_state_log_if_needed (mctx
, next_state_log_idx
)
1584 re_match_context_t
*mctx
;
1585 int next_state_log_idx
;
1587 int top
= mctx
->state_log_top
;
1589 if (next_state_log_idx
>= mctx
->input
.bufs_len
1590 || (next_state_log_idx
>= mctx
->input
.valid_len
1591 && mctx
->input
.valid_len
< mctx
->input
.len
))
1594 err
= extend_buffers (mctx
);
1595 if (BE (err
!= REG_NOERROR
, 0))
1599 if (top
< next_state_log_idx
)
1601 memset (mctx
->state_log
+ top
+ 1, '\0',
1602 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1603 mctx
->state_log_top
= next_state_log_idx
;
1608 static reg_errcode_t
1609 merge_state_array (dfa
, dst
, src
, num
)
1611 re_dfastate_t
**dst
;
1612 re_dfastate_t
**src
;
1617 for (st_idx
= 0; st_idx
< num
; ++st_idx
)
1619 if (dst
[st_idx
] == NULL
)
1620 dst
[st_idx
] = src
[st_idx
];
1621 else if (src
[st_idx
] != NULL
)
1623 re_node_set merged_set
;
1624 err
= re_node_set_init_union (&merged_set
, &dst
[st_idx
]->nodes
,
1625 &src
[st_idx
]->nodes
);
1626 if (BE (err
!= REG_NOERROR
, 0))
1628 dst
[st_idx
] = re_acquire_state (&err
, dfa
, &merged_set
);
1629 re_node_set_free (&merged_set
);
1630 if (BE (err
!= REG_NOERROR
, 0))
1637 static reg_errcode_t
1638 update_cur_sifted_state (mctx
, sctx
, str_idx
, dest_nodes
)
1639 re_match_context_t
*mctx
;
1640 re_sift_context_t
*sctx
;
1642 re_node_set
*dest_nodes
;
1644 re_dfa_t
*const dfa
= mctx
->dfa
;
1646 const re_node_set
*candidates
;
1647 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1648 : &mctx
->state_log
[str_idx
]->nodes
);
1650 /* At first, add the nodes which can epsilon transit to a node in
1652 if (dest_nodes
->nelem
)
1654 err
= add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
);
1655 if (BE (err
!= REG_NOERROR
, 0))
1659 /* Then, check the limitations in the current sift_context. */
1660 if (dest_nodes
->nelem
&& sctx
->limits
.nelem
)
1662 err
= check_subexp_limits (dfa
, dest_nodes
, candidates
, &sctx
->limits
,
1663 mctx
->bkref_ents
, str_idx
);
1664 if (BE (err
!= REG_NOERROR
, 0))
1668 /* Update state_log. */
1669 sctx
->sifted_states
[str_idx
] = re_acquire_state (&err
, dfa
, dest_nodes
);
1670 if (BE (sctx
->sifted_states
[str_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1673 if ((mctx
->state_log
[str_idx
] != NULL
1674 && mctx
->state_log
[str_idx
]->has_backref
))
1676 err
= sift_states_bkref (mctx
, sctx
, str_idx
, dest_nodes
);
1677 if (BE (err
!= REG_NOERROR
, 0))
1683 static reg_errcode_t
1684 add_epsilon_src_nodes (dfa
, dest_nodes
, candidates
)
1686 re_node_set
*dest_nodes
;
1687 const re_node_set
*candidates
;
1691 re_node_set src_copy
;
1693 err
= re_node_set_init_copy (&src_copy
, dest_nodes
);
1694 if (BE (err
!= REG_NOERROR
, 0))
1696 for (src_idx
= 0; src_idx
< src_copy
.nelem
; ++src_idx
)
1698 err
= re_node_set_add_intersect (dest_nodes
, candidates
,
1700 + src_copy
.elems
[src_idx
]);
1701 if (BE (err
!= REG_NOERROR
, 0))
1703 re_node_set_free (&src_copy
);
1707 re_node_set_free (&src_copy
);
1711 static reg_errcode_t
1712 sub_epsilon_src_nodes (dfa
, node
, dest_nodes
, candidates
)
1715 re_node_set
*dest_nodes
;
1716 const re_node_set
*candidates
;
1720 re_node_set
*inv_eclosure
= dfa
->inveclosures
+ node
;
1721 re_node_set except_nodes
;
1722 re_node_set_init_empty (&except_nodes
);
1723 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1725 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1726 if (cur_node
== node
)
1728 if (IS_EPSILON_NODE (dfa
->nodes
[cur_node
].type
))
1730 int edst1
= dfa
->edests
[cur_node
].elems
[0];
1731 int edst2
= ((dfa
->edests
[cur_node
].nelem
> 1)
1732 ? dfa
->edests
[cur_node
].elems
[1] : -1);
1733 if ((!re_node_set_contains (inv_eclosure
, edst1
)
1734 && re_node_set_contains (dest_nodes
, edst1
))
1736 && !re_node_set_contains (inv_eclosure
, edst2
)
1737 && re_node_set_contains (dest_nodes
, edst2
)))
1739 err
= re_node_set_add_intersect (&except_nodes
, candidates
,
1740 dfa
->inveclosures
+ cur_node
);
1741 if (BE (err
!= REG_NOERROR
, 0))
1743 re_node_set_free (&except_nodes
);
1749 for (ecl_idx
= 0; ecl_idx
< inv_eclosure
->nelem
; ++ecl_idx
)
1751 int cur_node
= inv_eclosure
->elems
[ecl_idx
];
1752 if (!re_node_set_contains (&except_nodes
, cur_node
))
1754 int idx
= re_node_set_contains (dest_nodes
, cur_node
) - 1;
1755 re_node_set_remove_at (dest_nodes
, idx
);
1758 re_node_set_free (&except_nodes
);
1763 check_dst_limits (mctx
, limits
, dst_node
, dst_idx
, src_node
, src_idx
)
1764 re_match_context_t
*mctx
;
1765 re_node_set
*limits
;
1766 int dst_node
, dst_idx
, src_node
, src_idx
;
1768 re_dfa_t
*const dfa
= mctx
->dfa
;
1769 int lim_idx
, src_pos
, dst_pos
;
1771 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1774 struct re_backref_cache_entry
*ent
;
1775 ent
= mctx
->bkref_ents
+ limits
->elems
[lim_idx
];
1776 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1778 dst_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1779 dfa
->eclosures
+ dst_node
,
1780 subexp_idx
, dst_node
, dst_idx
);
1781 src_pos
= check_dst_limits_calc_pos (mctx
, limits
->elems
[lim_idx
],
1782 dfa
->eclosures
+ src_node
,
1783 subexp_idx
, src_node
, src_idx
);
1786 <src> <dst> ( <subexp> )
1787 ( <subexp> ) <src> <dst>
1788 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1789 if (src_pos
== dst_pos
)
1790 continue; /* This is unrelated limitation. */
1798 check_dst_limits_calc_pos (mctx
, limit
, eclosures
, subexp_idx
, from_node
,
1800 re_match_context_t
*mctx
;
1801 re_node_set
*eclosures
;
1802 int limit
, subexp_idx
, from_node
, str_idx
;
1804 re_dfa_t
*const dfa
= mctx
->dfa
;
1805 struct re_backref_cache_entry
*lim
= mctx
->bkref_ents
+ limit
;
1808 /* If we are outside the range of the subexpression, return -1 or 1. */
1809 if (str_idx
< lim
->subexp_from
)
1812 if (lim
->subexp_to
< str_idx
)
1815 /* If we are within the subexpression, return 0. */
1816 if (str_idx
!= lim
->subexp_from
&& str_idx
!= lim
->subexp_to
)
1819 /* Else, we are on the boundary: examine the nodes on the epsilon
1821 for (node_idx
= 0; node_idx
< eclosures
->nelem
; ++node_idx
)
1823 int node
= eclosures
->elems
[node_idx
];
1824 switch (dfa
->nodes
[node
].type
)
1828 int bi
= search_cur_bkref_entry (mctx
, str_idx
);
1829 for (; bi
< mctx
->nbkref_ents
; ++bi
)
1831 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ bi
;
1834 /* If this backreference goes beyond the point we're
1835 examining, don't go any further. */
1836 if (ent
->str_idx
> str_idx
)
1839 if (ent
->node
!= node
|| ent
->subexp_from
!= ent
->subexp_to
)
1842 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1843 OP_CLOSE_SUBEXP cases below. But, if the
1844 destination node is the same node as the source
1845 node, don't recurse because it would cause an
1846 infinite loop: a regex that exhibits this behavior
1848 dst
= dfa
->edests
[node
].elems
[0];
1849 if (dst
== from_node
)
1851 if (str_idx
== lim
->subexp_from
)
1853 else /* if (str_idx == lim->subexp_to) */
1857 cpos
= check_dst_limits_calc_pos (mctx
, limit
,
1858 dfa
->eclosures
+ dst
,
1862 if (cpos
== -1 && str_idx
== lim
->subexp_from
)
1865 if (cpos
== 0 /* && str_idx == lim->lim->subexp_to */)
1871 case OP_OPEN_SUBEXP
:
1872 if (str_idx
== lim
->subexp_from
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1876 case OP_CLOSE_SUBEXP
:
1877 if (str_idx
== lim
->subexp_to
&& subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1886 if (str_idx
== lim
->subexp_to
)
1892 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1893 which are against limitations from DEST_NODES. */
1895 static reg_errcode_t
1896 check_subexp_limits (dfa
, dest_nodes
, candidates
, limits
, bkref_ents
, str_idx
)
1898 re_node_set
*dest_nodes
;
1899 const re_node_set
*candidates
;
1900 re_node_set
*limits
;
1901 struct re_backref_cache_entry
*bkref_ents
;
1905 int node_idx
, lim_idx
;
1907 for (lim_idx
= 0; lim_idx
< limits
->nelem
; ++lim_idx
)
1910 struct re_backref_cache_entry
*ent
;
1911 ent
= bkref_ents
+ limits
->elems
[lim_idx
];
1913 if (str_idx
<= ent
->subexp_from
|| ent
->str_idx
< str_idx
)
1914 continue; /* This is unrelated limitation. */
1916 subexp_idx
= dfa
->nodes
[ent
->node
].opr
.idx
- 1;
1917 if (ent
->subexp_to
== str_idx
)
1921 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1923 int node
= dest_nodes
->elems
[node_idx
];
1924 re_token_type_t type
= dfa
->nodes
[node
].type
;
1925 if (type
== OP_OPEN_SUBEXP
1926 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1928 else if (type
== OP_CLOSE_SUBEXP
1929 && subexp_idx
== dfa
->nodes
[node
].opr
.idx
)
1933 /* Check the limitation of the open subexpression. */
1934 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1937 err
= sub_epsilon_src_nodes (dfa
, ops_node
, dest_nodes
,
1939 if (BE (err
!= REG_NOERROR
, 0))
1943 /* Check the limitation of the close subexpression. */
1945 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1947 int node
= dest_nodes
->elems
[node_idx
];
1948 if (!re_node_set_contains (dfa
->inveclosures
+ node
,
1950 && !re_node_set_contains (dfa
->eclosures
+ node
,
1953 /* It is against this limitation.
1954 Remove it form the current sifted state. */
1955 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
1957 if (BE (err
!= REG_NOERROR
, 0))
1963 else /* (ent->subexp_to != str_idx) */
1965 for (node_idx
= 0; node_idx
< dest_nodes
->nelem
; ++node_idx
)
1967 int node
= dest_nodes
->elems
[node_idx
];
1968 re_token_type_t type
= dfa
->nodes
[node
].type
;
1969 if (type
== OP_CLOSE_SUBEXP
|| type
== OP_OPEN_SUBEXP
)
1971 if (subexp_idx
!= dfa
->nodes
[node
].opr
.idx
)
1973 if ((type
== OP_CLOSE_SUBEXP
&& ent
->subexp_to
!= str_idx
)
1974 || (type
== OP_OPEN_SUBEXP
))
1976 /* It is against this limitation.
1977 Remove it form the current sifted state. */
1978 err
= sub_epsilon_src_nodes (dfa
, node
, dest_nodes
,
1980 if (BE (err
!= REG_NOERROR
, 0))
1990 static reg_errcode_t
1991 sift_states_bkref (mctx
, sctx
, str_idx
, dest_nodes
)
1992 re_match_context_t
*mctx
;
1993 re_sift_context_t
*sctx
;
1995 re_node_set
*dest_nodes
;
1997 re_dfa_t
*const dfa
= mctx
->dfa
;
2000 re_sift_context_t local_sctx
;
2001 const re_node_set
*candidates
;
2002 candidates
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
2003 : &mctx
->state_log
[str_idx
]->nodes
);
2004 local_sctx
.sifted_states
= NULL
; /* Mark that it hasn't been initialized. */
2006 for (node_idx
= 0; node_idx
< candidates
->nelem
; ++node_idx
)
2008 int cur_bkref_idx
= re_string_cur_idx (&mctx
->input
);
2009 re_token_type_t type
;
2010 node
= candidates
->elems
[node_idx
];
2011 type
= dfa
->nodes
[node
].type
;
2012 if (node
== sctx
->cur_bkref
&& str_idx
== cur_bkref_idx
)
2014 /* Avoid infinite loop for the REs like "()\1+". */
2015 if (node
== sctx
->last_node
&& str_idx
== sctx
->last_str_idx
)
2017 if (type
== OP_BACK_REF
)
2019 int enabled_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2020 for (; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
2022 int disabled_idx
, subexp_len
, to_idx
, dst_node
;
2023 struct re_backref_cache_entry
*entry
;
2024 entry
= mctx
->bkref_ents
+ enabled_idx
;
2025 if (entry
->str_idx
> str_idx
)
2027 if (entry
->node
!= node
)
2029 subexp_len
= entry
->subexp_to
- entry
->subexp_from
;
2030 to_idx
= str_idx
+ subexp_len
;
2031 dst_node
= (subexp_len
? dfa
->nexts
[node
]
2032 : dfa
->edests
[node
].elems
[0]);
2034 if (to_idx
> sctx
->last_str_idx
2035 || sctx
->sifted_states
[to_idx
] == NULL
2036 || !STATE_NODE_CONTAINS (sctx
->sifted_states
[to_idx
],
2038 || check_dst_limits (mctx
, &sctx
->limits
, node
,
2039 str_idx
, dst_node
, to_idx
))
2042 re_dfastate_t
*cur_state
;
2044 for (disabled_idx
= enabled_idx
+ 1;
2045 disabled_idx
< mctx
->nbkref_ents
; ++disabled_idx
)
2047 struct re_backref_cache_entry
*entry2
;
2048 entry2
= mctx
->bkref_ents
+ disabled_idx
;
2049 if (entry2
->str_idx
> str_idx
)
2051 entry2
->flag
= (entry2
->node
== node
) ? 1 : entry2
->flag
;
2054 if (local_sctx
.sifted_states
== NULL
)
2057 err
= re_node_set_init_copy (&local_sctx
.limits
,
2059 if (BE (err
!= REG_NOERROR
, 0))
2062 local_sctx
.last_node
= node
;
2063 local_sctx
.last_str_idx
= str_idx
;
2064 err
= re_node_set_insert (&local_sctx
.limits
, enabled_idx
);
2065 if (BE (err
< 0, 0))
2070 cur_state
= local_sctx
.sifted_states
[str_idx
];
2071 err
= sift_states_backward (mctx
, &local_sctx
);
2072 if (BE (err
!= REG_NOERROR
, 0))
2074 if (sctx
->limited_states
!= NULL
)
2076 err
= merge_state_array (dfa
, sctx
->limited_states
,
2077 local_sctx
.sifted_states
,
2079 if (BE (err
!= REG_NOERROR
, 0))
2082 local_sctx
.sifted_states
[str_idx
] = cur_state
;
2083 re_node_set_remove (&local_sctx
.limits
, enabled_idx
);
2084 /* We must not use the variable entry here, since
2085 mctx->bkref_ents might be realloced. */
2086 mctx
->bkref_ents
[enabled_idx
].flag
= 1;
2089 enabled_idx
= search_cur_bkref_entry (mctx
, str_idx
);
2090 for (; enabled_idx
< mctx
->nbkref_ents
; ++enabled_idx
)
2092 struct re_backref_cache_entry
*entry
;
2093 entry
= mctx
->bkref_ents
+ enabled_idx
;
2094 if (entry
->str_idx
> str_idx
)
2096 if (entry
->node
== node
)
2103 if (local_sctx
.sifted_states
!= NULL
)
2105 re_node_set_free (&local_sctx
.limits
);
2112 #ifdef RE_ENABLE_I18N
2114 sift_states_iter_mb (mctx
, sctx
, node_idx
, str_idx
, max_str_idx
)
2115 const re_match_context_t
*mctx
;
2116 re_sift_context_t
*sctx
;
2117 int node_idx
, str_idx
, max_str_idx
;
2119 re_dfa_t
*const dfa
= mctx
->dfa
;
2121 /* Check the node can accept `multi byte'. */
2122 naccepted
= check_node_accept_bytes (dfa
, node_idx
, &mctx
->input
, str_idx
);
2123 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
2124 !STATE_NODE_CONTAINS (sctx
->sifted_states
[str_idx
+ naccepted
],
2125 dfa
->nexts
[node_idx
]))
2126 /* The node can't accept the `multi byte', or the
2127 destination was already thrown away, then the node
2128 could't accept the current input `multi byte'. */
2130 /* Otherwise, it is sure that the node could accept
2131 `naccepted' bytes input. */
2134 #endif /* RE_ENABLE_I18N */
2137 /* Functions for state transition. */
2139 /* Return the next state to which the current state STATE will transit by
2140 accepting the current input byte, and update STATE_LOG if necessary.
2141 If STATE can accept a multibyte char/collating element/back reference
2142 update the destination of STATE_LOG. */
2144 static re_dfastate_t
*
2145 transit_state (err
, mctx
, state
)
2147 re_match_context_t
*mctx
;
2148 re_dfastate_t
*state
;
2150 re_dfa_t
*const dfa
= mctx
->dfa
;
2151 re_dfastate_t
**trtable
;
2154 if (re_string_cur_idx (&mctx
->input
) + 1 >= mctx
->input
.bufs_len
2155 || (re_string_cur_idx (&mctx
->input
) + 1 >= mctx
->input
.valid_len
2156 && mctx
->input
.valid_len
< mctx
->input
.len
))
2158 *err
= extend_buffers (mctx
);
2159 if (BE (*err
!= REG_NOERROR
, 0))
2163 #ifdef RE_ENABLE_I18N
2164 /* If the current state can accept multibyte. */
2165 if (state
->accept_mb
)
2167 *err
= transit_state_mb (mctx
, state
);
2168 if (BE (*err
!= REG_NOERROR
, 0))
2171 #endif /* RE_ENABLE_I18N */
2173 /* Then decide the next state with the single byte. */
2176 /* Use transition table */
2177 ch
= re_string_fetch_byte (&mctx
->input
);
2178 trtable
= state
->trtable
;
2179 if (trtable
== NULL
)
2181 trtable
= build_trtable (dfa
, state
);
2182 if (trtable
== NULL
)
2188 if (BE (state
->word_trtable
, 0))
2190 unsigned int context
;
2192 = re_string_context_at (&mctx
->input
,
2193 re_string_cur_idx (&mctx
->input
) - 1,
2195 if (IS_WORD_CONTEXT (context
))
2196 return trtable
[ch
+ SBC_MAX
];
2205 /* don't use transition table */
2206 return transit_state_sb (err
, mctx
, state
);
2210 /* Update the state_log if we need */
2212 merge_state_with_log (err
, mctx
, next_state
)
2214 re_match_context_t
*mctx
;
2215 re_dfastate_t
*next_state
;
2217 re_dfa_t
*const dfa
= mctx
->dfa
;
2218 int cur_idx
= re_string_cur_idx (&mctx
->input
);
2220 if (cur_idx
> mctx
->state_log_top
)
2222 mctx
->state_log
[cur_idx
] = next_state
;
2223 mctx
->state_log_top
= cur_idx
;
2225 else if (mctx
->state_log
[cur_idx
] == 0)
2227 mctx
->state_log
[cur_idx
] = next_state
;
2231 re_dfastate_t
*pstate
;
2232 unsigned int context
;
2233 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
2234 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2235 the destination of a multibyte char/collating element/
2236 back reference. Then the next state is the union set of
2237 these destinations and the results of the transition table. */
2238 pstate
= mctx
->state_log
[cur_idx
];
2239 log_nodes
= pstate
->entrance_nodes
;
2240 if (next_state
!= NULL
)
2242 table_nodes
= next_state
->entrance_nodes
;
2243 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
2245 if (BE (*err
!= REG_NOERROR
, 0))
2249 next_nodes
= *log_nodes
;
2250 /* Note: We already add the nodes of the initial state,
2251 then we don't need to add them here. */
2253 context
= re_string_context_at (&mctx
->input
,
2254 re_string_cur_idx (&mctx
->input
) - 1,
2256 next_state
= mctx
->state_log
[cur_idx
]
2257 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2258 /* We don't need to check errors here, since the return value of
2259 this function is next_state and ERR is already set. */
2261 if (table_nodes
!= NULL
)
2262 re_node_set_free (&next_nodes
);
2265 if (BE (dfa
->nbackref
, 0) && next_state
!= NULL
)
2267 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2268 later. We must check them here, since the back references in the
2269 next state might use them. */
2270 *err
= check_subexp_matching_top (mctx
, &next_state
->nodes
,
2272 if (BE (*err
!= REG_NOERROR
, 0))
2275 /* If the next state has back references. */
2276 if (next_state
->has_backref
)
2278 *err
= transit_state_bkref (mctx
, &next_state
->nodes
);
2279 if (BE (*err
!= REG_NOERROR
, 0))
2281 next_state
= mctx
->state_log
[cur_idx
];
2288 /* Skip bytes in the input that correspond to part of a
2289 multi-byte match, then look in the log for a state
2290 from which to restart matching. */
2292 find_recover_state (err
, mctx
)
2294 re_match_context_t
*mctx
;
2296 re_dfastate_t
*cur_state
= NULL
;
2299 int max
= mctx
->state_log_top
;
2300 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2304 if (++cur_str_idx
> max
)
2306 re_string_skip_bytes (&mctx
->input
, 1);
2308 while (mctx
->state_log
[cur_str_idx
] == NULL
);
2310 cur_state
= merge_state_with_log (err
, mctx
, NULL
);
2312 while (err
== REG_NOERROR
&& cur_state
== NULL
);
2316 /* Helper functions for transit_state. */
2318 /* From the node set CUR_NODES, pick up the nodes whose types are
2319 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2320 expression. And register them to use them later for evaluating the
2321 correspoding back references. */
2323 static reg_errcode_t
2324 check_subexp_matching_top (mctx
, cur_nodes
, str_idx
)
2325 re_match_context_t
*mctx
;
2326 re_node_set
*cur_nodes
;
2329 re_dfa_t
*const dfa
= mctx
->dfa
;
2333 /* TODO: This isn't efficient.
2334 Because there might be more than one nodes whose types are
2335 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2338 for (node_idx
= 0; node_idx
< cur_nodes
->nelem
; ++node_idx
)
2340 int node
= cur_nodes
->elems
[node_idx
];
2341 if (dfa
->nodes
[node
].type
== OP_OPEN_SUBEXP
2342 && dfa
->nodes
[node
].opr
.idx
< (8 * sizeof (dfa
->used_bkref_map
))
2343 && dfa
->used_bkref_map
& (1 << dfa
->nodes
[node
].opr
.idx
))
2345 err
= match_ctx_add_subtop (mctx
, node
, str_idx
);
2346 if (BE (err
!= REG_NOERROR
, 0))
2354 /* Return the next state to which the current state STATE will transit by
2355 accepting the current input byte. */
2357 static re_dfastate_t
*
2358 transit_state_sb (err
, mctx
, state
)
2360 re_match_context_t
*mctx
;
2361 re_dfastate_t
*state
;
2363 re_dfa_t
*const dfa
= mctx
->dfa
;
2364 re_node_set next_nodes
;
2365 re_dfastate_t
*next_state
;
2366 int node_cnt
, cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2367 unsigned int context
;
2369 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
2370 if (BE (*err
!= REG_NOERROR
, 0))
2372 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
2374 int cur_node
= state
->nodes
.elems
[node_cnt
];
2375 if (check_node_accept (mctx
, dfa
->nodes
+ cur_node
, cur_str_idx
))
2377 *err
= re_node_set_merge (&next_nodes
,
2378 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
2379 if (BE (*err
!= REG_NOERROR
, 0))
2381 re_node_set_free (&next_nodes
);
2386 context
= re_string_context_at (&mctx
->input
, cur_str_idx
, mctx
->eflags
);
2387 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
2388 /* We don't need to check errors here, since the return value of
2389 this function is next_state and ERR is already set. */
2391 re_node_set_free (&next_nodes
);
2392 re_string_skip_bytes (&mctx
->input
, 1);
2397 #ifdef RE_ENABLE_I18N
2398 static reg_errcode_t
2399 transit_state_mb (mctx
, pstate
)
2400 re_match_context_t
*mctx
;
2401 re_dfastate_t
*pstate
;
2403 re_dfa_t
*const dfa
= mctx
->dfa
;
2407 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
2409 re_node_set dest_nodes
, *new_nodes
;
2410 int cur_node_idx
= pstate
->nodes
.elems
[i
];
2411 int naccepted
= 0, dest_idx
;
2412 unsigned int context
;
2413 re_dfastate_t
*dest_state
;
2415 if (dfa
->nodes
[cur_node_idx
].constraint
)
2417 context
= re_string_context_at (&mctx
->input
,
2418 re_string_cur_idx (&mctx
->input
),
2420 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
2425 /* How many bytes the node can accept? */
2426 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
2427 naccepted
= check_node_accept_bytes (dfa
, cur_node_idx
, &mctx
->input
,
2428 re_string_cur_idx (&mctx
->input
));
2432 /* The node can accepts `naccepted' bytes. */
2433 dest_idx
= re_string_cur_idx (&mctx
->input
) + naccepted
;
2434 mctx
->max_mb_elem_len
= ((mctx
->max_mb_elem_len
< naccepted
) ? naccepted
2435 : mctx
->max_mb_elem_len
);
2436 err
= clean_state_log_if_needed (mctx
, dest_idx
);
2437 if (BE (err
!= REG_NOERROR
, 0))
2440 assert (dfa
->nexts
[cur_node_idx
] != -1);
2442 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2443 then we use pstate->nodes.elems[i] instead. */
2444 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
2446 dest_state
= mctx
->state_log
[dest_idx
];
2447 if (dest_state
== NULL
)
2448 dest_nodes
= *new_nodes
;
2451 err
= re_node_set_init_union (&dest_nodes
,
2452 dest_state
->entrance_nodes
, new_nodes
);
2453 if (BE (err
!= REG_NOERROR
, 0))
2456 context
= re_string_context_at (&mctx
->input
, dest_idx
- 1, mctx
->eflags
);
2457 mctx
->state_log
[dest_idx
]
2458 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2459 if (dest_state
!= NULL
)
2460 re_node_set_free (&dest_nodes
);
2461 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
2466 #endif /* RE_ENABLE_I18N */
2468 static reg_errcode_t
2469 transit_state_bkref (mctx
, nodes
)
2470 re_match_context_t
*mctx
;
2471 const re_node_set
*nodes
;
2473 re_dfa_t
*const dfa
= mctx
->dfa
;
2476 int cur_str_idx
= re_string_cur_idx (&mctx
->input
);
2478 for (i
= 0; i
< nodes
->nelem
; ++i
)
2480 int dest_str_idx
, prev_nelem
, bkc_idx
;
2481 int node_idx
= nodes
->elems
[i
];
2482 unsigned int context
;
2483 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2484 re_node_set
*new_dest_nodes
;
2486 /* Check whether `node' is a backreference or not. */
2487 if (node
->type
!= OP_BACK_REF
)
2490 if (node
->constraint
)
2492 context
= re_string_context_at (&mctx
->input
, cur_str_idx
,
2494 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2498 /* `node' is a backreference.
2499 Check the substring which the substring matched. */
2500 bkc_idx
= mctx
->nbkref_ents
;
2501 err
= get_subexp (mctx
, node_idx
, cur_str_idx
);
2502 if (BE (err
!= REG_NOERROR
, 0))
2505 /* And add the epsilon closures (which is `new_dest_nodes') of
2506 the backreference to appropriate state_log. */
2508 assert (dfa
->nexts
[node_idx
] != -1);
2510 for (; bkc_idx
< mctx
->nbkref_ents
; ++bkc_idx
)
2513 re_dfastate_t
*dest_state
;
2514 struct re_backref_cache_entry
*bkref_ent
;
2515 bkref_ent
= mctx
->bkref_ents
+ bkc_idx
;
2516 if (bkref_ent
->node
!= node_idx
|| bkref_ent
->str_idx
!= cur_str_idx
)
2518 subexp_len
= bkref_ent
->subexp_to
- bkref_ent
->subexp_from
;
2519 new_dest_nodes
= (subexp_len
== 0
2520 ? dfa
->eclosures
+ dfa
->edests
[node_idx
].elems
[0]
2521 : dfa
->eclosures
+ dfa
->nexts
[node_idx
]);
2522 dest_str_idx
= (cur_str_idx
+ bkref_ent
->subexp_to
2523 - bkref_ent
->subexp_from
);
2524 context
= re_string_context_at (&mctx
->input
, dest_str_idx
- 1,
2526 dest_state
= mctx
->state_log
[dest_str_idx
];
2527 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
2528 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
2529 /* Add `new_dest_node' to state_log. */
2530 if (dest_state
== NULL
)
2532 mctx
->state_log
[dest_str_idx
]
2533 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
,
2535 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2536 && err
!= REG_NOERROR
, 0))
2541 re_node_set dest_nodes
;
2542 err
= re_node_set_init_union (&dest_nodes
,
2543 dest_state
->entrance_nodes
,
2545 if (BE (err
!= REG_NOERROR
, 0))
2547 re_node_set_free (&dest_nodes
);
2550 mctx
->state_log
[dest_str_idx
]
2551 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
2552 re_node_set_free (&dest_nodes
);
2553 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
2554 && err
!= REG_NOERROR
, 0))
2557 /* We need to check recursively if the backreference can epsilon
2560 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
2562 err
= check_subexp_matching_top (mctx
, new_dest_nodes
,
2564 if (BE (err
!= REG_NOERROR
, 0))
2566 err
= transit_state_bkref (mctx
, new_dest_nodes
);
2567 if (BE (err
!= REG_NOERROR
, 0))
2577 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2578 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2579 Note that we might collect inappropriate candidates here.
2580 However, the cost of checking them strictly here is too high, then we
2581 delay these checking for prune_impossible_nodes(). */
2583 static reg_errcode_t
2584 get_subexp (mctx
, bkref_node
, bkref_str_idx
)
2585 re_match_context_t
*mctx
;
2586 int bkref_node
, bkref_str_idx
;
2588 re_dfa_t
*const dfa
= mctx
->dfa
;
2589 int subexp_num
, sub_top_idx
;
2590 const char *buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2591 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2592 int cache_idx
= search_cur_bkref_entry (mctx
, bkref_str_idx
);
2593 for (; cache_idx
< mctx
->nbkref_ents
; ++cache_idx
)
2595 const struct re_backref_cache_entry
*entry
2596 = &mctx
->bkref_ents
[cache_idx
];
2597 if (entry
->str_idx
> bkref_str_idx
)
2599 if (entry
->node
== bkref_node
)
2600 return REG_NOERROR
; /* We already checked it. */
2602 subexp_num
= dfa
->nodes
[bkref_node
].opr
.idx
- 1;
2604 /* For each sub expression */
2605 for (sub_top_idx
= 0; sub_top_idx
< mctx
->nsub_tops
; ++sub_top_idx
)
2608 re_sub_match_top_t
*sub_top
= mctx
->sub_tops
[sub_top_idx
];
2609 re_sub_match_last_t
*sub_last
;
2610 int sub_last_idx
, sl_str
, bkref_str_off
;
2612 if (dfa
->nodes
[sub_top
->node
].opr
.idx
!= subexp_num
)
2613 continue; /* It isn't related. */
2615 sl_str
= sub_top
->str_idx
;
2616 bkref_str_off
= bkref_str_idx
;
2617 /* At first, check the last node of sub expressions we already
2619 for (sub_last_idx
= 0; sub_last_idx
< sub_top
->nlasts
; ++sub_last_idx
)
2622 sub_last
= sub_top
->lasts
[sub_last_idx
];
2623 sl_str_diff
= sub_last
->str_idx
- sl_str
;
2624 /* The matched string by the sub expression match with the substring
2625 at the back reference? */
2626 if (sl_str_diff
> 0)
2628 if (BE (bkref_str_off
+ sl_str_diff
> mctx
->input
.valid_len
, 0))
2630 /* Not enough chars for a successful match. */
2631 if (bkref_str_off
+ sl_str_diff
> mctx
->input
.len
)
2634 err
= clean_state_log_if_needed (mctx
,
2637 if (BE (err
!= REG_NOERROR
, 0))
2639 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2641 if (memcmp (buf
+ bkref_str_off
, buf
+ sl_str
, sl_str_diff
) != 0)
2642 break; /* We don't need to search this sub expression any more. */
2644 bkref_str_off
+= sl_str_diff
;
2645 sl_str
+= sl_str_diff
;
2646 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2649 /* Reload buf, since the preceding call might have reallocated
2651 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2653 if (err
== REG_NOMATCH
)
2655 if (BE (err
!= REG_NOERROR
, 0))
2659 if (sub_last_idx
< sub_top
->nlasts
)
2661 if (sub_last_idx
> 0)
2663 /* Then, search for the other last nodes of the sub expression. */
2664 for (; sl_str
<= bkref_str_idx
; ++sl_str
)
2666 int cls_node
, sl_str_off
;
2667 const re_node_set
*nodes
;
2668 sl_str_off
= sl_str
- sub_top
->str_idx
;
2669 /* The matched string by the sub expression match with the substring
2670 at the back reference? */
2673 if (BE (bkref_str_off
>= mctx
->input
.valid_len
, 0))
2675 /* If we are at the end of the input, we cannot match. */
2676 if (bkref_str_off
>= mctx
->input
.len
)
2679 err
= extend_buffers (mctx
);
2680 if (BE (err
!= REG_NOERROR
, 0))
2683 buf
= (const char *) re_string_get_buffer (&mctx
->input
);
2685 if (buf
[bkref_str_off
++] != buf
[sl_str
- 1])
2686 break; /* We don't need to search this sub expression
2689 if (mctx
->state_log
[sl_str
] == NULL
)
2691 /* Does this state have a ')' of the sub expression? */
2692 nodes
= &mctx
->state_log
[sl_str
]->nodes
;
2693 cls_node
= find_subexp_node (dfa
, nodes
, subexp_num
, OP_CLOSE_SUBEXP
);
2696 if (sub_top
->path
== NULL
)
2698 sub_top
->path
= calloc (sizeof (state_array_t
),
2699 sl_str
- sub_top
->str_idx
+ 1);
2700 if (sub_top
->path
== NULL
)
2703 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2704 in the current context? */
2705 err
= check_arrival (mctx
, sub_top
->path
, sub_top
->node
,
2706 sub_top
->str_idx
, cls_node
, sl_str
, OP_CLOSE_SUBEXP
);
2707 if (err
== REG_NOMATCH
)
2709 if (BE (err
!= REG_NOERROR
, 0))
2711 sub_last
= match_ctx_add_sublast (sub_top
, cls_node
, sl_str
);
2712 if (BE (sub_last
== NULL
, 0))
2714 err
= get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
,
2716 if (err
== REG_NOMATCH
)
2723 /* Helper functions for get_subexp(). */
2725 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2726 If it can arrive, register the sub expression expressed with SUB_TOP
2729 static reg_errcode_t
2730 get_subexp_sub (mctx
, sub_top
, sub_last
, bkref_node
, bkref_str
)
2731 re_match_context_t
*mctx
;
2732 const re_sub_match_top_t
*sub_top
;
2733 re_sub_match_last_t
*sub_last
;
2734 int bkref_node
, bkref_str
;
2738 /* Can the subexpression arrive the back reference? */
2739 err
= check_arrival (mctx
, &sub_last
->path
, sub_last
->node
,
2740 sub_last
->str_idx
, bkref_node
, bkref_str
, OP_OPEN_SUBEXP
);
2741 if (err
!= REG_NOERROR
)
2743 err
= match_ctx_add_entry (mctx
, bkref_node
, bkref_str
, sub_top
->str_idx
,
2745 if (BE (err
!= REG_NOERROR
, 0))
2747 to_idx
= bkref_str
+ sub_last
->str_idx
- sub_top
->str_idx
;
2748 return clean_state_log_if_needed (mctx
, to_idx
);
2751 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2752 Search '(' if FL_OPEN, or search ')' otherwise.
2753 TODO: This function isn't efficient...
2754 Because there might be more than one nodes whose types are
2755 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2760 find_subexp_node (dfa
, nodes
, subexp_idx
, type
)
2761 const re_dfa_t
*dfa
;
2762 const re_node_set
*nodes
;
2763 int subexp_idx
, type
;
2766 for (cls_idx
= 0; cls_idx
< nodes
->nelem
; ++cls_idx
)
2768 int cls_node
= nodes
->elems
[cls_idx
];
2769 const re_token_t
*node
= dfa
->nodes
+ cls_node
;
2770 if (node
->type
== type
2771 && node
->opr
.idx
== subexp_idx
)
2777 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2778 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2780 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2782 static reg_errcode_t
2783 check_arrival (mctx
, path
, top_node
, top_str
, last_node
, last_str
,
2785 re_match_context_t
*mctx
;
2786 state_array_t
*path
;
2787 int top_node
, top_str
, last_node
, last_str
, type
;
2789 re_dfa_t
*const dfa
= mctx
->dfa
;
2791 int subexp_num
, backup_cur_idx
, str_idx
, null_cnt
;
2792 re_dfastate_t
*cur_state
= NULL
;
2793 re_node_set
*cur_nodes
, next_nodes
;
2794 re_dfastate_t
**backup_state_log
;
2795 unsigned int context
;
2797 subexp_num
= dfa
->nodes
[top_node
].opr
.idx
;
2798 /* Extend the buffer if we need. */
2799 if (BE (path
->alloc
< last_str
+ mctx
->max_mb_elem_len
+ 1, 0))
2801 re_dfastate_t
**new_array
;
2802 int old_alloc
= path
->alloc
;
2803 path
->alloc
+= last_str
+ mctx
->max_mb_elem_len
+ 1;
2804 new_array
= re_realloc (path
->array
, re_dfastate_t
*, path
->alloc
);
2805 if (new_array
== NULL
)
2807 path
->alloc
= old_alloc
;
2810 path
->array
= new_array
;
2811 memset (new_array
+ old_alloc
, '\0',
2812 sizeof (re_dfastate_t
*) * (path
->alloc
- old_alloc
));
2815 str_idx
= path
->next_idx
== 0 ? top_str
: path
->next_idx
;
2817 /* Temporary modify MCTX. */
2818 backup_state_log
= mctx
->state_log
;
2819 backup_cur_idx
= mctx
->input
.cur_idx
;
2820 mctx
->state_log
= path
->array
;
2821 mctx
->input
.cur_idx
= str_idx
;
2823 /* Setup initial node set. */
2824 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2825 if (str_idx
== top_str
)
2827 err
= re_node_set_init_1 (&next_nodes
, top_node
);
2828 if (BE (err
!= REG_NOERROR
, 0))
2830 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2831 if (BE (err
!= REG_NOERROR
, 0))
2833 re_node_set_free (&next_nodes
);
2839 cur_state
= mctx
->state_log
[str_idx
];
2840 if (cur_state
&& cur_state
->has_backref
)
2842 err
= re_node_set_init_copy (&next_nodes
, &cur_state
->nodes
);
2843 if (BE ( err
!= REG_NOERROR
, 0))
2847 re_node_set_init_empty (&next_nodes
);
2849 if (str_idx
== top_str
|| (cur_state
&& cur_state
->has_backref
))
2851 if (next_nodes
.nelem
)
2853 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
, last_str
,
2855 if (BE ( err
!= REG_NOERROR
, 0))
2857 re_node_set_free (&next_nodes
);
2861 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2862 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2864 re_node_set_free (&next_nodes
);
2867 mctx
->state_log
[str_idx
] = cur_state
;
2870 for (null_cnt
= 0; str_idx
< last_str
&& null_cnt
<= mctx
->max_mb_elem_len
;)
2872 re_node_set_empty (&next_nodes
);
2873 if (mctx
->state_log
[str_idx
+ 1])
2875 err
= re_node_set_merge (&next_nodes
,
2876 &mctx
->state_log
[str_idx
+ 1]->nodes
);
2877 if (BE (err
!= REG_NOERROR
, 0))
2879 re_node_set_free (&next_nodes
);
2885 err
= check_arrival_add_next_nodes (mctx
, str_idx
,
2886 &cur_state
->nodes
, &next_nodes
);
2887 if (BE (err
!= REG_NOERROR
, 0))
2889 re_node_set_free (&next_nodes
);
2894 if (next_nodes
.nelem
)
2896 err
= check_arrival_expand_ecl (dfa
, &next_nodes
, subexp_num
, type
);
2897 if (BE (err
!= REG_NOERROR
, 0))
2899 re_node_set_free (&next_nodes
);
2902 err
= expand_bkref_cache (mctx
, &next_nodes
, str_idx
, last_str
,
2904 if (BE ( err
!= REG_NOERROR
, 0))
2906 re_node_set_free (&next_nodes
);
2910 context
= re_string_context_at (&mctx
->input
, str_idx
- 1, mctx
->eflags
);
2911 cur_state
= re_acquire_state_context (&err
, dfa
, &next_nodes
, context
);
2912 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
2914 re_node_set_free (&next_nodes
);
2917 mctx
->state_log
[str_idx
] = cur_state
;
2918 null_cnt
= cur_state
== NULL
? null_cnt
+ 1 : 0;
2920 re_node_set_free (&next_nodes
);
2921 cur_nodes
= (mctx
->state_log
[last_str
] == NULL
? NULL
2922 : &mctx
->state_log
[last_str
]->nodes
);
2923 path
->next_idx
= str_idx
;
2926 mctx
->state_log
= backup_state_log
;
2927 mctx
->input
.cur_idx
= backup_cur_idx
;
2929 /* Then check the current node set has the node LAST_NODE. */
2930 if (cur_nodes
!= NULL
&& re_node_set_contains (cur_nodes
, last_node
))
2936 /* Helper functions for check_arrival. */
2938 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2940 TODO: This function is similar to the functions transit_state*(),
2941 however this function has many additional works.
2942 Can't we unify them? */
2944 static reg_errcode_t
2945 check_arrival_add_next_nodes (mctx
, str_idx
, cur_nodes
, next_nodes
)
2946 re_match_context_t
*mctx
;
2948 re_node_set
*cur_nodes
, *next_nodes
;
2950 re_dfa_t
*const dfa
= mctx
->dfa
;
2953 re_node_set union_set
;
2954 re_node_set_init_empty (&union_set
);
2955 for (cur_idx
= 0; cur_idx
< cur_nodes
->nelem
; ++cur_idx
)
2958 int cur_node
= cur_nodes
->elems
[cur_idx
];
2959 re_token_type_t type
= dfa
->nodes
[cur_node
].type
;
2960 if (IS_EPSILON_NODE (type
))
2962 #ifdef RE_ENABLE_I18N
2963 /* If the node may accept `multi byte'. */
2964 if (ACCEPT_MB_NODE (type
))
2966 naccepted
= check_node_accept_bytes (dfa
, cur_node
, &mctx
->input
,
2970 re_dfastate_t
*dest_state
;
2971 int next_node
= dfa
->nexts
[cur_node
];
2972 int next_idx
= str_idx
+ naccepted
;
2973 dest_state
= mctx
->state_log
[next_idx
];
2974 re_node_set_empty (&union_set
);
2977 err
= re_node_set_merge (&union_set
, &dest_state
->nodes
);
2978 if (BE (err
!= REG_NOERROR
, 0))
2980 re_node_set_free (&union_set
);
2984 err
= re_node_set_insert (&union_set
, next_node
);
2985 if (BE (err
< 0, 0))
2987 re_node_set_free (&union_set
);
2990 mctx
->state_log
[next_idx
] = re_acquire_state (&err
, dfa
,
2992 if (BE (mctx
->state_log
[next_idx
] == NULL
2993 && err
!= REG_NOERROR
, 0))
2995 re_node_set_free (&union_set
);
3000 #endif /* RE_ENABLE_I18N */
3002 || check_node_accept (mctx
, dfa
->nodes
+ cur_node
, str_idx
))
3004 err
= re_node_set_insert (next_nodes
, dfa
->nexts
[cur_node
]);
3005 if (BE (err
< 0, 0))
3007 re_node_set_free (&union_set
);
3012 re_node_set_free (&union_set
);
3016 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3017 CUR_NODES, however exclude the nodes which are:
3018 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3019 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3022 static reg_errcode_t
3023 check_arrival_expand_ecl (dfa
, cur_nodes
, ex_subexp
, type
)
3025 re_node_set
*cur_nodes
;
3026 int ex_subexp
, type
;
3029 int idx
, outside_node
;
3030 re_node_set new_nodes
;
3032 assert (cur_nodes
->nelem
);
3034 err
= re_node_set_alloc (&new_nodes
, cur_nodes
->nelem
);
3035 if (BE (err
!= REG_NOERROR
, 0))
3037 /* Create a new node set NEW_NODES with the nodes which are epsilon
3038 closures of the node in CUR_NODES. */
3040 for (idx
= 0; idx
< cur_nodes
->nelem
; ++idx
)
3042 int cur_node
= cur_nodes
->elems
[idx
];
3043 re_node_set
*eclosure
= dfa
->eclosures
+ cur_node
;
3044 outside_node
= find_subexp_node (dfa
, eclosure
, ex_subexp
, type
);
3045 if (outside_node
== -1)
3047 /* There are no problematic nodes, just merge them. */
3048 err
= re_node_set_merge (&new_nodes
, eclosure
);
3049 if (BE (err
!= REG_NOERROR
, 0))
3051 re_node_set_free (&new_nodes
);
3057 /* There are problematic nodes, re-calculate incrementally. */
3058 err
= check_arrival_expand_ecl_sub (dfa
, &new_nodes
, cur_node
,
3060 if (BE (err
!= REG_NOERROR
, 0))
3062 re_node_set_free (&new_nodes
);
3067 re_node_set_free (cur_nodes
);
3068 *cur_nodes
= new_nodes
;
3072 /* Helper function for check_arrival_expand_ecl.
3073 Check incrementally the epsilon closure of TARGET, and if it isn't
3074 problematic append it to DST_NODES. */
3076 static reg_errcode_t
3077 check_arrival_expand_ecl_sub (dfa
, dst_nodes
, target
, ex_subexp
, type
)
3079 int target
, ex_subexp
, type
;
3080 re_node_set
*dst_nodes
;
3083 for (cur_node
= target
; !re_node_set_contains (dst_nodes
, cur_node
);)
3087 if (dfa
->nodes
[cur_node
].type
== type
3088 && dfa
->nodes
[cur_node
].opr
.idx
== ex_subexp
)
3090 if (type
== OP_CLOSE_SUBEXP
)
3092 err
= re_node_set_insert (dst_nodes
, cur_node
);
3093 if (BE (err
== -1, 0))
3098 err
= re_node_set_insert (dst_nodes
, cur_node
);
3099 if (BE (err
== -1, 0))
3101 if (dfa
->edests
[cur_node
].nelem
== 0)
3103 if (dfa
->edests
[cur_node
].nelem
== 2)
3105 err
= check_arrival_expand_ecl_sub (dfa
, dst_nodes
,
3106 dfa
->edests
[cur_node
].elems
[1],
3108 if (BE (err
!= REG_NOERROR
, 0))
3111 cur_node
= dfa
->edests
[cur_node
].elems
[0];
3117 /* For all the back references in the current state, calculate the
3118 destination of the back references by the appropriate entry
3119 in MCTX->BKREF_ENTS. */
3121 static reg_errcode_t
3122 expand_bkref_cache (mctx
, cur_nodes
, cur_str
, last_str
, subexp_num
,
3124 re_match_context_t
*mctx
;
3125 int cur_str
, last_str
, subexp_num
, type
;
3126 re_node_set
*cur_nodes
;
3128 re_dfa_t
*const dfa
= mctx
->dfa
;
3130 int cache_idx
, cache_idx_start
;
3131 /* The current state. */
3133 cache_idx_start
= search_cur_bkref_entry (mctx
, cur_str
);
3134 for (cache_idx
= cache_idx_start
; cache_idx
< mctx
->nbkref_ents
; ++cache_idx
)
3136 int to_idx
, next_node
;
3137 struct re_backref_cache_entry
*ent
= mctx
->bkref_ents
+ cache_idx
;
3138 if (ent
->str_idx
> cur_str
)
3140 /* Is this entry ENT is appropriate? */
3141 if (!re_node_set_contains (cur_nodes
, ent
->node
))
3144 to_idx
= cur_str
+ ent
->subexp_to
- ent
->subexp_from
;
3145 /* Calculate the destination of the back reference, and append it
3146 to MCTX->STATE_LOG. */
3147 if (to_idx
== cur_str
)
3149 /* The backreference did epsilon transit, we must re-check all the
3150 node in the current state. */
3151 re_node_set new_dests
;
3152 reg_errcode_t err2
, err3
;
3153 next_node
= dfa
->edests
[ent
->node
].elems
[0];
3154 if (re_node_set_contains (cur_nodes
, next_node
))
3156 err
= re_node_set_init_1 (&new_dests
, next_node
);
3157 err2
= check_arrival_expand_ecl (dfa
, &new_dests
, subexp_num
, type
);
3158 err3
= re_node_set_merge (cur_nodes
, &new_dests
);
3159 re_node_set_free (&new_dests
);
3160 if (BE (err
!= REG_NOERROR
|| err2
!= REG_NOERROR
3161 || err3
!= REG_NOERROR
, 0))
3163 err
= (err
!= REG_NOERROR
? err
3164 : (err2
!= REG_NOERROR
? err2
: err3
));
3167 /* TODO: It is still inefficient... */
3168 cache_idx
= cache_idx_start
- 1;
3173 re_node_set union_set
;
3174 next_node
= dfa
->nexts
[ent
->node
];
3175 if (mctx
->state_log
[to_idx
])
3178 if (re_node_set_contains (&mctx
->state_log
[to_idx
]->nodes
,
3181 err
= re_node_set_init_copy (&union_set
,
3182 &mctx
->state_log
[to_idx
]->nodes
);
3183 ret
= re_node_set_insert (&union_set
, next_node
);
3184 if (BE (err
!= REG_NOERROR
|| ret
< 0, 0))
3186 re_node_set_free (&union_set
);
3187 err
= err
!= REG_NOERROR
? err
: REG_ESPACE
;
3193 err
= re_node_set_init_1 (&union_set
, next_node
);
3194 if (BE (err
!= REG_NOERROR
, 0))
3197 mctx
->state_log
[to_idx
] = re_acquire_state (&err
, dfa
, &union_set
);
3198 re_node_set_free (&union_set
);
3199 if (BE (mctx
->state_log
[to_idx
] == NULL
3200 && err
!= REG_NOERROR
, 0))
3207 /* Build transition table for the state.
3208 Return the new table if succeeded, otherwise return NULL. */
3210 static re_dfastate_t
**
3211 build_trtable (dfa
, state
)
3213 re_dfastate_t
*state
;
3217 unsigned int elem
, mask
;
3218 int dests_node_malloced
= 0, dest_states_malloced
= 0;
3219 int ndests
; /* Number of the destination states from `state'. */
3220 re_dfastate_t
**trtable
;
3221 re_dfastate_t
**dest_states
= NULL
, **dest_states_word
, **dest_states_nl
;
3222 re_node_set follows
, *dests_node
;
3226 /* We build DFA states which corresponds to the destination nodes
3227 from `state'. `dests_node[i]' represents the nodes which i-th
3228 destination state contains, and `dests_ch[i]' represents the
3229 characters which i-th destination state accepts. */
3231 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
))
3232 dests_node
= (re_node_set
*)
3233 alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3237 dests_node
= (re_node_set
*)
3238 malloc ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
);
3239 if (BE (dests_node
== NULL
, 0))
3241 dests_node_malloced
= 1;
3243 dests_ch
= (bitset
*) (dests_node
+ SBC_MAX
);
3245 /* Initialize transiton table. */
3246 state
->word_trtable
= 0;
3248 /* At first, group all nodes belonging to `state' into several
3250 ndests
= group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
);
3251 if (BE (ndests
<= 0, 0))
3253 if (dests_node_malloced
)
3255 /* Return NULL in case of an error, trtable otherwise. */
3258 state
->trtable
= (re_dfastate_t
**)
3259 calloc (sizeof (re_dfastate_t
*), SBC_MAX
);;
3260 return state
->trtable
;
3265 err
= re_node_set_alloc (&follows
, ndests
+ 1);
3266 if (BE (err
!= REG_NOERROR
, 0))
3270 if (__libc_use_alloca ((sizeof (re_node_set
) + sizeof (bitset
)) * SBC_MAX
3271 + ndests
* 3 * sizeof (re_dfastate_t
*)))
3272 dest_states
= (re_dfastate_t
**)
3273 alloca (ndests
* 3 * sizeof (re_dfastate_t
*));
3277 dest_states
= (re_dfastate_t
**)
3278 malloc (ndests
* 3 * sizeof (re_dfastate_t
*));
3279 if (BE (dest_states
== NULL
, 0))
3282 if (dest_states_malloced
)
3284 re_node_set_free (&follows
);
3285 for (i
= 0; i
< ndests
; ++i
)
3286 re_node_set_free (dests_node
+ i
);
3287 if (dests_node_malloced
)
3291 dest_states_malloced
= 1;
3293 dest_states_word
= dest_states
+ ndests
;
3294 dest_states_nl
= dest_states_word
+ ndests
;
3295 bitset_empty (acceptable
);
3297 /* Then build the states for all destinations. */
3298 for (i
= 0; i
< ndests
; ++i
)
3301 re_node_set_empty (&follows
);
3302 /* Merge the follows of this destination states. */
3303 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
3305 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
3306 if (next_node
!= -1)
3308 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
3309 if (BE (err
!= REG_NOERROR
, 0))
3313 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
3314 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3316 /* If the new state has context constraint,
3317 build appropriate states for these contexts. */
3318 if (dest_states
[i
]->has_constraint
)
3320 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3322 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3325 if (dest_states
[i
] != dest_states_word
[i
]
3326 && dfa
->mb_cur_max
> 1)
3327 state
->word_trtable
= 1;
3329 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
3331 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
3336 dest_states_word
[i
] = dest_states
[i
];
3337 dest_states_nl
[i
] = dest_states
[i
];
3339 bitset_merge (acceptable
, dests_ch
[i
]);
3342 if (!BE (state
->word_trtable
, 0))
3344 /* We don't care about whether the following character is a word
3345 character, or we are in a single-byte character set so we can
3346 discern by looking at the character code: allocate a
3347 256-entry transition table. */
3348 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
3349 if (BE (trtable
== NULL
, 0))
3352 /* For all characters ch...: */
3353 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3354 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3356 mask
<<= 1, elem
>>= 1, ++ch
)
3357 if (BE (elem
& 1, 0))
3359 /* There must be exactly one destination which accepts
3360 character ch. See group_nodes_into_DFAstates. */
3361 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3364 /* j-th destination accepts the word character ch. */
3365 if (dfa
->word_char
[i
] & mask
)
3366 trtable
[ch
] = dest_states_word
[j
];
3368 trtable
[ch
] = dest_states
[j
];
3373 /* We care about whether the following character is a word
3374 character, and we are in a multi-byte character set: discern
3375 by looking at the character code: build two 256-entry
3376 transition tables, one starting at trtable[0] and one
3377 starting at trtable[SBC_MAX]. */
3378 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*),
3380 if (BE (trtable
== NULL
, 0))
3383 /* For all characters ch...: */
3384 for (i
= 0; i
< BITSET_UINTS
; ++i
)
3385 for (ch
= i
* UINT_BITS
, elem
= acceptable
[i
], mask
= 1;
3387 mask
<<= 1, elem
>>= 1, ++ch
)
3388 if (BE (elem
& 1, 0))
3390 /* There must be exactly one destination which accepts
3391 character ch. See group_nodes_into_DFAstates. */
3392 for (j
= 0; (dests_ch
[j
][i
] & mask
) == 0; ++j
)
3395 /* j-th destination accepts the word character ch. */
3396 trtable
[ch
] = dest_states
[j
];
3397 trtable
[ch
+ SBC_MAX
] = dest_states_word
[j
];
3402 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
3404 /* The current state accepts newline character. */
3405 for (j
= 0; j
< ndests
; ++j
)
3406 if (bitset_contain (dests_ch
[j
], NEWLINE_CHAR
))
3408 /* k-th destination accepts newline character. */
3409 trtable
[NEWLINE_CHAR
] = dest_states_nl
[j
];
3410 if (state
->word_trtable
)
3411 trtable
[NEWLINE_CHAR
+ SBC_MAX
] = dest_states_nl
[j
];
3412 /* There must be only one destination which accepts
3413 newline. See group_nodes_into_DFAstates. */
3418 if (dest_states_malloced
)
3421 re_node_set_free (&follows
);
3422 for (i
= 0; i
< ndests
; ++i
)
3423 re_node_set_free (dests_node
+ i
);
3425 if (dests_node_malloced
)
3428 state
->trtable
= trtable
;
3432 /* Group all nodes belonging to STATE into several destinations.
3433 Then for all destinations, set the nodes belonging to the destination
3434 to DESTS_NODE[i] and set the characters accepted by the destination
3435 to DEST_CH[i]. This function return the number of destinations. */
3438 group_nodes_into_DFAstates (dfa
, state
, dests_node
, dests_ch
)
3440 const re_dfastate_t
*state
;
3441 re_node_set
*dests_node
;
3446 int ndests
; /* Number of the destinations from `state'. */
3447 bitset accepts
; /* Characters a node can accept. */
3448 const re_node_set
*cur_nodes
= &state
->nodes
;
3449 bitset_empty (accepts
);
3452 /* For all the nodes belonging to `state', */
3453 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
3455 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
3456 re_token_type_t type
= node
->type
;
3457 unsigned int constraint
= node
->constraint
;
3459 /* Enumerate all single byte character this node can accept. */
3460 if (type
== CHARACTER
)
3461 bitset_set (accepts
, node
->opr
.c
);
3462 else if (type
== SIMPLE_BRACKET
)
3464 bitset_merge (accepts
, node
->opr
.sbcset
);
3466 else if (type
== OP_PERIOD
)
3468 #ifdef RE_ENABLE_I18N
3469 if (dfa
->mb_cur_max
> 1)
3470 bitset_merge (accepts
, dfa
->sb_char
);
3473 bitset_set_all (accepts
);
3474 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3475 bitset_clear (accepts
, '\n');
3476 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3477 bitset_clear (accepts
, '\0');
3479 #ifdef RE_ENABLE_I18N
3480 else if (type
== OP_UTF8_PERIOD
)
3482 memset (accepts
, 255, sizeof (unsigned int) * BITSET_UINTS
/ 2);
3483 if (!(dfa
->syntax
& RE_DOT_NEWLINE
))
3484 bitset_clear (accepts
, '\n');
3485 if (dfa
->syntax
& RE_DOT_NOT_NULL
)
3486 bitset_clear (accepts
, '\0');
3492 /* Check the `accepts' and sift the characters which are not
3493 match it the context. */
3496 if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
3498 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
3499 bitset_empty (accepts
);
3500 if (accepts_newline
)
3501 bitset_set (accepts
, NEWLINE_CHAR
);
3505 if (constraint
& NEXT_ENDBUF_CONSTRAINT
)
3507 bitset_empty (accepts
);
3511 if (constraint
& NEXT_WORD_CONSTRAINT
)
3513 unsigned int any_set
= 0;
3514 if (type
== CHARACTER
&& !node
->word_char
)
3516 bitset_empty (accepts
);
3519 #ifdef RE_ENABLE_I18N
3520 if (dfa
->mb_cur_max
> 1)
3521 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3522 any_set
|= (accepts
[j
] &= (dfa
->word_char
[j
] | ~dfa
->sb_char
[j
]));
3525 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3526 any_set
|= (accepts
[j
] &= dfa
->word_char
[j
]);
3530 if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
3532 unsigned int any_set
= 0;
3533 if (type
== CHARACTER
&& node
->word_char
)
3535 bitset_empty (accepts
);
3538 #ifdef RE_ENABLE_I18N
3539 if (dfa
->mb_cur_max
> 1)
3540 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3541 any_set
|= (accepts
[j
] &= ~(dfa
->word_char
[j
] & dfa
->sb_char
[j
]));
3544 for (j
= 0; j
< BITSET_UINTS
; ++j
)
3545 any_set
|= (accepts
[j
] &= ~dfa
->word_char
[j
]);
3551 /* Then divide `accepts' into DFA states, or create a new
3552 state. Above, we make sure that accepts is not empty. */
3553 for (j
= 0; j
< ndests
; ++j
)
3555 bitset intersec
; /* Intersection sets, see below. */
3557 /* Flags, see below. */
3558 int has_intersec
, not_subset
, not_consumed
;
3560 /* Optimization, skip if this state doesn't accept the character. */
3561 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
3564 /* Enumerate the intersection set of this state and `accepts'. */
3566 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3567 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
3568 /* And skip if the intersection set is empty. */
3572 /* Then check if this state is a subset of `accepts'. */
3573 not_subset
= not_consumed
= 0;
3574 for (k
= 0; k
< BITSET_UINTS
; ++k
)
3576 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
3577 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
3580 /* If this state isn't a subset of `accepts', create a
3581 new group state, which has the `remains'. */
3584 bitset_copy (dests_ch
[ndests
], remains
);
3585 bitset_copy (dests_ch
[j
], intersec
);
3586 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
3587 if (BE (err
!= REG_NOERROR
, 0))
3592 /* Put the position in the current group. */
3593 err
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
3594 if (BE (err
< 0, 0))
3597 /* If all characters are consumed, go to next node. */
3601 /* Some characters remain, create a new group. */
3604 bitset_copy (dests_ch
[ndests
], accepts
);
3605 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
3606 if (BE (err
!= REG_NOERROR
, 0))
3609 bitset_empty (accepts
);
3614 for (j
= 0; j
< ndests
; ++j
)
3615 re_node_set_free (dests_node
+ j
);
3619 #ifdef RE_ENABLE_I18N
3620 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3621 Return the number of the bytes the node accepts.
3622 STR_IDX is the current index of the input string.
3624 This function handles the nodes which can accept one character, or
3625 one collating element like '.', '[a-z]', opposite to the other nodes
3626 can only accept one byte. */
3629 check_node_accept_bytes (dfa
, node_idx
, input
, str_idx
)
3631 int node_idx
, str_idx
;
3632 const re_string_t
*input
;
3634 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
3635 int char_len
, elem_len
;
3638 if (BE (node
->type
== OP_UTF8_PERIOD
, 0))
3640 unsigned char c
= re_string_byte_at (input
, str_idx
), d
;
3641 if (BE (c
< 0xc2, 1))
3644 if (str_idx
+ 2 > input
->len
)
3647 d
= re_string_byte_at (input
, str_idx
+ 1);
3649 return (d
< 0x80 || d
> 0xbf) ? 0 : 2;
3653 if (c
== 0xe0 && d
< 0xa0)
3659 if (c
== 0xf0 && d
< 0x90)
3665 if (c
== 0xf8 && d
< 0x88)
3671 if (c
== 0xfc && d
< 0x84)
3677 if (str_idx
+ char_len
> input
->len
)
3680 for (i
= 1; i
< char_len
; ++i
)
3682 d
= re_string_byte_at (input
, str_idx
+ i
);
3683 if (d
< 0x80 || d
> 0xbf)
3689 char_len
= re_string_char_size_at (input
, str_idx
);
3690 if (node
->type
== OP_PERIOD
)
3694 /* FIXME: I don't think this if is needed, as both '\n'
3695 and '\0' are char_len == 1. */
3696 /* '.' accepts any one character except the following two cases. */
3697 if ((!(dfa
->syntax
& RE_DOT_NEWLINE
) &&
3698 re_string_byte_at (input
, str_idx
) == '\n') ||
3699 ((dfa
->syntax
& RE_DOT_NOT_NULL
) &&
3700 re_string_byte_at (input
, str_idx
) == '\0'))
3705 elem_len
= re_string_elem_size_at (input
, str_idx
);
3706 if ((elem_len
<= 1 && char_len
<= 1) || char_len
== 0)
3709 if (node
->type
== COMPLEX_BRACKET
)
3711 const re_charset_t
*cset
= node
->opr
.mbcset
;
3713 const unsigned char *pin
= ((char *) re_string_get_buffer (input
)
3719 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
3720 ? re_string_wchar_at (input
, str_idx
) : 0);
3722 /* match with multibyte character? */
3723 for (i
= 0; i
< cset
->nmbchars
; ++i
)
3724 if (wc
== cset
->mbchars
[i
])
3726 match_len
= char_len
;
3727 goto check_node_accept_bytes_match
;
3729 /* match with character_class? */
3730 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
3732 wctype_t wt
= cset
->char_classes
[i
];
3733 if (__iswctype (wc
, wt
))
3735 match_len
= char_len
;
3736 goto check_node_accept_bytes_match
;
3741 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3744 unsigned int in_collseq
= 0;
3745 const int32_t *table
, *indirect
;
3746 const unsigned char *weights
, *extra
;
3747 const char *collseqwc
;
3749 /* This #include defines a local function! */
3750 # include <locale/weight.h>
3752 /* match with collating_symbol? */
3753 if (cset
->ncoll_syms
)
3754 extra
= (const unsigned char *)
3755 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3756 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
3758 const unsigned char *coll_sym
= extra
+ cset
->coll_syms
[i
];
3759 /* Compare the length of input collating element and
3760 the length of current collating element. */
3761 if (*coll_sym
!= elem_len
)
3763 /* Compare each bytes. */
3764 for (j
= 0; j
< *coll_sym
; j
++)
3765 if (pin
[j
] != coll_sym
[1 + j
])
3769 /* Match if every bytes is equal. */
3771 goto check_node_accept_bytes_match
;
3777 if (elem_len
<= char_len
)
3779 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3780 in_collseq
= __collseq_table_lookup (collseqwc
, wc
);
3783 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
3785 /* match with range expression? */
3786 for (i
= 0; i
< cset
->nranges
; ++i
)
3787 if (cset
->range_starts
[i
] <= in_collseq
3788 && in_collseq
<= cset
->range_ends
[i
])
3790 match_len
= elem_len
;
3791 goto check_node_accept_bytes_match
;
3794 /* match with equivalence_class? */
3795 if (cset
->nequiv_classes
)
3797 const unsigned char *cp
= pin
;
3798 table
= (const int32_t *)
3799 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3800 weights
= (const unsigned char *)
3801 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
3802 extra
= (const unsigned char *)
3803 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
3804 indirect
= (const int32_t *)
3805 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
3806 idx
= findidx (&cp
);
3808 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
3810 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
3811 size_t weight_len
= weights
[idx
];
3812 if (weight_len
== weights
[equiv_class_idx
])
3815 while (cnt
<= weight_len
3816 && (weights
[equiv_class_idx
+ 1 + cnt
]
3817 == weights
[idx
+ 1 + cnt
]))
3819 if (cnt
> weight_len
)
3821 match_len
= elem_len
;
3822 goto check_node_accept_bytes_match
;
3831 /* match with range expression? */
3833 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
3835 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
3838 for (i
= 0; i
< cset
->nranges
; ++i
)
3840 cmp_buf
[0] = cset
->range_starts
[i
];
3841 cmp_buf
[4] = cset
->range_ends
[i
];
3842 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
3843 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
3845 match_len
= char_len
;
3846 goto check_node_accept_bytes_match
;
3850 check_node_accept_bytes_match
:
3851 if (!cset
->non_match
)
3858 return (elem_len
> char_len
) ? elem_len
: char_len
;
3866 find_collation_sequence_value (mbs
, mbs_len
)
3867 const unsigned char *mbs
;
3870 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3875 /* No valid character. Match it as a single byte character. */
3876 const unsigned char *collseq
= (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3878 return collseq
[mbs
[0]];
3885 const unsigned char *extra
= (const unsigned char *)
3886 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
3887 int32_t extrasize
= (const unsigned char *)
3888 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
+ 1) - extra
;
3890 for (idx
= 0; idx
< extrasize
;)
3892 int mbs_cnt
, found
= 0;
3893 int32_t elem_mbs_len
;
3894 /* Skip the name of collating element name. */
3895 idx
= idx
+ extra
[idx
] + 1;
3896 elem_mbs_len
= extra
[idx
++];
3897 if (mbs_len
== elem_mbs_len
)
3899 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
3900 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
3902 if (mbs_cnt
== elem_mbs_len
)
3903 /* Found the entry. */
3906 /* Skip the byte sequence of the collating element. */
3907 idx
+= elem_mbs_len
;
3908 /* Adjust for the alignment. */
3909 idx
= (idx
+ 3) & ~3;
3910 /* Skip the collation sequence value. */
3911 idx
+= sizeof (uint32_t);
3912 /* Skip the wide char sequence of the collating element. */
3913 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
3914 /* If we found the entry, return the sequence value. */
3916 return *(uint32_t *) (extra
+ idx
);
3917 /* Skip the collation sequence value. */
3918 idx
+= sizeof (uint32_t);
3924 #endif /* RE_ENABLE_I18N */
3926 /* Check whether the node accepts the byte which is IDX-th
3927 byte of the INPUT. */
3930 check_node_accept (mctx
, node
, idx
)
3931 const re_match_context_t
*mctx
;
3932 const re_token_t
*node
;
3935 re_dfa_t
*const dfa
= mctx
->dfa
;
3937 if (node
->constraint
)
3939 /* The node has constraints. Check whether the current context
3940 satisfies the constraints. */
3941 unsigned int context
= re_string_context_at (&mctx
->input
, idx
,
3943 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
3946 ch
= re_string_byte_at (&mctx
->input
, idx
);
3950 return node
->opr
.c
== ch
;
3951 case SIMPLE_BRACKET
:
3952 return bitset_contain (node
->opr
.sbcset
, ch
);
3953 #ifdef RE_ENABLE_I18N
3954 case OP_UTF8_PERIOD
:
3960 return !((ch
== '\n' && !(dfa
->syntax
& RE_DOT_NEWLINE
))
3961 || (ch
== '\0' && (dfa
->syntax
& RE_DOT_NOT_NULL
)));
3967 /* Extend the buffers, if the buffers have run out. */
3969 static reg_errcode_t
3970 extend_buffers (mctx
)
3971 re_match_context_t
*mctx
;
3974 re_string_t
*pstr
= &mctx
->input
;
3976 /* Double the lengthes of the buffers. */
3977 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
3978 if (BE (ret
!= REG_NOERROR
, 0))
3981 if (mctx
->state_log
!= NULL
)
3983 /* And double the length of state_log. */
3984 /* XXX We have no indication of the size of this buffer. If this
3985 allocation fail we have no indication that the state_log array
3986 does not have the right size. */
3987 re_dfastate_t
**new_array
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
3988 pstr
->bufs_len
+ 1);
3989 if (BE (new_array
== NULL
, 0))
3991 mctx
->state_log
= new_array
;
3994 /* Then reconstruct the buffers. */
3997 #ifdef RE_ENABLE_I18N
3998 if (pstr
->mb_cur_max
> 1)
4000 ret
= build_wcs_upper_buffer (pstr
);
4001 if (BE (ret
!= REG_NOERROR
, 0))
4005 #endif /* RE_ENABLE_I18N */
4006 build_upper_buffer (pstr
);
4010 #ifdef RE_ENABLE_I18N
4011 if (pstr
->mb_cur_max
> 1)
4012 build_wcs_buffer (pstr
);
4014 #endif /* RE_ENABLE_I18N */
4016 if (pstr
->trans
!= NULL
)
4017 re_string_translate_buffer (pstr
);
4024 /* Functions for matching context. */
4026 /* Initialize MCTX. */
4028 static reg_errcode_t
4029 match_ctx_init (mctx
, eflags
, n
)
4030 re_match_context_t
*mctx
;
4033 mctx
->eflags
= eflags
;
4034 mctx
->match_last
= -1;
4037 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
4038 mctx
->sub_tops
= re_malloc (re_sub_match_top_t
*, n
);
4039 if (BE (mctx
->bkref_ents
== NULL
|| mctx
->sub_tops
== NULL
, 0))
4042 /* Already zero-ed by the caller.
4044 mctx->bkref_ents = NULL;
4045 mctx->nbkref_ents = 0;
4046 mctx->nsub_tops = 0; */
4047 mctx
->abkref_ents
= n
;
4048 mctx
->max_mb_elem_len
= 1;
4049 mctx
->asub_tops
= n
;
4053 /* Clean the entries which depend on the current input in MCTX.
4054 This function must be invoked when the matcher changes the start index
4055 of the input, or changes the input string. */
4058 match_ctx_clean (mctx
)
4059 re_match_context_t
*mctx
;
4061 match_ctx_free_subtops (mctx
);
4062 mctx
->nsub_tops
= 0;
4063 mctx
->nbkref_ents
= 0;
4066 /* Free all the memory associated with MCTX. */
4069 match_ctx_free (mctx
)
4070 re_match_context_t
*mctx
;
4072 match_ctx_free_subtops (mctx
);
4073 re_free (mctx
->sub_tops
);
4074 re_free (mctx
->bkref_ents
);
4077 /* Free all the memory associated with MCTX->SUB_TOPS. */
4080 match_ctx_free_subtops (mctx
)
4081 re_match_context_t
*mctx
;
4084 for (st_idx
= 0; st_idx
< mctx
->nsub_tops
; ++st_idx
)
4087 re_sub_match_top_t
*top
= mctx
->sub_tops
[st_idx
];
4088 for (sl_idx
= 0; sl_idx
< top
->nlasts
; ++sl_idx
)
4090 re_sub_match_last_t
*last
= top
->lasts
[sl_idx
];
4091 re_free (last
->path
.array
);
4094 re_free (top
->lasts
);
4097 re_free (top
->path
->array
);
4098 re_free (top
->path
);
4104 /* Add a new backreference entry to MCTX.
4105 Note that we assume that caller never call this function with duplicate
4106 entry, and call with STR_IDX which isn't smaller than any existing entry.
4109 static reg_errcode_t
4110 match_ctx_add_entry (mctx
, node
, str_idx
, from
, to
)
4111 re_match_context_t
*mctx
;
4112 int node
, str_idx
, from
, to
;
4114 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
4116 struct re_backref_cache_entry
* new_entry
;
4117 new_entry
= re_realloc (mctx
->bkref_ents
, struct re_backref_cache_entry
,
4118 mctx
->abkref_ents
* 2);
4119 if (BE (new_entry
== NULL
, 0))
4121 re_free (mctx
->bkref_ents
);
4124 mctx
->bkref_ents
= new_entry
;
4125 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
4126 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
4127 mctx
->abkref_ents
*= 2;
4129 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
4130 mctx
->bkref_ents
[mctx
->nbkref_ents
].str_idx
= str_idx
;
4131 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_from
= from
;
4132 mctx
->bkref_ents
[mctx
->nbkref_ents
].subexp_to
= to
;
4133 mctx
->bkref_ents
[mctx
->nbkref_ents
++].flag
= 0;
4134 if (mctx
->max_mb_elem_len
< to
- from
)
4135 mctx
->max_mb_elem_len
= to
- from
;
4139 /* Search for the first entry which has the same str_idx.
4140 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4143 search_cur_bkref_entry (mctx
, str_idx
)
4144 re_match_context_t
*mctx
;
4147 int left
, right
, mid
;
4148 right
= mctx
->nbkref_ents
;
4149 for (left
= 0; left
< right
;)
4151 mid
= (left
+ right
) / 2;
4152 if (mctx
->bkref_ents
[mid
].str_idx
< str_idx
)
4161 match_ctx_clear_flag (mctx
)
4162 re_match_context_t
*mctx
;
4165 for (i
= 0; i
< mctx
->nbkref_ents
; ++i
)
4166 mctx
->bkref_ents
[i
].flag
= 0;
4169 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4172 static reg_errcode_t
4173 match_ctx_add_subtop (mctx
, node
, str_idx
)
4174 re_match_context_t
*mctx
;
4178 assert (mctx
->sub_tops
!= NULL
);
4179 assert (mctx
->asub_tops
> 0);
4181 if (BE (mctx
->nsub_tops
== mctx
->asub_tops
, 0))
4183 int new_asub_tops
= mctx
->asub_tops
* 2;
4184 re_sub_match_top_t
**new_array
= re_realloc (mctx
->sub_tops
,
4185 re_sub_match_top_t
*,
4187 if (BE (new_array
== NULL
, 0))
4189 mctx
->sub_tops
= new_array
;
4190 mctx
->asub_tops
= new_asub_tops
;
4192 mctx
->sub_tops
[mctx
->nsub_tops
] = calloc (1, sizeof (re_sub_match_top_t
));
4193 if (BE (mctx
->sub_tops
[mctx
->nsub_tops
] == NULL
, 0))
4195 mctx
->sub_tops
[mctx
->nsub_tops
]->node
= node
;
4196 mctx
->sub_tops
[mctx
->nsub_tops
++]->str_idx
= str_idx
;
4200 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4201 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4203 static re_sub_match_last_t
*
4204 match_ctx_add_sublast (subtop
, node
, str_idx
)
4205 re_sub_match_top_t
*subtop
;
4208 re_sub_match_last_t
*new_entry
;
4209 if (BE (subtop
->nlasts
== subtop
->alasts
, 0))
4211 int new_alasts
= 2 * subtop
->alasts
+ 1;
4212 re_sub_match_last_t
**new_array
= re_realloc (subtop
->lasts
,
4213 re_sub_match_last_t
*,
4215 if (BE (new_array
== NULL
, 0))
4217 subtop
->lasts
= new_array
;
4218 subtop
->alasts
= new_alasts
;
4220 new_entry
= calloc (1, sizeof (re_sub_match_last_t
));
4221 if (BE (new_entry
!= NULL
, 1))
4223 subtop
->lasts
[subtop
->nlasts
] = new_entry
;
4224 new_entry
->node
= node
;
4225 new_entry
->str_idx
= str_idx
;
4232 sift_ctx_init (sctx
, sifted_sts
, limited_sts
, last_node
, last_str_idx
,
4234 re_sift_context_t
*sctx
;
4235 re_dfastate_t
**sifted_sts
, **limited_sts
;
4236 int last_node
, last_str_idx
, check_subexp
;
4238 sctx
->sifted_states
= sifted_sts
;
4239 sctx
->limited_states
= limited_sts
;
4240 sctx
->last_node
= last_node
;
4241 sctx
->last_str_idx
= last_str_idx
;
4242 sctx
->check_subexp
= check_subexp
;
4243 sctx
->cur_bkref
= -1;
4244 sctx
->cls_subexp_idx
= -1;
4245 re_node_set_init_empty (&sctx
->limits
);