]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regexec.c
Update.
[thirdparty/glibc.git] / posix / regexec.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static void match_ctx_free_subtops (re_match_context_t *mctx)
26 internal_function;
27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
28 int str_idx, int from, int to)
29 internal_function;
30 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
31 internal_function;
32 static void match_ctx_clear_flag (re_match_context_t *mctx) internal_function;
33 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
34 int str_idx) internal_function;
35 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
36 int node, int str_idx)
37 internal_function;
38 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
39 re_dfastate_t **limited_sts, int last_node,
40 int last_str_idx, int check_subexp)
41 internal_function;
42 static reg_errcode_t re_search_internal (const regex_t *preg,
43 const char *string, int length,
44 int start, int range, int stop,
45 size_t nmatch, regmatch_t pmatch[],
46 int eflags) internal_function;
47 static int re_search_2_stub (struct re_pattern_buffer *bufp,
48 const char *string1, int length1,
49 const char *string2, int length2,
50 int start, int range, struct re_registers *regs,
51 int stop, int ret_len) internal_function;
52 static int re_search_stub (struct re_pattern_buffer *bufp,
53 const char *string, int length, int start,
54 int range, int stop, struct re_registers *regs,
55 int ret_len) internal_function;
56 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
57 int nregs, int regs_allocated) internal_function;
58 static inline re_dfastate_t *acquire_init_state_context
59 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
60 __attribute ((always_inline)) internal_function;
61 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
62 internal_function;
63 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
64 int *p_match_first)
65 internal_function;
66 static int check_halt_node_context (const re_dfa_t *dfa, int node,
67 unsigned int context) internal_function;
68 static int check_halt_state_context (const re_match_context_t *mctx,
69 const re_dfastate_t *state, int idx)
70 internal_function;
71 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
72 regmatch_t *prev_idx_match, int cur_node,
73 int cur_idx, int nmatch) internal_function;
74 static int proceed_next_node (const re_match_context_t *mctx,
75 int nregs, regmatch_t *regs,
76 int *pidx, int node, re_node_set *eps_via_nodes,
77 struct re_fail_stack_t *fs) internal_function;
78 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
79 int str_idx, int *dests, int nregs,
80 regmatch_t *regs,
81 re_node_set *eps_via_nodes) internal_function;
82 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
83 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
84 static reg_errcode_t set_regs (const regex_t *preg,
85 const re_match_context_t *mctx,
86 size_t nmatch, regmatch_t *pmatch,
87 int fl_backtrack) internal_function;
88 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
89
90 #ifdef RE_ENABLE_I18N
91 static int sift_states_iter_mb (const re_match_context_t *mctx,
92 re_sift_context_t *sctx,
93 int node_idx, int str_idx, int max_str_idx) internal_function;
94 #endif /* RE_ENABLE_I18N */
95 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
96 re_sift_context_t *sctx) internal_function;
97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
98 re_sift_context_t *sctx,
99 int str_idx,
100 re_node_set *dest_nodes) internal_function;
101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
102 re_node_set *dest_nodes,
103 const re_node_set *candidates) internal_function;
104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
105 re_node_set *dest_nodes,
106 const re_node_set *and_nodes) internal_function;
107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
111 int limit, re_node_set *eclosures,
112 int subexp_idx, int node, int str_idx) internal_function;
113 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
114 re_node_set *dest_nodes,
115 const re_node_set *candidates,
116 re_node_set *limits,
117 struct re_backref_cache_entry *bkref_ents,
118 int str_idx) internal_function;
119 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
120 re_sift_context_t *sctx,
121 int str_idx, re_node_set *dest_nodes) internal_function;
122 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
123 int next_state_log_idx) internal_function;
124 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
125 re_dfastate_t **src, int num) internal_function;
126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
128 static re_dfastate_t *transit_state (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *state) internal_function;
131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state) internal_function;
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137 #if 0
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate) internal_function;
141 #endif
142 #ifdef RE_ENABLE_I18N
143 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
145 #endif /* RE_ENABLE_I18N */
146 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
147 const re_node_set *nodes) internal_function;
148 static reg_errcode_t get_subexp (re_match_context_t *mctx,
149 int bkref_node, int bkref_str_idx) internal_function;
150 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
151 const re_sub_match_top_t *sub_top,
152 re_sub_match_last_t *sub_last,
153 int bkref_node, int bkref_str) internal_function;
154 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
155 int subexp_idx, int type) internal_function;
156 static reg_errcode_t check_arrival (re_match_context_t *mctx,
157 state_array_t *path, int top_node,
158 int top_str, int last_node, int last_str,
159 int type) internal_function;
160 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
161 int str_idx,
162 re_node_set *cur_nodes,
163 re_node_set *next_nodes) internal_function;
164 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
165 re_node_set *cur_nodes,
166 int ex_subexp, int type) internal_function;
167 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
168 re_node_set *dst_nodes,
169 int target, int ex_subexp,
170 int type) internal_function;
171 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
172 re_node_set *cur_nodes, int cur_str,
173 int last_str, int subexp_num,
174 int type) internal_function;
175 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
176 re_dfastate_t *state) internal_function;
177 #ifdef RE_ENABLE_I18N
178 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
179 const re_string_t *input, int idx) internal_function;
180 # ifdef _LIBC
181 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
182 size_t name_len) internal_function;
183 # endif /* _LIBC */
184 #endif /* RE_ENABLE_I18N */
185 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
186 const re_dfastate_t *state,
187 re_node_set *states_node,
188 bitset *states_ch) internal_function;
189 static int check_node_accept (const re_match_context_t *mctx,
190 const re_token_t *node, int idx) internal_function;
191 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
192 \f
193 /* Entry point for POSIX code. */
194
195 /* regexec searches for a given pattern, specified by PREG, in the
196 string STRING.
197
198 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
199 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
200 least NMATCH elements, and we set them to the offsets of the
201 corresponding matched substrings.
202
203 EFLAGS specifies `execution flags' which affect matching: if
204 REG_NOTBOL is set, then ^ does not match at the beginning of the
205 string; if REG_NOTEOL is set, then $ does not match at the end.
206
207 We return 0 if we find a match and REG_NOMATCH if not. */
208
209 int
210 regexec (preg, string, nmatch, pmatch, eflags)
211 const regex_t *__restrict preg;
212 const char *__restrict string;
213 size_t nmatch;
214 regmatch_t pmatch[];
215 int eflags;
216 {
217 reg_errcode_t err;
218 int start, length;
219 if (eflags & REG_STARTEND)
220 {
221 start = pmatch[0].rm_so;
222 length = pmatch[0].rm_eo;
223 }
224 else
225 {
226 start = 0;
227 length = strlen (string);
228 }
229 if (preg->no_sub)
230 err = re_search_internal (preg, string, length, start, length - start,
231 length, 0, NULL, eflags);
232 else
233 err = re_search_internal (preg, string, length, start, length - start,
234 length, nmatch, pmatch, eflags);
235 return err != REG_NOERROR;
236 }
237 #ifdef _LIBC
238 weak_alias (__regexec, regexec)
239 #endif
240
241 /* Entry points for GNU code. */
242
243 /* re_match, re_search, re_match_2, re_search_2
244
245 The former two functions operate on STRING with length LENGTH,
246 while the later two operate on concatenation of STRING1 and STRING2
247 with lengths LENGTH1 and LENGTH2, respectively.
248
249 re_match() matches the compiled pattern in BUFP against the string,
250 starting at index START.
251
252 re_search() first tries matching at index START, then it tries to match
253 starting from index START + 1, and so on. The last start position tried
254 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
255 way as re_match().)
256
257 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
258 the first STOP characters of the concatenation of the strings should be
259 concerned.
260
261 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
262 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
263 computed relative to the concatenation, not relative to the individual
264 strings.)
265
266 On success, re_match* functions return the length of the match, re_search*
267 return the position of the start of the match. Return value -1 means no
268 match was found and -2 indicates an internal error. */
269
270 int
271 re_match (bufp, string, length, start, regs)
272 struct re_pattern_buffer *bufp;
273 const char *string;
274 int length, start;
275 struct re_registers *regs;
276 {
277 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
278 }
279 #ifdef _LIBC
280 weak_alias (__re_match, re_match)
281 #endif
282
283 int
284 re_search (bufp, string, length, start, range, regs)
285 struct re_pattern_buffer *bufp;
286 const char *string;
287 int length, start, range;
288 struct re_registers *regs;
289 {
290 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
291 }
292 #ifdef _LIBC
293 weak_alias (__re_search, re_search)
294 #endif
295
296 int
297 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
298 struct re_pattern_buffer *bufp;
299 const char *string1, *string2;
300 int length1, length2, start, stop;
301 struct re_registers *regs;
302 {
303 return re_search_2_stub (bufp, string1, length1, string2, length2,
304 start, 0, regs, stop, 1);
305 }
306 #ifdef _LIBC
307 weak_alias (__re_match_2, re_match_2)
308 #endif
309
310 int
311 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
312 struct re_pattern_buffer *bufp;
313 const char *string1, *string2;
314 int length1, length2, start, range, stop;
315 struct re_registers *regs;
316 {
317 return re_search_2_stub (bufp, string1, length1, string2, length2,
318 start, range, regs, stop, 0);
319 }
320 #ifdef _LIBC
321 weak_alias (__re_search_2, re_search_2)
322 #endif
323
324 static int
325 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
326 stop, ret_len)
327 struct re_pattern_buffer *bufp;
328 const char *string1, *string2;
329 int length1, length2, start, range, stop, ret_len;
330 struct re_registers *regs;
331 {
332 const char *str;
333 int rval;
334 int len = length1 + length2;
335 int free_str = 0;
336
337 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
338 return -2;
339
340 /* Concatenate the strings. */
341 if (length2 > 0)
342 if (length1 > 0)
343 {
344 char *s = re_malloc (char, len);
345
346 if (BE (s == NULL, 0))
347 return -2;
348 memcpy (s, string1, length1);
349 memcpy (s + length1, string2, length2);
350 str = s;
351 free_str = 1;
352 }
353 else
354 str = string2;
355 else
356 str = string1;
357
358 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
359 ret_len);
360 if (free_str)
361 re_free ((char *) str);
362 return rval;
363 }
364
365 /* The parameters have the same meaning as those of re_search.
366 Additional parameters:
367 If RET_LEN is nonzero the length of the match is returned (re_match style);
368 otherwise the position of the match is returned. */
369
370 static int
371 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
372 struct re_pattern_buffer *bufp;
373 const char *string;
374 int length, start, range, stop, ret_len;
375 struct re_registers *regs;
376 {
377 reg_errcode_t result;
378 regmatch_t *pmatch;
379 int nregs, rval;
380 int eflags = 0;
381
382 /* Check for out-of-range. */
383 if (BE (start < 0 || start > length, 0))
384 return -1;
385 if (BE (start + range > length, 0))
386 range = length - start;
387 else if (BE (start + range < 0, 0))
388 range = -start;
389
390 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
391 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
392
393 /* Compile fastmap if we haven't yet. */
394 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
395 re_compile_fastmap (bufp);
396
397 if (BE (bufp->no_sub, 0))
398 regs = NULL;
399
400 /* We need at least 1 register. */
401 if (regs == NULL)
402 nregs = 1;
403 else if (BE (bufp->regs_allocated == REGS_FIXED &&
404 regs->num_regs < bufp->re_nsub + 1, 0))
405 {
406 nregs = regs->num_regs;
407 if (BE (nregs < 1, 0))
408 {
409 /* Nothing can be copied to regs. */
410 regs = NULL;
411 nregs = 1;
412 }
413 }
414 else
415 nregs = bufp->re_nsub + 1;
416 pmatch = re_malloc (regmatch_t, nregs);
417 if (BE (pmatch == NULL, 0))
418 return -2;
419
420 result = re_search_internal (bufp, string, length, start, range, stop,
421 nregs, pmatch, eflags);
422
423 rval = 0;
424
425 /* I hope we needn't fill ther regs with -1's when no match was found. */
426 if (result != REG_NOERROR)
427 rval = -1;
428 else if (regs != NULL)
429 {
430 /* If caller wants register contents data back, copy them. */
431 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
432 bufp->regs_allocated);
433 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
434 rval = -2;
435 }
436
437 if (BE (rval == 0, 1))
438 {
439 if (ret_len)
440 {
441 assert (pmatch[0].rm_so == start);
442 rval = pmatch[0].rm_eo - start;
443 }
444 else
445 rval = pmatch[0].rm_so;
446 }
447 re_free (pmatch);
448 return rval;
449 }
450
451 static unsigned
452 re_copy_regs (regs, pmatch, nregs, regs_allocated)
453 struct re_registers *regs;
454 regmatch_t *pmatch;
455 int nregs, regs_allocated;
456 {
457 int rval = REGS_REALLOCATE;
458 int i;
459 int need_regs = nregs + 1;
460 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
461 uses. */
462
463 /* Have the register data arrays been allocated? */
464 if (regs_allocated == REGS_UNALLOCATED)
465 { /* No. So allocate them with malloc. */
466 regs->start = re_malloc (regoff_t, need_regs);
467 regs->end = re_malloc (regoff_t, need_regs);
468 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
469 return REGS_UNALLOCATED;
470 regs->num_regs = need_regs;
471 }
472 else if (regs_allocated == REGS_REALLOCATE)
473 { /* Yes. If we need more elements than were already
474 allocated, reallocate them. If we need fewer, just
475 leave it alone. */
476 if (BE (need_regs > regs->num_regs, 0))
477 {
478 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
479 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
480 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
481 return REGS_UNALLOCATED;
482 regs->start = new_start;
483 regs->end = new_end;
484 regs->num_regs = need_regs;
485 }
486 }
487 else
488 {
489 assert (regs_allocated == REGS_FIXED);
490 /* This function may not be called with REGS_FIXED and nregs too big. */
491 assert (regs->num_regs >= nregs);
492 rval = REGS_FIXED;
493 }
494
495 /* Copy the regs. */
496 for (i = 0; i < nregs; ++i)
497 {
498 regs->start[i] = pmatch[i].rm_so;
499 regs->end[i] = pmatch[i].rm_eo;
500 }
501 for ( ; i < regs->num_regs; ++i)
502 regs->start[i] = regs->end[i] = -1;
503
504 return rval;
505 }
506
507 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
508 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
509 this memory for recording register information. STARTS and ENDS
510 must be allocated using the malloc library routine, and must each
511 be at least NUM_REGS * sizeof (regoff_t) bytes long.
512
513 If NUM_REGS == 0, then subsequent matches should allocate their own
514 register data.
515
516 Unless this function is called, the first search or match using
517 PATTERN_BUFFER will allocate its own register data, without
518 freeing the old data. */
519
520 void
521 re_set_registers (bufp, regs, num_regs, starts, ends)
522 struct re_pattern_buffer *bufp;
523 struct re_registers *regs;
524 unsigned num_regs;
525 regoff_t *starts, *ends;
526 {
527 if (num_regs)
528 {
529 bufp->regs_allocated = REGS_REALLOCATE;
530 regs->num_regs = num_regs;
531 regs->start = starts;
532 regs->end = ends;
533 }
534 else
535 {
536 bufp->regs_allocated = REGS_UNALLOCATED;
537 regs->num_regs = 0;
538 regs->start = regs->end = (regoff_t *) 0;
539 }
540 }
541 #ifdef _LIBC
542 weak_alias (__re_set_registers, re_set_registers)
543 #endif
544 \f
545 /* Entry points compatible with 4.2 BSD regex library. We don't define
546 them unless specifically requested. */
547
548 #if defined _REGEX_RE_COMP || defined _LIBC
549 int
550 # ifdef _LIBC
551 weak_function
552 # endif
553 re_exec (s)
554 const char *s;
555 {
556 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
557 }
558 #endif /* _REGEX_RE_COMP */
559 \f
560 static re_node_set empty_set;
561
562 /* Internal entry point. */
563
564 /* Searches for a compiled pattern PREG in the string STRING, whose
565 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
566 mingings with regexec. START, and RANGE have the same meanings
567 with re_search.
568 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
569 otherwise return the error code.
570 Note: We assume front end functions already check ranges.
571 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
572
573 static reg_errcode_t
574 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
575 eflags)
576 const regex_t *preg;
577 const char *string;
578 int length, start, range, stop, eflags;
579 size_t nmatch;
580 regmatch_t pmatch[];
581 {
582 reg_errcode_t err;
583 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
584 int left_lim, right_lim, incr;
585 int fl_longest_match, match_first, match_last = -1;
586 int fast_translate, sb;
587 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
588 re_match_context_t mctx = { .dfa = dfa };
589 #else
590 re_match_context_t mctx;
591 #endif
592 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
593 && range && !preg->can_be_null) ? preg->fastmap : NULL);
594
595 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
596 memset (&mctx, '\0', sizeof (re_match_context_t));
597 mctx.dfa = dfa;
598 #endif
599
600 /* Check if the DFA haven't been compiled. */
601 if (BE (preg->used == 0 || dfa->init_state == NULL
602 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
603 || dfa->init_state_begbuf == NULL, 0))
604 return REG_NOMATCH;
605
606 #ifdef DEBUG
607 /* We assume front-end functions already check them. */
608 assert (start + range >= 0 && start + range <= length);
609 #endif
610
611 /* If initial states with non-begbuf contexts have no elements,
612 the regex must be anchored. If preg->newline_anchor is set,
613 we'll never use init_state_nl, so do not check it. */
614 if (dfa->init_state->nodes.nelem == 0
615 && dfa->init_state_word->nodes.nelem == 0
616 && (dfa->init_state_nl->nodes.nelem == 0
617 || !preg->newline_anchor))
618 {
619 if (start != 0 && start + range != 0)
620 return REG_NOMATCH;
621 start = range = 0;
622 }
623
624 re_node_set_init_empty (&empty_set);
625
626 /* We must check the longest matching, if nmatch > 0. */
627 fl_longest_match = (nmatch != 0 || dfa->nbackref);
628
629 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
630 preg->translate, preg->syntax & RE_ICASE, dfa);
631 if (BE (err != REG_NOERROR, 0))
632 goto free_return;
633 mctx.input.stop = stop;
634 mctx.input.raw_stop = stop;
635 mctx.input.newline_anchor = preg->newline_anchor;
636
637 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
638 if (BE (err != REG_NOERROR, 0))
639 goto free_return;
640
641 /* We will log all the DFA states through which the dfa pass,
642 if nmatch > 1, or this dfa has "multibyte node", which is a
643 back-reference or a node which can accept multibyte character or
644 multi character collating element. */
645 if (nmatch > 1 || dfa->has_mb_node)
646 {
647 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
648 if (BE (mctx.state_log == NULL, 0))
649 {
650 err = REG_ESPACE;
651 goto free_return;
652 }
653 }
654 else
655 mctx.state_log = NULL;
656
657 match_first = start;
658 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
659 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
660
661 /* Check incrementally whether of not the input string match. */
662 incr = (range < 0) ? -1 : 1;
663 left_lim = (range < 0) ? start + range : start;
664 right_lim = (range < 0) ? start : start + range;
665 sb = dfa->mb_cur_max == 1;
666 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
667
668 for (;;)
669 {
670 /* At first get the current byte from input string. */
671 if (fastmap)
672 {
673 if (BE (fast_translate, 1))
674 {
675 unsigned RE_TRANSLATE_TYPE t
676 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
677 if (BE (range >= 0, 1))
678 {
679 if (BE (t != NULL, 0))
680 {
681 while (BE (match_first < right_lim, 1)
682 && !fastmap[t[(unsigned char) string[match_first]]])
683 ++match_first;
684 }
685 else
686 {
687 while (BE (match_first < right_lim, 1)
688 && !fastmap[(unsigned char) string[match_first]])
689 ++match_first;
690 }
691 if (BE (match_first == right_lim, 0))
692 {
693 int ch = match_first >= length
694 ? 0 : (unsigned char) string[match_first];
695 if (!fastmap[t ? t[ch] : ch])
696 break;
697 }
698 }
699 else
700 {
701 while (match_first >= left_lim)
702 {
703 int ch = match_first >= length
704 ? 0 : (unsigned char) string[match_first];
705 if (fastmap[t ? t[ch] : ch])
706 break;
707 --match_first;
708 }
709 if (match_first < left_lim)
710 break;
711 }
712 }
713 else
714 {
715 int ch;
716
717 do
718 {
719 /* In this case, we can't determine easily the current byte,
720 since it might be a component byte of a multibyte
721 character. Then we use the constructed buffer
722 instead. */
723 /* If MATCH_FIRST is out of the valid range, reconstruct the
724 buffers. */
725 if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
726 <= match_first
727 || match_first < mctx.input.raw_mbs_idx)
728 {
729 err = re_string_reconstruct (&mctx.input, match_first,
730 eflags);
731 if (BE (err != REG_NOERROR, 0))
732 goto free_return;
733 }
734 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
735 Note that MATCH_FIRST must not be smaller than 0. */
736 ch = ((match_first >= length) ? 0
737 : re_string_byte_at (&mctx.input,
738 match_first
739 - mctx.input.raw_mbs_idx));
740 if (fastmap[ch])
741 break;
742 match_first += incr;
743 }
744 while (match_first >= left_lim && match_first <= right_lim);
745 if (! fastmap[ch])
746 break;
747 }
748 }
749
750 /* Reconstruct the buffers so that the matcher can assume that
751 the matching starts from the beginning of the buffer. */
752 err = re_string_reconstruct (&mctx.input, match_first, eflags);
753 if (BE (err != REG_NOERROR, 0))
754 goto free_return;
755 #ifdef RE_ENABLE_I18N
756 /* Eliminate it when it is a component of a multibyte character
757 and isn't the head of a multibyte character. */
758 if (sb || re_string_first_byte (&mctx.input, 0))
759 #endif
760 {
761 /* It seems to be appropriate one, then use the matcher. */
762 /* We assume that the matching starts from 0. */
763 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
764 match_last = check_matching (&mctx, fl_longest_match,
765 range >= 0 ? &match_first : NULL);
766 if (match_last != -1)
767 {
768 if (BE (match_last == -2, 0))
769 {
770 err = REG_ESPACE;
771 goto free_return;
772 }
773 else
774 {
775 mctx.match_last = match_last;
776 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
777 {
778 re_dfastate_t *pstate = mctx.state_log[match_last];
779 mctx.last_node = check_halt_state_context (&mctx, pstate,
780 match_last);
781 }
782 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
783 || dfa->nbackref)
784 {
785 err = prune_impossible_nodes (&mctx);
786 if (err == REG_NOERROR)
787 break;
788 if (BE (err != REG_NOMATCH, 0))
789 goto free_return;
790 match_last = -1;
791 }
792 else
793 break; /* We found a match. */
794 }
795 }
796 match_ctx_clean (&mctx);
797 }
798 /* Update counter. */
799 match_first += incr;
800 if (match_first < left_lim || right_lim < match_first)
801 break;
802 }
803
804 /* Set pmatch[] if we need. */
805 if (match_last != -1 && nmatch > 0)
806 {
807 int reg_idx;
808
809 /* Initialize registers. */
810 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
811 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
812
813 /* Set the points where matching start/end. */
814 pmatch[0].rm_so = 0;
815 pmatch[0].rm_eo = mctx.match_last;
816
817 if (!preg->no_sub && nmatch > 1)
818 {
819 err = set_regs (preg, &mctx, nmatch, pmatch,
820 dfa->has_plural_match && dfa->nbackref > 0);
821 if (BE (err != REG_NOERROR, 0))
822 goto free_return;
823 }
824
825 /* At last, add the offset to the each registers, since we slided
826 the buffers so that we could assume that the matching starts
827 from 0. */
828 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
829 if (pmatch[reg_idx].rm_so != -1)
830 {
831 #ifdef RE_ENABLE_I18N
832 if (BE (mctx.input.offsets_needed != 0, 0))
833 {
834 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
835 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
836 else
837 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
838 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
839 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
840 else
841 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
842 }
843 #else
844 assert (mctx.input.offsets_needed == 0);
845 #endif
846 pmatch[reg_idx].rm_so += match_first;
847 pmatch[reg_idx].rm_eo += match_first;
848 }
849 }
850 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
851 free_return:
852 re_free (mctx.state_log);
853 if (dfa->nbackref)
854 match_ctx_free (&mctx);
855 re_string_destruct (&mctx.input);
856 return err;
857 }
858
859 static reg_errcode_t
860 prune_impossible_nodes (mctx)
861 re_match_context_t *mctx;
862 {
863 re_dfa_t *const dfa = mctx->dfa;
864 int halt_node, match_last;
865 reg_errcode_t ret;
866 re_dfastate_t **sifted_states;
867 re_dfastate_t **lim_states = NULL;
868 re_sift_context_t sctx;
869 #ifdef DEBUG
870 assert (mctx->state_log != NULL);
871 #endif
872 match_last = mctx->match_last;
873 halt_node = mctx->last_node;
874 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
875 if (BE (sifted_states == NULL, 0))
876 {
877 ret = REG_ESPACE;
878 goto free_return;
879 }
880 if (dfa->nbackref)
881 {
882 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
883 if (BE (lim_states == NULL, 0))
884 {
885 ret = REG_ESPACE;
886 goto free_return;
887 }
888 while (1)
889 {
890 memset (lim_states, '\0',
891 sizeof (re_dfastate_t *) * (match_last + 1));
892 match_ctx_clear_flag (mctx);
893 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
894 match_last, 0);
895 ret = sift_states_backward (mctx, &sctx);
896 re_node_set_free (&sctx.limits);
897 if (BE (ret != REG_NOERROR, 0))
898 goto free_return;
899 if (sifted_states[0] != NULL || lim_states[0] != NULL)
900 break;
901 do
902 {
903 --match_last;
904 if (match_last < 0)
905 {
906 ret = REG_NOMATCH;
907 goto free_return;
908 }
909 } while (mctx->state_log[match_last] == NULL
910 || !mctx->state_log[match_last]->halt);
911 halt_node = check_halt_state_context (mctx,
912 mctx->state_log[match_last],
913 match_last);
914 }
915 ret = merge_state_array (dfa, sifted_states, lim_states,
916 match_last + 1);
917 re_free (lim_states);
918 lim_states = NULL;
919 if (BE (ret != REG_NOERROR, 0))
920 goto free_return;
921 }
922 else
923 {
924 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
925 match_last, 0);
926 ret = sift_states_backward (mctx, &sctx);
927 re_node_set_free (&sctx.limits);
928 if (BE (ret != REG_NOERROR, 0))
929 goto free_return;
930 }
931 re_free (mctx->state_log);
932 mctx->state_log = sifted_states;
933 sifted_states = NULL;
934 mctx->last_node = halt_node;
935 mctx->match_last = match_last;
936 ret = REG_NOERROR;
937 free_return:
938 re_free (sifted_states);
939 re_free (lim_states);
940 return ret;
941 }
942
943 /* Acquire an initial state and return it.
944 We must select appropriate initial state depending on the context,
945 since initial states may have constraints like "\<", "^", etc.. */
946
947 static inline re_dfastate_t *
948 acquire_init_state_context (err, mctx, idx)
949 reg_errcode_t *err;
950 const re_match_context_t *mctx;
951 int idx;
952 {
953 re_dfa_t *const dfa = mctx->dfa;
954 if (dfa->init_state->has_constraint)
955 {
956 unsigned int context;
957 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
958 if (IS_WORD_CONTEXT (context))
959 return dfa->init_state_word;
960 else if (IS_ORDINARY_CONTEXT (context))
961 return dfa->init_state;
962 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
963 return dfa->init_state_begbuf;
964 else if (IS_NEWLINE_CONTEXT (context))
965 return dfa->init_state_nl;
966 else if (IS_BEGBUF_CONTEXT (context))
967 {
968 /* It is relatively rare case, then calculate on demand. */
969 return re_acquire_state_context (err, dfa,
970 dfa->init_state->entrance_nodes,
971 context);
972 }
973 else
974 /* Must not happen? */
975 return dfa->init_state;
976 }
977 else
978 return dfa->init_state;
979 }
980
981 /* Check whether the regular expression match input string INPUT or not,
982 and return the index where the matching end, return -1 if not match,
983 or return -2 in case of an error.
984 FL_LONGEST_MATCH means we want the POSIX longest matching.
985 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
986 next place where we may want to try matching.
987 Note that the matcher assume that the maching starts from the current
988 index of the buffer. */
989
990 static int
991 check_matching (mctx, fl_longest_match, p_match_first)
992 re_match_context_t *mctx;
993 int fl_longest_match;
994 int *p_match_first;
995 {
996 re_dfa_t *const dfa = mctx->dfa;
997 reg_errcode_t err;
998 int match = 0;
999 int match_last = -1;
1000 int cur_str_idx = re_string_cur_idx (&mctx->input);
1001 re_dfastate_t *cur_state;
1002 int at_init_state = p_match_first != NULL;
1003 int next_start_idx = cur_str_idx;
1004
1005 err = REG_NOERROR;
1006 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1007 /* An initial state must not be NULL (invalid). */
1008 if (BE (cur_state == NULL, 0))
1009 {
1010 assert (err == REG_ESPACE);
1011 return -2;
1012 }
1013
1014 if (mctx->state_log != NULL)
1015 {
1016 mctx->state_log[cur_str_idx] = cur_state;
1017
1018 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1019 later. E.g. Processing back references. */
1020 if (BE (dfa->nbackref, 0))
1021 {
1022 at_init_state = 0;
1023 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1024 if (BE (err != REG_NOERROR, 0))
1025 return err;
1026
1027 if (cur_state->has_backref)
1028 {
1029 err = transit_state_bkref (mctx, &cur_state->nodes);
1030 if (BE (err != REG_NOERROR, 0))
1031 return err;
1032 }
1033 }
1034 }
1035
1036 /* If the RE accepts NULL string. */
1037 if (BE (cur_state->halt, 0))
1038 {
1039 if (!cur_state->has_constraint
1040 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1041 {
1042 if (!fl_longest_match)
1043 return cur_str_idx;
1044 else
1045 {
1046 match_last = cur_str_idx;
1047 match = 1;
1048 }
1049 }
1050 }
1051
1052 while (!re_string_eoi (&mctx->input))
1053 {
1054 re_dfastate_t *old_state = cur_state;
1055 cur_state = transit_state (&err, mctx, cur_state);
1056 if (mctx->state_log != NULL)
1057 cur_state = merge_state_with_log (&err, mctx, cur_state);
1058
1059 if (cur_state == NULL)
1060 {
1061 /* Reached the invalid state or an error. Try to recover a valid
1062 state using the state log, if available and if we have not
1063 already found a valid (even if not the longest) match. */
1064 if (BE (err != REG_NOERROR, 0))
1065 return -2;
1066
1067 if (mctx->state_log == NULL
1068 || (match && !fl_longest_match)
1069 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1070 break;
1071 }
1072
1073 if (at_init_state)
1074 {
1075 if (old_state == cur_state)
1076 next_start_idx = re_string_cur_idx (&mctx->input);
1077 else
1078 at_init_state = 0;
1079 }
1080
1081 if (cur_state->halt)
1082 {
1083 /* Reached a halt state.
1084 Check the halt state can satisfy the current context. */
1085 if (!cur_state->has_constraint
1086 || check_halt_state_context (mctx, cur_state,
1087 re_string_cur_idx (&mctx->input)))
1088 {
1089 /* We found an appropriate halt state. */
1090 match_last = re_string_cur_idx (&mctx->input);
1091 match = 1;
1092 if (!fl_longest_match)
1093 break;
1094 }
1095 }
1096 }
1097
1098 if (match_last == -1 && p_match_first)
1099 *p_match_first += next_start_idx;
1100
1101 return match_last;
1102 }
1103
1104 /* Check NODE match the current context. */
1105
1106 static int check_halt_node_context (dfa, node, context)
1107 const re_dfa_t *dfa;
1108 int node;
1109 unsigned int context;
1110 {
1111 re_token_type_t type = dfa->nodes[node].type;
1112 unsigned int constraint = dfa->nodes[node].constraint;
1113 if (type != END_OF_RE)
1114 return 0;
1115 if (!constraint)
1116 return 1;
1117 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1118 return 0;
1119 return 1;
1120 }
1121
1122 /* Check the halt state STATE match the current context.
1123 Return 0 if not match, if the node, STATE has, is a halt node and
1124 match the context, return the node. */
1125
1126 static int
1127 check_halt_state_context (mctx, state, idx)
1128 const re_match_context_t *mctx;
1129 const re_dfastate_t *state;
1130 int idx;
1131 {
1132 int i;
1133 unsigned int context;
1134 #ifdef DEBUG
1135 assert (state->halt);
1136 #endif
1137 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1138 for (i = 0; i < state->nodes.nelem; ++i)
1139 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1140 return state->nodes.elems[i];
1141 return 0;
1142 }
1143
1144 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1145 corresponding to the DFA).
1146 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1147 of errors. */
1148
1149 static int
1150 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1151 const re_match_context_t *mctx;
1152 regmatch_t *regs;
1153 int nregs, *pidx, node;
1154 re_node_set *eps_via_nodes;
1155 struct re_fail_stack_t *fs;
1156 {
1157 re_dfa_t *const dfa = mctx->dfa;
1158 int i, err, dest_node;
1159 dest_node = -1;
1160 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1161 {
1162 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1163 int ndest, dest_nodes[2];
1164 err = re_node_set_insert (eps_via_nodes, node);
1165 if (BE (err < 0, 0))
1166 return -2;
1167 /* Pick up valid destinations. */
1168 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1169 {
1170 int candidate = dfa->edests[node].elems[i];
1171 if (!re_node_set_contains (cur_nodes, candidate))
1172 continue;
1173 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1174 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1175 ++ndest;
1176 }
1177 if (ndest <= 1)
1178 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1179 /* In order to avoid infinite loop like "(a*)*". */
1180 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1181 return dest_nodes[1];
1182 if (fs != NULL
1183 && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1184 eps_via_nodes))
1185 return -2;
1186 return dest_nodes[0];
1187 }
1188 else
1189 {
1190 int naccepted = 0;
1191 re_token_type_t type = dfa->nodes[node].type;
1192
1193 #ifdef RE_ENABLE_I18N
1194 if (ACCEPT_MB_NODE (type))
1195 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1196 else
1197 #endif /* RE_ENABLE_I18N */
1198 if (type == OP_BACK_REF)
1199 {
1200 int subexp_idx = dfa->nodes[node].opr.idx;
1201 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1202 if (fs != NULL)
1203 {
1204 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1205 return -1;
1206 else if (naccepted)
1207 {
1208 char *buf = (char *) re_string_get_buffer (&mctx->input);
1209 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1210 naccepted) != 0)
1211 return -1;
1212 }
1213 }
1214
1215 if (naccepted == 0)
1216 {
1217 err = re_node_set_insert (eps_via_nodes, node);
1218 if (BE (err < 0, 0))
1219 return -2;
1220 dest_node = dfa->edests[node].elems[0];
1221 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1222 dest_node))
1223 return dest_node;
1224 }
1225 }
1226
1227 if (naccepted != 0
1228 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1229 {
1230 dest_node = dfa->nexts[node];
1231 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1232 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1233 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1234 dest_node)))
1235 return -1;
1236 re_node_set_empty (eps_via_nodes);
1237 return dest_node;
1238 }
1239 }
1240 return -1;
1241 }
1242
1243 static reg_errcode_t
1244 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1245 struct re_fail_stack_t *fs;
1246 int str_idx, *dests, nregs;
1247 regmatch_t *regs;
1248 re_node_set *eps_via_nodes;
1249 {
1250 reg_errcode_t err;
1251 int num = fs->num++;
1252 if (fs->num == fs->alloc)
1253 {
1254 struct re_fail_stack_ent_t *new_array;
1255 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1256 * fs->alloc * 2));
1257 if (new_array == NULL)
1258 return REG_ESPACE;
1259 fs->alloc *= 2;
1260 fs->stack = new_array;
1261 }
1262 fs->stack[num].idx = str_idx;
1263 fs->stack[num].node = dests[1];
1264 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1265 if (fs->stack[num].regs == NULL)
1266 return REG_ESPACE;
1267 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1268 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1269 return err;
1270 }
1271
1272 static int
1273 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1274 struct re_fail_stack_t *fs;
1275 int *pidx, nregs;
1276 regmatch_t *regs;
1277 re_node_set *eps_via_nodes;
1278 {
1279 int num = --fs->num;
1280 assert (num >= 0);
1281 *pidx = fs->stack[num].idx;
1282 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1283 re_node_set_free (eps_via_nodes);
1284 re_free (fs->stack[num].regs);
1285 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1286 return fs->stack[num].node;
1287 }
1288
1289 /* Set the positions where the subexpressions are starts/ends to registers
1290 PMATCH.
1291 Note: We assume that pmatch[0] is already set, and
1292 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1293
1294 static reg_errcode_t
1295 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1296 const regex_t *preg;
1297 const re_match_context_t *mctx;
1298 size_t nmatch;
1299 regmatch_t *pmatch;
1300 int fl_backtrack;
1301 {
1302 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1303 int idx, cur_node, real_nmatch;
1304 re_node_set eps_via_nodes;
1305 struct re_fail_stack_t *fs;
1306 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1307 regmatch_t *prev_idx_match;
1308
1309 #ifdef DEBUG
1310 assert (nmatch > 1);
1311 assert (mctx->state_log != NULL);
1312 #endif
1313 if (fl_backtrack)
1314 {
1315 fs = &fs_body;
1316 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1317 if (fs->stack == NULL)
1318 return REG_ESPACE;
1319 }
1320 else
1321 fs = NULL;
1322
1323 cur_node = dfa->init_node;
1324 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1325 re_node_set_init_empty (&eps_via_nodes);
1326
1327 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1328 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1329
1330 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1331 {
1332 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1333
1334 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1335 {
1336 int reg_idx;
1337 if (fs)
1338 {
1339 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1340 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1341 break;
1342 if (reg_idx == nmatch)
1343 {
1344 re_node_set_free (&eps_via_nodes);
1345 return free_fail_stack_return (fs);
1346 }
1347 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1348 &eps_via_nodes);
1349 }
1350 else
1351 {
1352 re_node_set_free (&eps_via_nodes);
1353 return REG_NOERROR;
1354 }
1355 }
1356
1357 /* Proceed to next node. */
1358 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1359 &eps_via_nodes, fs);
1360
1361 if (BE (cur_node < 0, 0))
1362 {
1363 if (BE (cur_node == -2, 0))
1364 {
1365 re_node_set_free (&eps_via_nodes);
1366 free_fail_stack_return (fs);
1367 return REG_ESPACE;
1368 }
1369 if (fs)
1370 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1371 &eps_via_nodes);
1372 else
1373 {
1374 re_node_set_free (&eps_via_nodes);
1375 return REG_NOMATCH;
1376 }
1377 }
1378 }
1379 re_node_set_free (&eps_via_nodes);
1380 return free_fail_stack_return (fs);
1381 }
1382
1383 static reg_errcode_t
1384 free_fail_stack_return (fs)
1385 struct re_fail_stack_t *fs;
1386 {
1387 if (fs)
1388 {
1389 int fs_idx;
1390 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1391 {
1392 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1393 re_free (fs->stack[fs_idx].regs);
1394 }
1395 re_free (fs->stack);
1396 }
1397 return REG_NOERROR;
1398 }
1399
1400 static void
1401 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1402 re_dfa_t *dfa;
1403 regmatch_t *pmatch, *prev_idx_match;
1404 int cur_node, cur_idx, nmatch;
1405 {
1406 int type = dfa->nodes[cur_node].type;
1407 if (type == OP_OPEN_SUBEXP)
1408 {
1409 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1410
1411 /* We are at the first node of this sub expression. */
1412 if (reg_num < nmatch)
1413 {
1414 pmatch[reg_num].rm_so = cur_idx;
1415 pmatch[reg_num].rm_eo = -1;
1416 }
1417 }
1418 else if (type == OP_CLOSE_SUBEXP)
1419 {
1420 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1421 if (reg_num < nmatch)
1422 {
1423 /* We are at the last node of this sub expression. */
1424 if (pmatch[reg_num].rm_so < cur_idx)
1425 {
1426 pmatch[reg_num].rm_eo = cur_idx;
1427 /* This is a non-empty match or we are not inside an optional
1428 subexpression. Accept this right away. */
1429 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1430 }
1431 else
1432 {
1433 if (dfa->nodes[cur_node].opt_subexp
1434 && prev_idx_match[reg_num].rm_so != -1)
1435 /* We transited through an empty match for an optional
1436 subexpression, like (a?)*, and this is not the subexp's
1437 first match. Copy back the old content of the registers
1438 so that matches of an inner subexpression are undone as
1439 well, like in ((a?))*. */
1440 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1441 else
1442 /* We completed a subexpression, but it may be part of
1443 an optional one, so do not update PREV_IDX_MATCH. */
1444 pmatch[reg_num].rm_eo = cur_idx;
1445 }
1446 }
1447 }
1448 }
1449
1450 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1451 and sift the nodes in each states according to the following rules.
1452 Updated state_log will be wrote to STATE_LOG.
1453
1454 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1455 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1456 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1457 the LAST_NODE, we throw away the node `a'.
1458 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1459 string `s' and transit to `b':
1460 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1461 away the node `a'.
1462 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1463 thrown away, we throw away the node `a'.
1464 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1465 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1466 node `a'.
1467 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1468 we throw away the node `a'. */
1469
1470 #define STATE_NODE_CONTAINS(state,node) \
1471 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1472
1473 static reg_errcode_t
1474 sift_states_backward (mctx, sctx)
1475 re_match_context_t *mctx;
1476 re_sift_context_t *sctx;
1477 {
1478 re_dfa_t *const dfa = mctx->dfa;
1479 reg_errcode_t err;
1480 int null_cnt = 0;
1481 int str_idx = sctx->last_str_idx;
1482 re_node_set cur_dest;
1483 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
1484
1485 #ifdef DEBUG
1486 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1487 #endif
1488 cur_src = &mctx->state_log[str_idx]->nodes;
1489
1490 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1491 transit to the last_node and the last_node itself. */
1492 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1493 if (BE (err != REG_NOERROR, 0))
1494 return err;
1495 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1496 if (BE (err != REG_NOERROR, 0))
1497 goto free_return;
1498
1499 /* Then check each states in the state_log. */
1500 while (str_idx > 0)
1501 {
1502 int i, ret;
1503 /* Update counters. */
1504 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1505 if (null_cnt > mctx->max_mb_elem_len)
1506 {
1507 memset (sctx->sifted_states, '\0',
1508 sizeof (re_dfastate_t *) * str_idx);
1509 re_node_set_free (&cur_dest);
1510 return REG_NOERROR;
1511 }
1512 re_node_set_empty (&cur_dest);
1513 --str_idx;
1514 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1515 : &mctx->state_log[str_idx]->nodes);
1516
1517 /* Then build the next sifted state.
1518 We build the next sifted state on `cur_dest', and update
1519 `sifted_states[str_idx]' with `cur_dest'.
1520 Note:
1521 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1522 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1523 for (i = 0; i < cur_src->nelem; i++)
1524 {
1525 int prev_node = cur_src->elems[i];
1526 int naccepted = 0;
1527 re_token_type_t type = dfa->nodes[prev_node].type;
1528
1529 if (IS_EPSILON_NODE (type))
1530 continue;
1531 #ifdef RE_ENABLE_I18N
1532 /* If the node may accept `multi byte'. */
1533 if (ACCEPT_MB_NODE (type))
1534 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1535 str_idx, sctx->last_str_idx);
1536
1537 #endif /* RE_ENABLE_I18N */
1538 /* We don't check backreferences here.
1539 See update_cur_sifted_state(). */
1540
1541 if (!naccepted
1542 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1543 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1544 dfa->nexts[prev_node]))
1545 naccepted = 1;
1546
1547 if (naccepted == 0)
1548 continue;
1549
1550 if (sctx->limits.nelem)
1551 {
1552 int to_idx = str_idx + naccepted;
1553 if (check_dst_limits (mctx, &sctx->limits,
1554 dfa->nexts[prev_node], to_idx,
1555 prev_node, str_idx))
1556 continue;
1557 }
1558 ret = re_node_set_insert (&cur_dest, prev_node);
1559 if (BE (ret == -1, 0))
1560 {
1561 err = REG_ESPACE;
1562 goto free_return;
1563 }
1564 }
1565
1566 /* Add all the nodes which satisfy the following conditions:
1567 - It can epsilon transit to a node in CUR_DEST.
1568 - It is in CUR_SRC.
1569 And update state_log. */
1570 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1571 if (BE (err != REG_NOERROR, 0))
1572 goto free_return;
1573 }
1574 err = REG_NOERROR;
1575 free_return:
1576 re_node_set_free (&cur_dest);
1577 return err;
1578 }
1579
1580 /* Helper functions. */
1581
1582 static reg_errcode_t
1583 clean_state_log_if_needed (mctx, next_state_log_idx)
1584 re_match_context_t *mctx;
1585 int next_state_log_idx;
1586 {
1587 int top = mctx->state_log_top;
1588
1589 if (next_state_log_idx >= mctx->input.bufs_len
1590 || (next_state_log_idx >= mctx->input.valid_len
1591 && mctx->input.valid_len < mctx->input.len))
1592 {
1593 reg_errcode_t err;
1594 err = extend_buffers (mctx);
1595 if (BE (err != REG_NOERROR, 0))
1596 return err;
1597 }
1598
1599 if (top < next_state_log_idx)
1600 {
1601 memset (mctx->state_log + top + 1, '\0',
1602 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1603 mctx->state_log_top = next_state_log_idx;
1604 }
1605 return REG_NOERROR;
1606 }
1607
1608 static reg_errcode_t
1609 merge_state_array (dfa, dst, src, num)
1610 re_dfa_t *dfa;
1611 re_dfastate_t **dst;
1612 re_dfastate_t **src;
1613 int num;
1614 {
1615 int st_idx;
1616 reg_errcode_t err;
1617 for (st_idx = 0; st_idx < num; ++st_idx)
1618 {
1619 if (dst[st_idx] == NULL)
1620 dst[st_idx] = src[st_idx];
1621 else if (src[st_idx] != NULL)
1622 {
1623 re_node_set merged_set;
1624 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1625 &src[st_idx]->nodes);
1626 if (BE (err != REG_NOERROR, 0))
1627 return err;
1628 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1629 re_node_set_free (&merged_set);
1630 if (BE (err != REG_NOERROR, 0))
1631 return err;
1632 }
1633 }
1634 return REG_NOERROR;
1635 }
1636
1637 static reg_errcode_t
1638 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1639 re_match_context_t *mctx;
1640 re_sift_context_t *sctx;
1641 int str_idx;
1642 re_node_set *dest_nodes;
1643 {
1644 re_dfa_t *const dfa = mctx->dfa;
1645 reg_errcode_t err;
1646 const re_node_set *candidates;
1647 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1648 : &mctx->state_log[str_idx]->nodes);
1649
1650 /* At first, add the nodes which can epsilon transit to a node in
1651 DEST_NODE. */
1652 if (dest_nodes->nelem)
1653 {
1654 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1655 if (BE (err != REG_NOERROR, 0))
1656 return err;
1657 }
1658
1659 /* Then, check the limitations in the current sift_context. */
1660 if (dest_nodes->nelem && sctx->limits.nelem)
1661 {
1662 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1663 mctx->bkref_ents, str_idx);
1664 if (BE (err != REG_NOERROR, 0))
1665 return err;
1666 }
1667
1668 /* Update state_log. */
1669 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1670 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1671 return err;
1672
1673 if ((mctx->state_log[str_idx] != NULL
1674 && mctx->state_log[str_idx]->has_backref))
1675 {
1676 err = sift_states_bkref (mctx, sctx, str_idx, dest_nodes);
1677 if (BE (err != REG_NOERROR, 0))
1678 return err;
1679 }
1680 return REG_NOERROR;
1681 }
1682
1683 static reg_errcode_t
1684 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1685 re_dfa_t *dfa;
1686 re_node_set *dest_nodes;
1687 const re_node_set *candidates;
1688 {
1689 reg_errcode_t err;
1690 int src_idx;
1691 re_node_set src_copy;
1692
1693 err = re_node_set_init_copy (&src_copy, dest_nodes);
1694 if (BE (err != REG_NOERROR, 0))
1695 return err;
1696 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1697 {
1698 err = re_node_set_add_intersect (dest_nodes, candidates,
1699 dfa->inveclosures
1700 + src_copy.elems[src_idx]);
1701 if (BE (err != REG_NOERROR, 0))
1702 {
1703 re_node_set_free (&src_copy);
1704 return err;
1705 }
1706 }
1707 re_node_set_free (&src_copy);
1708 return REG_NOERROR;
1709 }
1710
1711 static reg_errcode_t
1712 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1713 re_dfa_t *dfa;
1714 int node;
1715 re_node_set *dest_nodes;
1716 const re_node_set *candidates;
1717 {
1718 int ecl_idx;
1719 reg_errcode_t err;
1720 re_node_set *inv_eclosure = dfa->inveclosures + node;
1721 re_node_set except_nodes;
1722 re_node_set_init_empty (&except_nodes);
1723 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1724 {
1725 int cur_node = inv_eclosure->elems[ecl_idx];
1726 if (cur_node == node)
1727 continue;
1728 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1729 {
1730 int edst1 = dfa->edests[cur_node].elems[0];
1731 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1732 ? dfa->edests[cur_node].elems[1] : -1);
1733 if ((!re_node_set_contains (inv_eclosure, edst1)
1734 && re_node_set_contains (dest_nodes, edst1))
1735 || (edst2 > 0
1736 && !re_node_set_contains (inv_eclosure, edst2)
1737 && re_node_set_contains (dest_nodes, edst2)))
1738 {
1739 err = re_node_set_add_intersect (&except_nodes, candidates,
1740 dfa->inveclosures + cur_node);
1741 if (BE (err != REG_NOERROR, 0))
1742 {
1743 re_node_set_free (&except_nodes);
1744 return err;
1745 }
1746 }
1747 }
1748 }
1749 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1750 {
1751 int cur_node = inv_eclosure->elems[ecl_idx];
1752 if (!re_node_set_contains (&except_nodes, cur_node))
1753 {
1754 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1755 re_node_set_remove_at (dest_nodes, idx);
1756 }
1757 }
1758 re_node_set_free (&except_nodes);
1759 return REG_NOERROR;
1760 }
1761
1762 static int
1763 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1764 re_match_context_t *mctx;
1765 re_node_set *limits;
1766 int dst_node, dst_idx, src_node, src_idx;
1767 {
1768 re_dfa_t *const dfa = mctx->dfa;
1769 int lim_idx, src_pos, dst_pos;
1770
1771 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1772 {
1773 int subexp_idx;
1774 struct re_backref_cache_entry *ent;
1775 ent = mctx->bkref_ents + limits->elems[lim_idx];
1776 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1777
1778 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1779 dfa->eclosures + dst_node,
1780 subexp_idx, dst_node, dst_idx);
1781 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1782 dfa->eclosures + src_node,
1783 subexp_idx, src_node, src_idx);
1784
1785 /* In case of:
1786 <src> <dst> ( <subexp> )
1787 ( <subexp> ) <src> <dst>
1788 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1789 if (src_pos == dst_pos)
1790 continue; /* This is unrelated limitation. */
1791 else
1792 return 1;
1793 }
1794 return 0;
1795 }
1796
1797 static int
1798 check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node,
1799 str_idx)
1800 re_match_context_t *mctx;
1801 re_node_set *eclosures;
1802 int limit, subexp_idx, from_node, str_idx;
1803 {
1804 re_dfa_t *const dfa = mctx->dfa;
1805 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1806 int node_idx;
1807
1808 /* If we are outside the range of the subexpression, return -1 or 1. */
1809 if (str_idx < lim->subexp_from)
1810 return -1;
1811
1812 if (lim->subexp_to < str_idx)
1813 return 1;
1814
1815 /* If we are within the subexpression, return 0. */
1816 if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
1817 return 0;
1818
1819 /* Else, we are on the boundary: examine the nodes on the epsilon
1820 closure. */
1821 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1822 {
1823 int node = eclosures->elems[node_idx];
1824 switch (dfa->nodes[node].type)
1825 {
1826 case OP_BACK_REF:
1827 {
1828 int bi = search_cur_bkref_entry (mctx, str_idx);
1829 for (; bi < mctx->nbkref_ents; ++bi)
1830 {
1831 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
1832 int dst, cpos;
1833
1834 /* If this backreference goes beyond the point we're
1835 examining, don't go any further. */
1836 if (ent->str_idx > str_idx)
1837 break;
1838
1839 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1840 continue;
1841
1842 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1843 OP_CLOSE_SUBEXP cases below. But, if the
1844 destination node is the same node as the source
1845 node, don't recurse because it would cause an
1846 infinite loop: a regex that exhibits this behavior
1847 is ()\1*\1* */
1848 dst = dfa->edests[node].elems[0];
1849 if (dst == from_node)
1850 {
1851 if (str_idx == lim->subexp_from)
1852 return -1;
1853 else /* if (str_idx == lim->subexp_to) */
1854 return 0;
1855 }
1856
1857 cpos = check_dst_limits_calc_pos (mctx, limit,
1858 dfa->eclosures + dst,
1859 subexp_idx, dst,
1860 str_idx);
1861
1862 if (cpos == -1 && str_idx == lim->subexp_from)
1863 return -1;
1864
1865 if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
1866 return 0;
1867 }
1868 break;
1869 }
1870
1871 case OP_OPEN_SUBEXP:
1872 if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
1873 return -1;
1874 break;
1875
1876 case OP_CLOSE_SUBEXP:
1877 if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
1878 return 0;
1879 break;
1880
1881 default:
1882 break;
1883 }
1884 }
1885
1886 if (str_idx == lim->subexp_to)
1887 return 1;
1888 else
1889 return 0;
1890 }
1891
1892 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1893 which are against limitations from DEST_NODES. */
1894
1895 static reg_errcode_t
1896 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1897 re_dfa_t *dfa;
1898 re_node_set *dest_nodes;
1899 const re_node_set *candidates;
1900 re_node_set *limits;
1901 struct re_backref_cache_entry *bkref_ents;
1902 int str_idx;
1903 {
1904 reg_errcode_t err;
1905 int node_idx, lim_idx;
1906
1907 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1908 {
1909 int subexp_idx;
1910 struct re_backref_cache_entry *ent;
1911 ent = bkref_ents + limits->elems[lim_idx];
1912
1913 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1914 continue; /* This is unrelated limitation. */
1915
1916 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1917 if (ent->subexp_to == str_idx)
1918 {
1919 int ops_node = -1;
1920 int cls_node = -1;
1921 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1922 {
1923 int node = dest_nodes->elems[node_idx];
1924 re_token_type_t type = dfa->nodes[node].type;
1925 if (type == OP_OPEN_SUBEXP
1926 && subexp_idx == dfa->nodes[node].opr.idx)
1927 ops_node = node;
1928 else if (type == OP_CLOSE_SUBEXP
1929 && subexp_idx == dfa->nodes[node].opr.idx)
1930 cls_node = node;
1931 }
1932
1933 /* Check the limitation of the open subexpression. */
1934 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1935 if (ops_node >= 0)
1936 {
1937 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1938 candidates);
1939 if (BE (err != REG_NOERROR, 0))
1940 return err;
1941 }
1942
1943 /* Check the limitation of the close subexpression. */
1944 if (cls_node >= 0)
1945 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1946 {
1947 int node = dest_nodes->elems[node_idx];
1948 if (!re_node_set_contains (dfa->inveclosures + node,
1949 cls_node)
1950 && !re_node_set_contains (dfa->eclosures + node,
1951 cls_node))
1952 {
1953 /* It is against this limitation.
1954 Remove it form the current sifted state. */
1955 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1956 candidates);
1957 if (BE (err != REG_NOERROR, 0))
1958 return err;
1959 --node_idx;
1960 }
1961 }
1962 }
1963 else /* (ent->subexp_to != str_idx) */
1964 {
1965 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1966 {
1967 int node = dest_nodes->elems[node_idx];
1968 re_token_type_t type = dfa->nodes[node].type;
1969 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1970 {
1971 if (subexp_idx != dfa->nodes[node].opr.idx)
1972 continue;
1973 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1974 || (type == OP_OPEN_SUBEXP))
1975 {
1976 /* It is against this limitation.
1977 Remove it form the current sifted state. */
1978 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1979 candidates);
1980 if (BE (err != REG_NOERROR, 0))
1981 return err;
1982 }
1983 }
1984 }
1985 }
1986 }
1987 return REG_NOERROR;
1988 }
1989
1990 static reg_errcode_t
1991 sift_states_bkref (mctx, sctx, str_idx, dest_nodes)
1992 re_match_context_t *mctx;
1993 re_sift_context_t *sctx;
1994 int str_idx;
1995 re_node_set *dest_nodes;
1996 {
1997 re_dfa_t *const dfa = mctx->dfa;
1998 reg_errcode_t err;
1999 int node_idx, node;
2000 re_sift_context_t local_sctx;
2001 const re_node_set *candidates;
2002 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
2003 : &mctx->state_log[str_idx]->nodes);
2004 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2005
2006 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2007 {
2008 int cur_bkref_idx = re_string_cur_idx (&mctx->input);
2009 re_token_type_t type;
2010 node = candidates->elems[node_idx];
2011 type = dfa->nodes[node].type;
2012 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
2013 continue;
2014 /* Avoid infinite loop for the REs like "()\1+". */
2015 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2016 continue;
2017 if (type == OP_BACK_REF)
2018 {
2019 int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2020 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2021 {
2022 int disabled_idx, subexp_len, to_idx, dst_node;
2023 struct re_backref_cache_entry *entry;
2024 entry = mctx->bkref_ents + enabled_idx;
2025 if (entry->str_idx > str_idx)
2026 break;
2027 if (entry->node != node)
2028 continue;
2029 subexp_len = entry->subexp_to - entry->subexp_from;
2030 to_idx = str_idx + subexp_len;
2031 dst_node = (subexp_len ? dfa->nexts[node]
2032 : dfa->edests[node].elems[0]);
2033
2034 if (to_idx > sctx->last_str_idx
2035 || sctx->sifted_states[to_idx] == NULL
2036 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
2037 dst_node)
2038 || check_dst_limits (mctx, &sctx->limits, node,
2039 str_idx, dst_node, to_idx))
2040 continue;
2041 {
2042 re_dfastate_t *cur_state;
2043 entry->flag = 0;
2044 for (disabled_idx = enabled_idx + 1;
2045 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
2046 {
2047 struct re_backref_cache_entry *entry2;
2048 entry2 = mctx->bkref_ents + disabled_idx;
2049 if (entry2->str_idx > str_idx)
2050 break;
2051 entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
2052 }
2053
2054 if (local_sctx.sifted_states == NULL)
2055 {
2056 local_sctx = *sctx;
2057 err = re_node_set_init_copy (&local_sctx.limits,
2058 &sctx->limits);
2059 if (BE (err != REG_NOERROR, 0))
2060 goto free_return;
2061 }
2062 local_sctx.last_node = node;
2063 local_sctx.last_str_idx = str_idx;
2064 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2065 if (BE (err < 0, 0))
2066 {
2067 err = REG_ESPACE;
2068 goto free_return;
2069 }
2070 cur_state = local_sctx.sifted_states[str_idx];
2071 err = sift_states_backward (mctx, &local_sctx);
2072 if (BE (err != REG_NOERROR, 0))
2073 goto free_return;
2074 if (sctx->limited_states != NULL)
2075 {
2076 err = merge_state_array (dfa, sctx->limited_states,
2077 local_sctx.sifted_states,
2078 str_idx + 1);
2079 if (BE (err != REG_NOERROR, 0))
2080 goto free_return;
2081 }
2082 local_sctx.sifted_states[str_idx] = cur_state;
2083 re_node_set_remove (&local_sctx.limits, enabled_idx);
2084 /* We must not use the variable entry here, since
2085 mctx->bkref_ents might be realloced. */
2086 mctx->bkref_ents[enabled_idx].flag = 1;
2087 }
2088 }
2089 enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2090 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2091 {
2092 struct re_backref_cache_entry *entry;
2093 entry = mctx->bkref_ents + enabled_idx;
2094 if (entry->str_idx > str_idx)
2095 break;
2096 if (entry->node == node)
2097 entry->flag = 0;
2098 }
2099 }
2100 }
2101 err = REG_NOERROR;
2102 free_return:
2103 if (local_sctx.sifted_states != NULL)
2104 {
2105 re_node_set_free (&local_sctx.limits);
2106 }
2107
2108 return err;
2109 }
2110
2111
2112 #ifdef RE_ENABLE_I18N
2113 static int
2114 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2115 const re_match_context_t *mctx;
2116 re_sift_context_t *sctx;
2117 int node_idx, str_idx, max_str_idx;
2118 {
2119 re_dfa_t *const dfa = mctx->dfa;
2120 int naccepted;
2121 /* Check the node can accept `multi byte'. */
2122 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2123 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2124 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2125 dfa->nexts[node_idx]))
2126 /* The node can't accept the `multi byte', or the
2127 destination was already thrown away, then the node
2128 could't accept the current input `multi byte'. */
2129 naccepted = 0;
2130 /* Otherwise, it is sure that the node could accept
2131 `naccepted' bytes input. */
2132 return naccepted;
2133 }
2134 #endif /* RE_ENABLE_I18N */
2135
2136 \f
2137 /* Functions for state transition. */
2138
2139 /* Return the next state to which the current state STATE will transit by
2140 accepting the current input byte, and update STATE_LOG if necessary.
2141 If STATE can accept a multibyte char/collating element/back reference
2142 update the destination of STATE_LOG. */
2143
2144 static re_dfastate_t *
2145 transit_state (err, mctx, state)
2146 reg_errcode_t *err;
2147 re_match_context_t *mctx;
2148 re_dfastate_t *state;
2149 {
2150 re_dfa_t *const dfa = mctx->dfa;
2151 re_dfastate_t **trtable;
2152 unsigned char ch;
2153
2154 if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
2155 || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
2156 && mctx->input.valid_len < mctx->input.len))
2157 {
2158 *err = extend_buffers (mctx);
2159 if (BE (*err != REG_NOERROR, 0))
2160 return NULL;
2161 }
2162
2163 #ifdef RE_ENABLE_I18N
2164 /* If the current state can accept multibyte. */
2165 if (state->accept_mb)
2166 {
2167 *err = transit_state_mb (mctx, state);
2168 if (BE (*err != REG_NOERROR, 0))
2169 return NULL;
2170 }
2171 #endif /* RE_ENABLE_I18N */
2172
2173 /* Then decide the next state with the single byte. */
2174 if (1)
2175 {
2176 /* Use transition table */
2177 ch = re_string_fetch_byte (&mctx->input);
2178 trtable = state->trtable;
2179 if (trtable == NULL)
2180 {
2181 trtable = build_trtable (dfa, state);
2182 if (trtable == NULL)
2183 {
2184 *err = REG_ESPACE;
2185 return NULL;
2186 }
2187 }
2188 if (BE (state->word_trtable, 0))
2189 {
2190 unsigned int context;
2191 context
2192 = re_string_context_at (&mctx->input,
2193 re_string_cur_idx (&mctx->input) - 1,
2194 mctx->eflags);
2195 if (IS_WORD_CONTEXT (context))
2196 return trtable[ch + SBC_MAX];
2197 else
2198 return trtable[ch];
2199 }
2200 else
2201 return trtable[ch];
2202 }
2203 #if 0
2204 else
2205 /* don't use transition table */
2206 return transit_state_sb (err, mctx, state);
2207 #endif
2208 }
2209
2210 /* Update the state_log if we need */
2211 re_dfastate_t *
2212 merge_state_with_log (err, mctx, next_state)
2213 reg_errcode_t *err;
2214 re_match_context_t *mctx;
2215 re_dfastate_t *next_state;
2216 {
2217 re_dfa_t *const dfa = mctx->dfa;
2218 int cur_idx = re_string_cur_idx (&mctx->input);
2219
2220 if (cur_idx > mctx->state_log_top)
2221 {
2222 mctx->state_log[cur_idx] = next_state;
2223 mctx->state_log_top = cur_idx;
2224 }
2225 else if (mctx->state_log[cur_idx] == 0)
2226 {
2227 mctx->state_log[cur_idx] = next_state;
2228 }
2229 else
2230 {
2231 re_dfastate_t *pstate;
2232 unsigned int context;
2233 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2234 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2235 the destination of a multibyte char/collating element/
2236 back reference. Then the next state is the union set of
2237 these destinations and the results of the transition table. */
2238 pstate = mctx->state_log[cur_idx];
2239 log_nodes = pstate->entrance_nodes;
2240 if (next_state != NULL)
2241 {
2242 table_nodes = next_state->entrance_nodes;
2243 *err = re_node_set_init_union (&next_nodes, table_nodes,
2244 log_nodes);
2245 if (BE (*err != REG_NOERROR, 0))
2246 return NULL;
2247 }
2248 else
2249 next_nodes = *log_nodes;
2250 /* Note: We already add the nodes of the initial state,
2251 then we don't need to add them here. */
2252
2253 context = re_string_context_at (&mctx->input,
2254 re_string_cur_idx (&mctx->input) - 1,
2255 mctx->eflags);
2256 next_state = mctx->state_log[cur_idx]
2257 = re_acquire_state_context (err, dfa, &next_nodes, context);
2258 /* We don't need to check errors here, since the return value of
2259 this function is next_state and ERR is already set. */
2260
2261 if (table_nodes != NULL)
2262 re_node_set_free (&next_nodes);
2263 }
2264
2265 if (BE (dfa->nbackref, 0) && next_state != NULL)
2266 {
2267 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2268 later. We must check them here, since the back references in the
2269 next state might use them. */
2270 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2271 cur_idx);
2272 if (BE (*err != REG_NOERROR, 0))
2273 return NULL;
2274
2275 /* If the next state has back references. */
2276 if (next_state->has_backref)
2277 {
2278 *err = transit_state_bkref (mctx, &next_state->nodes);
2279 if (BE (*err != REG_NOERROR, 0))
2280 return NULL;
2281 next_state = mctx->state_log[cur_idx];
2282 }
2283 }
2284
2285 return next_state;
2286 }
2287
2288 /* Skip bytes in the input that correspond to part of a
2289 multi-byte match, then look in the log for a state
2290 from which to restart matching. */
2291 re_dfastate_t *
2292 find_recover_state (err, mctx)
2293 reg_errcode_t *err;
2294 re_match_context_t *mctx;
2295 {
2296 re_dfastate_t *cur_state = NULL;
2297 do
2298 {
2299 int max = mctx->state_log_top;
2300 int cur_str_idx = re_string_cur_idx (&mctx->input);
2301
2302 do
2303 {
2304 if (++cur_str_idx > max)
2305 return NULL;
2306 re_string_skip_bytes (&mctx->input, 1);
2307 }
2308 while (mctx->state_log[cur_str_idx] == NULL);
2309
2310 cur_state = merge_state_with_log (err, mctx, NULL);
2311 }
2312 while (err == REG_NOERROR && cur_state == NULL);
2313 return cur_state;
2314 }
2315
2316 /* Helper functions for transit_state. */
2317
2318 /* From the node set CUR_NODES, pick up the nodes whose types are
2319 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2320 expression. And register them to use them later for evaluating the
2321 correspoding back references. */
2322
2323 static reg_errcode_t
2324 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2325 re_match_context_t *mctx;
2326 re_node_set *cur_nodes;
2327 int str_idx;
2328 {
2329 re_dfa_t *const dfa = mctx->dfa;
2330 int node_idx;
2331 reg_errcode_t err;
2332
2333 /* TODO: This isn't efficient.
2334 Because there might be more than one nodes whose types are
2335 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2336 nodes.
2337 E.g. RE: (a){2} */
2338 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2339 {
2340 int node = cur_nodes->elems[node_idx];
2341 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2342 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2343 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2344 {
2345 err = match_ctx_add_subtop (mctx, node, str_idx);
2346 if (BE (err != REG_NOERROR, 0))
2347 return err;
2348 }
2349 }
2350 return REG_NOERROR;
2351 }
2352
2353 #if 0
2354 /* Return the next state to which the current state STATE will transit by
2355 accepting the current input byte. */
2356
2357 static re_dfastate_t *
2358 transit_state_sb (err, mctx, state)
2359 reg_errcode_t *err;
2360 re_match_context_t *mctx;
2361 re_dfastate_t *state;
2362 {
2363 re_dfa_t *const dfa = mctx->dfa;
2364 re_node_set next_nodes;
2365 re_dfastate_t *next_state;
2366 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2367 unsigned int context;
2368
2369 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2370 if (BE (*err != REG_NOERROR, 0))
2371 return NULL;
2372 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2373 {
2374 int cur_node = state->nodes.elems[node_cnt];
2375 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2376 {
2377 *err = re_node_set_merge (&next_nodes,
2378 dfa->eclosures + dfa->nexts[cur_node]);
2379 if (BE (*err != REG_NOERROR, 0))
2380 {
2381 re_node_set_free (&next_nodes);
2382 return NULL;
2383 }
2384 }
2385 }
2386 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2387 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2388 /* We don't need to check errors here, since the return value of
2389 this function is next_state and ERR is already set. */
2390
2391 re_node_set_free (&next_nodes);
2392 re_string_skip_bytes (&mctx->input, 1);
2393 return next_state;
2394 }
2395 #endif
2396
2397 #ifdef RE_ENABLE_I18N
2398 static reg_errcode_t
2399 transit_state_mb (mctx, pstate)
2400 re_match_context_t *mctx;
2401 re_dfastate_t *pstate;
2402 {
2403 re_dfa_t *const dfa = mctx->dfa;
2404 reg_errcode_t err;
2405 int i;
2406
2407 for (i = 0; i < pstate->nodes.nelem; ++i)
2408 {
2409 re_node_set dest_nodes, *new_nodes;
2410 int cur_node_idx = pstate->nodes.elems[i];
2411 int naccepted = 0, dest_idx;
2412 unsigned int context;
2413 re_dfastate_t *dest_state;
2414
2415 if (dfa->nodes[cur_node_idx].constraint)
2416 {
2417 context = re_string_context_at (&mctx->input,
2418 re_string_cur_idx (&mctx->input),
2419 mctx->eflags);
2420 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2421 context))
2422 continue;
2423 }
2424
2425 /* How many bytes the node can accept? */
2426 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2427 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2428 re_string_cur_idx (&mctx->input));
2429 if (naccepted == 0)
2430 continue;
2431
2432 /* The node can accepts `naccepted' bytes. */
2433 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2434 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2435 : mctx->max_mb_elem_len);
2436 err = clean_state_log_if_needed (mctx, dest_idx);
2437 if (BE (err != REG_NOERROR, 0))
2438 return err;
2439 #ifdef DEBUG
2440 assert (dfa->nexts[cur_node_idx] != -1);
2441 #endif
2442 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2443 then we use pstate->nodes.elems[i] instead. */
2444 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2445
2446 dest_state = mctx->state_log[dest_idx];
2447 if (dest_state == NULL)
2448 dest_nodes = *new_nodes;
2449 else
2450 {
2451 err = re_node_set_init_union (&dest_nodes,
2452 dest_state->entrance_nodes, new_nodes);
2453 if (BE (err != REG_NOERROR, 0))
2454 return err;
2455 }
2456 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2457 mctx->state_log[dest_idx]
2458 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2459 if (dest_state != NULL)
2460 re_node_set_free (&dest_nodes);
2461 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2462 return err;
2463 }
2464 return REG_NOERROR;
2465 }
2466 #endif /* RE_ENABLE_I18N */
2467
2468 static reg_errcode_t
2469 transit_state_bkref (mctx, nodes)
2470 re_match_context_t *mctx;
2471 const re_node_set *nodes;
2472 {
2473 re_dfa_t *const dfa = mctx->dfa;
2474 reg_errcode_t err;
2475 int i;
2476 int cur_str_idx = re_string_cur_idx (&mctx->input);
2477
2478 for (i = 0; i < nodes->nelem; ++i)
2479 {
2480 int dest_str_idx, prev_nelem, bkc_idx;
2481 int node_idx = nodes->elems[i];
2482 unsigned int context;
2483 const re_token_t *node = dfa->nodes + node_idx;
2484 re_node_set *new_dest_nodes;
2485
2486 /* Check whether `node' is a backreference or not. */
2487 if (node->type != OP_BACK_REF)
2488 continue;
2489
2490 if (node->constraint)
2491 {
2492 context = re_string_context_at (&mctx->input, cur_str_idx,
2493 mctx->eflags);
2494 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2495 continue;
2496 }
2497
2498 /* `node' is a backreference.
2499 Check the substring which the substring matched. */
2500 bkc_idx = mctx->nbkref_ents;
2501 err = get_subexp (mctx, node_idx, cur_str_idx);
2502 if (BE (err != REG_NOERROR, 0))
2503 goto free_return;
2504
2505 /* And add the epsilon closures (which is `new_dest_nodes') of
2506 the backreference to appropriate state_log. */
2507 #ifdef DEBUG
2508 assert (dfa->nexts[node_idx] != -1);
2509 #endif
2510 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2511 {
2512 int subexp_len;
2513 re_dfastate_t *dest_state;
2514 struct re_backref_cache_entry *bkref_ent;
2515 bkref_ent = mctx->bkref_ents + bkc_idx;
2516 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2517 continue;
2518 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2519 new_dest_nodes = (subexp_len == 0
2520 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2521 : dfa->eclosures + dfa->nexts[node_idx]);
2522 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2523 - bkref_ent->subexp_from);
2524 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2525 mctx->eflags);
2526 dest_state = mctx->state_log[dest_str_idx];
2527 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2528 : mctx->state_log[cur_str_idx]->nodes.nelem);
2529 /* Add `new_dest_node' to state_log. */
2530 if (dest_state == NULL)
2531 {
2532 mctx->state_log[dest_str_idx]
2533 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2534 context);
2535 if (BE (mctx->state_log[dest_str_idx] == NULL
2536 && err != REG_NOERROR, 0))
2537 goto free_return;
2538 }
2539 else
2540 {
2541 re_node_set dest_nodes;
2542 err = re_node_set_init_union (&dest_nodes,
2543 dest_state->entrance_nodes,
2544 new_dest_nodes);
2545 if (BE (err != REG_NOERROR, 0))
2546 {
2547 re_node_set_free (&dest_nodes);
2548 goto free_return;
2549 }
2550 mctx->state_log[dest_str_idx]
2551 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2552 re_node_set_free (&dest_nodes);
2553 if (BE (mctx->state_log[dest_str_idx] == NULL
2554 && err != REG_NOERROR, 0))
2555 goto free_return;
2556 }
2557 /* We need to check recursively if the backreference can epsilon
2558 transit. */
2559 if (subexp_len == 0
2560 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2561 {
2562 err = check_subexp_matching_top (mctx, new_dest_nodes,
2563 cur_str_idx);
2564 if (BE (err != REG_NOERROR, 0))
2565 goto free_return;
2566 err = transit_state_bkref (mctx, new_dest_nodes);
2567 if (BE (err != REG_NOERROR, 0))
2568 goto free_return;
2569 }
2570 }
2571 }
2572 err = REG_NOERROR;
2573 free_return:
2574 return err;
2575 }
2576
2577 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2578 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2579 Note that we might collect inappropriate candidates here.
2580 However, the cost of checking them strictly here is too high, then we
2581 delay these checking for prune_impossible_nodes(). */
2582
2583 static reg_errcode_t
2584 get_subexp (mctx, bkref_node, bkref_str_idx)
2585 re_match_context_t *mctx;
2586 int bkref_node, bkref_str_idx;
2587 {
2588 re_dfa_t *const dfa = mctx->dfa;
2589 int subexp_num, sub_top_idx;
2590 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2591 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2592 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2593 for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2594 {
2595 const struct re_backref_cache_entry *entry
2596 = &mctx->bkref_ents[cache_idx];
2597 if (entry->str_idx > bkref_str_idx)
2598 break;
2599 if (entry->node == bkref_node)
2600 return REG_NOERROR; /* We already checked it. */
2601 }
2602 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2603
2604 /* For each sub expression */
2605 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2606 {
2607 reg_errcode_t err;
2608 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2609 re_sub_match_last_t *sub_last;
2610 int sub_last_idx, sl_str, bkref_str_off;
2611
2612 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2613 continue; /* It isn't related. */
2614
2615 sl_str = sub_top->str_idx;
2616 bkref_str_off = bkref_str_idx;
2617 /* At first, check the last node of sub expressions we already
2618 evaluated. */
2619 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2620 {
2621 int sl_str_diff;
2622 sub_last = sub_top->lasts[sub_last_idx];
2623 sl_str_diff = sub_last->str_idx - sl_str;
2624 /* The matched string by the sub expression match with the substring
2625 at the back reference? */
2626 if (sl_str_diff > 0)
2627 {
2628 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2629 {
2630 /* Not enough chars for a successful match. */
2631 if (bkref_str_off + sl_str_diff > mctx->input.len)
2632 break;
2633
2634 err = clean_state_log_if_needed (mctx,
2635 bkref_str_off
2636 + sl_str_diff);
2637 if (BE (err != REG_NOERROR, 0))
2638 return err;
2639 buf = (const char *) re_string_get_buffer (&mctx->input);
2640 }
2641 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2642 break; /* We don't need to search this sub expression any more. */
2643 }
2644 bkref_str_off += sl_str_diff;
2645 sl_str += sl_str_diff;
2646 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2647 bkref_str_idx);
2648
2649 /* Reload buf, since the preceding call might have reallocated
2650 the buffer. */
2651 buf = (const char *) re_string_get_buffer (&mctx->input);
2652
2653 if (err == REG_NOMATCH)
2654 continue;
2655 if (BE (err != REG_NOERROR, 0))
2656 return err;
2657 }
2658
2659 if (sub_last_idx < sub_top->nlasts)
2660 continue;
2661 if (sub_last_idx > 0)
2662 ++sl_str;
2663 /* Then, search for the other last nodes of the sub expression. */
2664 for (; sl_str <= bkref_str_idx; ++sl_str)
2665 {
2666 int cls_node, sl_str_off;
2667 const re_node_set *nodes;
2668 sl_str_off = sl_str - sub_top->str_idx;
2669 /* The matched string by the sub expression match with the substring
2670 at the back reference? */
2671 if (sl_str_off > 0)
2672 {
2673 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2674 {
2675 /* If we are at the end of the input, we cannot match. */
2676 if (bkref_str_off >= mctx->input.len)
2677 break;
2678
2679 err = extend_buffers (mctx);
2680 if (BE (err != REG_NOERROR, 0))
2681 return err;
2682
2683 buf = (const char *) re_string_get_buffer (&mctx->input);
2684 }
2685 if (buf [bkref_str_off++] != buf[sl_str - 1])
2686 break; /* We don't need to search this sub expression
2687 any more. */
2688 }
2689 if (mctx->state_log[sl_str] == NULL)
2690 continue;
2691 /* Does this state have a ')' of the sub expression? */
2692 nodes = &mctx->state_log[sl_str]->nodes;
2693 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2694 if (cls_node == -1)
2695 continue; /* No. */
2696 if (sub_top->path == NULL)
2697 {
2698 sub_top->path = calloc (sizeof (state_array_t),
2699 sl_str - sub_top->str_idx + 1);
2700 if (sub_top->path == NULL)
2701 return REG_ESPACE;
2702 }
2703 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2704 in the current context? */
2705 err = check_arrival (mctx, sub_top->path, sub_top->node,
2706 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2707 if (err == REG_NOMATCH)
2708 continue;
2709 if (BE (err != REG_NOERROR, 0))
2710 return err;
2711 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2712 if (BE (sub_last == NULL, 0))
2713 return REG_ESPACE;
2714 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2715 bkref_str_idx);
2716 if (err == REG_NOMATCH)
2717 continue;
2718 }
2719 }
2720 return REG_NOERROR;
2721 }
2722
2723 /* Helper functions for get_subexp(). */
2724
2725 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2726 If it can arrive, register the sub expression expressed with SUB_TOP
2727 and SUB_LAST. */
2728
2729 static reg_errcode_t
2730 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2731 re_match_context_t *mctx;
2732 const re_sub_match_top_t *sub_top;
2733 re_sub_match_last_t *sub_last;
2734 int bkref_node, bkref_str;
2735 {
2736 reg_errcode_t err;
2737 int to_idx;
2738 /* Can the subexpression arrive the back reference? */
2739 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2740 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2741 if (err != REG_NOERROR)
2742 return err;
2743 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2744 sub_last->str_idx);
2745 if (BE (err != REG_NOERROR, 0))
2746 return err;
2747 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2748 return clean_state_log_if_needed (mctx, to_idx);
2749 }
2750
2751 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2752 Search '(' if FL_OPEN, or search ')' otherwise.
2753 TODO: This function isn't efficient...
2754 Because there might be more than one nodes whose types are
2755 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2756 nodes.
2757 E.g. RE: (a){2} */
2758
2759 static int
2760 find_subexp_node (dfa, nodes, subexp_idx, type)
2761 const re_dfa_t *dfa;
2762 const re_node_set *nodes;
2763 int subexp_idx, type;
2764 {
2765 int cls_idx;
2766 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2767 {
2768 int cls_node = nodes->elems[cls_idx];
2769 const re_token_t *node = dfa->nodes + cls_node;
2770 if (node->type == type
2771 && node->opr.idx == subexp_idx)
2772 return cls_node;
2773 }
2774 return -1;
2775 }
2776
2777 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2778 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2779 heavily reused.
2780 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2781
2782 static reg_errcode_t
2783 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2784 type)
2785 re_match_context_t *mctx;
2786 state_array_t *path;
2787 int top_node, top_str, last_node, last_str, type;
2788 {
2789 re_dfa_t *const dfa = mctx->dfa;
2790 reg_errcode_t err;
2791 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2792 re_dfastate_t *cur_state = NULL;
2793 re_node_set *cur_nodes, next_nodes;
2794 re_dfastate_t **backup_state_log;
2795 unsigned int context;
2796
2797 subexp_num = dfa->nodes[top_node].opr.idx;
2798 /* Extend the buffer if we need. */
2799 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2800 {
2801 re_dfastate_t **new_array;
2802 int old_alloc = path->alloc;
2803 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2804 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2805 if (new_array == NULL)
2806 {
2807 path->alloc = old_alloc;
2808 return REG_ESPACE;
2809 }
2810 path->array = new_array;
2811 memset (new_array + old_alloc, '\0',
2812 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2813 }
2814
2815 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2816
2817 /* Temporary modify MCTX. */
2818 backup_state_log = mctx->state_log;
2819 backup_cur_idx = mctx->input.cur_idx;
2820 mctx->state_log = path->array;
2821 mctx->input.cur_idx = str_idx;
2822
2823 /* Setup initial node set. */
2824 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2825 if (str_idx == top_str)
2826 {
2827 err = re_node_set_init_1 (&next_nodes, top_node);
2828 if (BE (err != REG_NOERROR, 0))
2829 return err;
2830 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2831 if (BE (err != REG_NOERROR, 0))
2832 {
2833 re_node_set_free (&next_nodes);
2834 return err;
2835 }
2836 }
2837 else
2838 {
2839 cur_state = mctx->state_log[str_idx];
2840 if (cur_state && cur_state->has_backref)
2841 {
2842 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2843 if (BE ( err != REG_NOERROR, 0))
2844 return err;
2845 }
2846 else
2847 re_node_set_init_empty (&next_nodes);
2848 }
2849 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2850 {
2851 if (next_nodes.nelem)
2852 {
2853 err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
2854 subexp_num, type);
2855 if (BE ( err != REG_NOERROR, 0))
2856 {
2857 re_node_set_free (&next_nodes);
2858 return err;
2859 }
2860 }
2861 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2862 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2863 {
2864 re_node_set_free (&next_nodes);
2865 return err;
2866 }
2867 mctx->state_log[str_idx] = cur_state;
2868 }
2869
2870 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2871 {
2872 re_node_set_empty (&next_nodes);
2873 if (mctx->state_log[str_idx + 1])
2874 {
2875 err = re_node_set_merge (&next_nodes,
2876 &mctx->state_log[str_idx + 1]->nodes);
2877 if (BE (err != REG_NOERROR, 0))
2878 {
2879 re_node_set_free (&next_nodes);
2880 return err;
2881 }
2882 }
2883 if (cur_state)
2884 {
2885 err = check_arrival_add_next_nodes (mctx, str_idx,
2886 &cur_state->nodes, &next_nodes);
2887 if (BE (err != REG_NOERROR, 0))
2888 {
2889 re_node_set_free (&next_nodes);
2890 return err;
2891 }
2892 }
2893 ++str_idx;
2894 if (next_nodes.nelem)
2895 {
2896 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2897 if (BE (err != REG_NOERROR, 0))
2898 {
2899 re_node_set_free (&next_nodes);
2900 return err;
2901 }
2902 err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
2903 subexp_num, type);
2904 if (BE ( err != REG_NOERROR, 0))
2905 {
2906 re_node_set_free (&next_nodes);
2907 return err;
2908 }
2909 }
2910 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2911 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2912 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2913 {
2914 re_node_set_free (&next_nodes);
2915 return err;
2916 }
2917 mctx->state_log[str_idx] = cur_state;
2918 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2919 }
2920 re_node_set_free (&next_nodes);
2921 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2922 : &mctx->state_log[last_str]->nodes);
2923 path->next_idx = str_idx;
2924
2925 /* Fix MCTX. */
2926 mctx->state_log = backup_state_log;
2927 mctx->input.cur_idx = backup_cur_idx;
2928
2929 /* Then check the current node set has the node LAST_NODE. */
2930 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2931 return REG_NOERROR;
2932
2933 return REG_NOMATCH;
2934 }
2935
2936 /* Helper functions for check_arrival. */
2937
2938 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2939 to NEXT_NODES.
2940 TODO: This function is similar to the functions transit_state*(),
2941 however this function has many additional works.
2942 Can't we unify them? */
2943
2944 static reg_errcode_t
2945 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
2946 re_match_context_t *mctx;
2947 int str_idx;
2948 re_node_set *cur_nodes, *next_nodes;
2949 {
2950 re_dfa_t *const dfa = mctx->dfa;
2951 int cur_idx;
2952 reg_errcode_t err;
2953 re_node_set union_set;
2954 re_node_set_init_empty (&union_set);
2955 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2956 {
2957 int naccepted = 0;
2958 int cur_node = cur_nodes->elems[cur_idx];
2959 re_token_type_t type = dfa->nodes[cur_node].type;
2960 if (IS_EPSILON_NODE (type))
2961 continue;
2962 #ifdef RE_ENABLE_I18N
2963 /* If the node may accept `multi byte'. */
2964 if (ACCEPT_MB_NODE (type))
2965 {
2966 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2967 str_idx);
2968 if (naccepted > 1)
2969 {
2970 re_dfastate_t *dest_state;
2971 int next_node = dfa->nexts[cur_node];
2972 int next_idx = str_idx + naccepted;
2973 dest_state = mctx->state_log[next_idx];
2974 re_node_set_empty (&union_set);
2975 if (dest_state)
2976 {
2977 err = re_node_set_merge (&union_set, &dest_state->nodes);
2978 if (BE (err != REG_NOERROR, 0))
2979 {
2980 re_node_set_free (&union_set);
2981 return err;
2982 }
2983 }
2984 err = re_node_set_insert (&union_set, next_node);
2985 if (BE (err < 0, 0))
2986 {
2987 re_node_set_free (&union_set);
2988 return REG_ESPACE;
2989 }
2990 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2991 &union_set);
2992 if (BE (mctx->state_log[next_idx] == NULL
2993 && err != REG_NOERROR, 0))
2994 {
2995 re_node_set_free (&union_set);
2996 return err;
2997 }
2998 }
2999 }
3000 #endif /* RE_ENABLE_I18N */
3001 if (naccepted
3002 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3003 {
3004 err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3005 if (BE (err < 0, 0))
3006 {
3007 re_node_set_free (&union_set);
3008 return REG_ESPACE;
3009 }
3010 }
3011 }
3012 re_node_set_free (&union_set);
3013 return REG_NOERROR;
3014 }
3015
3016 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3017 CUR_NODES, however exclude the nodes which are:
3018 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3019 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3020 */
3021
3022 static reg_errcode_t
3023 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3024 re_dfa_t *dfa;
3025 re_node_set *cur_nodes;
3026 int ex_subexp, type;
3027 {
3028 reg_errcode_t err;
3029 int idx, outside_node;
3030 re_node_set new_nodes;
3031 #ifdef DEBUG
3032 assert (cur_nodes->nelem);
3033 #endif
3034 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3035 if (BE (err != REG_NOERROR, 0))
3036 return err;
3037 /* Create a new node set NEW_NODES with the nodes which are epsilon
3038 closures of the node in CUR_NODES. */
3039
3040 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3041 {
3042 int cur_node = cur_nodes->elems[idx];
3043 re_node_set *eclosure = dfa->eclosures + cur_node;
3044 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3045 if (outside_node == -1)
3046 {
3047 /* There are no problematic nodes, just merge them. */
3048 err = re_node_set_merge (&new_nodes, eclosure);
3049 if (BE (err != REG_NOERROR, 0))
3050 {
3051 re_node_set_free (&new_nodes);
3052 return err;
3053 }
3054 }
3055 else
3056 {
3057 /* There are problematic nodes, re-calculate incrementally. */
3058 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3059 ex_subexp, type);
3060 if (BE (err != REG_NOERROR, 0))
3061 {
3062 re_node_set_free (&new_nodes);
3063 return err;
3064 }
3065 }
3066 }
3067 re_node_set_free (cur_nodes);
3068 *cur_nodes = new_nodes;
3069 return REG_NOERROR;
3070 }
3071
3072 /* Helper function for check_arrival_expand_ecl.
3073 Check incrementally the epsilon closure of TARGET, and if it isn't
3074 problematic append it to DST_NODES. */
3075
3076 static reg_errcode_t
3077 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3078 re_dfa_t *dfa;
3079 int target, ex_subexp, type;
3080 re_node_set *dst_nodes;
3081 {
3082 int cur_node;
3083 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3084 {
3085 int err;
3086
3087 if (dfa->nodes[cur_node].type == type
3088 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3089 {
3090 if (type == OP_CLOSE_SUBEXP)
3091 {
3092 err = re_node_set_insert (dst_nodes, cur_node);
3093 if (BE (err == -1, 0))
3094 return REG_ESPACE;
3095 }
3096 break;
3097 }
3098 err = re_node_set_insert (dst_nodes, cur_node);
3099 if (BE (err == -1, 0))
3100 return REG_ESPACE;
3101 if (dfa->edests[cur_node].nelem == 0)
3102 break;
3103 if (dfa->edests[cur_node].nelem == 2)
3104 {
3105 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3106 dfa->edests[cur_node].elems[1],
3107 ex_subexp, type);
3108 if (BE (err != REG_NOERROR, 0))
3109 return err;
3110 }
3111 cur_node = dfa->edests[cur_node].elems[0];
3112 }
3113 return REG_NOERROR;
3114 }
3115
3116
3117 /* For all the back references in the current state, calculate the
3118 destination of the back references by the appropriate entry
3119 in MCTX->BKREF_ENTS. */
3120
3121 static reg_errcode_t
3122 expand_bkref_cache (mctx, cur_nodes, cur_str, last_str, subexp_num,
3123 type)
3124 re_match_context_t *mctx;
3125 int cur_str, last_str, subexp_num, type;
3126 re_node_set *cur_nodes;
3127 {
3128 re_dfa_t *const dfa = mctx->dfa;
3129 reg_errcode_t err;
3130 int cache_idx, cache_idx_start;
3131 /* The current state. */
3132
3133 cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3134 for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3135 {
3136 int to_idx, next_node;
3137 struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3138 if (ent->str_idx > cur_str)
3139 break;
3140 /* Is this entry ENT is appropriate? */
3141 if (!re_node_set_contains (cur_nodes, ent->node))
3142 continue; /* No. */
3143
3144 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3145 /* Calculate the destination of the back reference, and append it
3146 to MCTX->STATE_LOG. */
3147 if (to_idx == cur_str)
3148 {
3149 /* The backreference did epsilon transit, we must re-check all the
3150 node in the current state. */
3151 re_node_set new_dests;
3152 reg_errcode_t err2, err3;
3153 next_node = dfa->edests[ent->node].elems[0];
3154 if (re_node_set_contains (cur_nodes, next_node))
3155 continue;
3156 err = re_node_set_init_1 (&new_dests, next_node);
3157 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3158 err3 = re_node_set_merge (cur_nodes, &new_dests);
3159 re_node_set_free (&new_dests);
3160 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3161 || err3 != REG_NOERROR, 0))
3162 {
3163 err = (err != REG_NOERROR ? err
3164 : (err2 != REG_NOERROR ? err2 : err3));
3165 return err;
3166 }
3167 /* TODO: It is still inefficient... */
3168 cache_idx = cache_idx_start - 1;
3169 continue;
3170 }
3171 else
3172 {
3173 re_node_set union_set;
3174 next_node = dfa->nexts[ent->node];
3175 if (mctx->state_log[to_idx])
3176 {
3177 int ret;
3178 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3179 next_node))
3180 continue;
3181 err = re_node_set_init_copy (&union_set,
3182 &mctx->state_log[to_idx]->nodes);
3183 ret = re_node_set_insert (&union_set, next_node);
3184 if (BE (err != REG_NOERROR || ret < 0, 0))
3185 {
3186 re_node_set_free (&union_set);
3187 err = err != REG_NOERROR ? err : REG_ESPACE;
3188 return err;
3189 }
3190 }
3191 else
3192 {
3193 err = re_node_set_init_1 (&union_set, next_node);
3194 if (BE (err != REG_NOERROR, 0))
3195 return err;
3196 }
3197 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3198 re_node_set_free (&union_set);
3199 if (BE (mctx->state_log[to_idx] == NULL
3200 && err != REG_NOERROR, 0))
3201 return err;
3202 }
3203 }
3204 return REG_NOERROR;
3205 }
3206
3207 /* Build transition table for the state.
3208 Return the new table if succeeded, otherwise return NULL. */
3209
3210 static re_dfastate_t **
3211 build_trtable (dfa, state)
3212 re_dfa_t *dfa;
3213 re_dfastate_t *state;
3214 {
3215 reg_errcode_t err;
3216 int i, j, ch;
3217 unsigned int elem, mask;
3218 int dests_node_malloced = 0, dest_states_malloced = 0;
3219 int ndests; /* Number of the destination states from `state'. */
3220 re_dfastate_t **trtable;
3221 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3222 re_node_set follows, *dests_node;
3223 bitset *dests_ch;
3224 bitset acceptable;
3225
3226 /* We build DFA states which corresponds to the destination nodes
3227 from `state'. `dests_node[i]' represents the nodes which i-th
3228 destination state contains, and `dests_ch[i]' represents the
3229 characters which i-th destination state accepts. */
3230 #ifdef _LIBC
3231 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3232 dests_node = (re_node_set *)
3233 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3234 else
3235 #endif
3236 {
3237 dests_node = (re_node_set *)
3238 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3239 if (BE (dests_node == NULL, 0))
3240 return NULL;
3241 dests_node_malloced = 1;
3242 }
3243 dests_ch = (bitset *) (dests_node + SBC_MAX);
3244
3245 /* Initialize transiton table. */
3246 state->word_trtable = 0;
3247
3248 /* At first, group all nodes belonging to `state' into several
3249 destinations. */
3250 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3251 if (BE (ndests <= 0, 0))
3252 {
3253 if (dests_node_malloced)
3254 free (dests_node);
3255 /* Return NULL in case of an error, trtable otherwise. */
3256 if (ndests == 0)
3257 {
3258 state->trtable = (re_dfastate_t **)
3259 calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3260 return state->trtable;
3261 }
3262 return NULL;
3263 }
3264
3265 err = re_node_set_alloc (&follows, ndests + 1);
3266 if (BE (err != REG_NOERROR, 0))
3267 goto out_free;
3268
3269 #ifdef _LIBC
3270 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3271 + ndests * 3 * sizeof (re_dfastate_t *)))
3272 dest_states = (re_dfastate_t **)
3273 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3274 else
3275 #endif
3276 {
3277 dest_states = (re_dfastate_t **)
3278 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3279 if (BE (dest_states == NULL, 0))
3280 {
3281 out_free:
3282 if (dest_states_malloced)
3283 free (dest_states);
3284 re_node_set_free (&follows);
3285 for (i = 0; i < ndests; ++i)
3286 re_node_set_free (dests_node + i);
3287 if (dests_node_malloced)
3288 free (dests_node);
3289 return NULL;
3290 }
3291 dest_states_malloced = 1;
3292 }
3293 dest_states_word = dest_states + ndests;
3294 dest_states_nl = dest_states_word + ndests;
3295 bitset_empty (acceptable);
3296
3297 /* Then build the states for all destinations. */
3298 for (i = 0; i < ndests; ++i)
3299 {
3300 int next_node;
3301 re_node_set_empty (&follows);
3302 /* Merge the follows of this destination states. */
3303 for (j = 0; j < dests_node[i].nelem; ++j)
3304 {
3305 next_node = dfa->nexts[dests_node[i].elems[j]];
3306 if (next_node != -1)
3307 {
3308 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3309 if (BE (err != REG_NOERROR, 0))
3310 goto out_free;
3311 }
3312 }
3313 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3314 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3315 goto out_free;
3316 /* If the new state has context constraint,
3317 build appropriate states for these contexts. */
3318 if (dest_states[i]->has_constraint)
3319 {
3320 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3321 CONTEXT_WORD);
3322 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3323 goto out_free;
3324
3325 if (dest_states[i] != dest_states_word[i]
3326 && dfa->mb_cur_max > 1)
3327 state->word_trtable = 1;
3328
3329 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3330 CONTEXT_NEWLINE);
3331 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3332 goto out_free;
3333 }
3334 else
3335 {
3336 dest_states_word[i] = dest_states[i];
3337 dest_states_nl[i] = dest_states[i];
3338 }
3339 bitset_merge (acceptable, dests_ch[i]);
3340 }
3341
3342 if (!BE (state->word_trtable, 0))
3343 {
3344 /* We don't care about whether the following character is a word
3345 character, or we are in a single-byte character set so we can
3346 discern by looking at the character code: allocate a
3347 256-entry transition table. */
3348 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3349 if (BE (trtable == NULL, 0))
3350 goto out_free;
3351
3352 /* For all characters ch...: */
3353 for (i = 0; i < BITSET_UINTS; ++i)
3354 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3355 elem;
3356 mask <<= 1, elem >>= 1, ++ch)
3357 if (BE (elem & 1, 0))
3358 {
3359 /* There must be exactly one destination which accepts
3360 character ch. See group_nodes_into_DFAstates. */
3361 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3362 ;
3363
3364 /* j-th destination accepts the word character ch. */
3365 if (dfa->word_char[i] & mask)
3366 trtable[ch] = dest_states_word[j];
3367 else
3368 trtable[ch] = dest_states[j];
3369 }
3370 }
3371 else
3372 {
3373 /* We care about whether the following character is a word
3374 character, and we are in a multi-byte character set: discern
3375 by looking at the character code: build two 256-entry
3376 transition tables, one starting at trtable[0] and one
3377 starting at trtable[SBC_MAX]. */
3378 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3379 2 * SBC_MAX);
3380 if (BE (trtable == NULL, 0))
3381 goto out_free;
3382
3383 /* For all characters ch...: */
3384 for (i = 0; i < BITSET_UINTS; ++i)
3385 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3386 elem;
3387 mask <<= 1, elem >>= 1, ++ch)
3388 if (BE (elem & 1, 0))
3389 {
3390 /* There must be exactly one destination which accepts
3391 character ch. See group_nodes_into_DFAstates. */
3392 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3393 ;
3394
3395 /* j-th destination accepts the word character ch. */
3396 trtable[ch] = dest_states[j];
3397 trtable[ch + SBC_MAX] = dest_states_word[j];
3398 }
3399 }
3400
3401 /* new line */
3402 if (bitset_contain (acceptable, NEWLINE_CHAR))
3403 {
3404 /* The current state accepts newline character. */
3405 for (j = 0; j < ndests; ++j)
3406 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3407 {
3408 /* k-th destination accepts newline character. */
3409 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3410 if (state->word_trtable)
3411 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3412 /* There must be only one destination which accepts
3413 newline. See group_nodes_into_DFAstates. */
3414 break;
3415 }
3416 }
3417
3418 if (dest_states_malloced)
3419 free (dest_states);
3420
3421 re_node_set_free (&follows);
3422 for (i = 0; i < ndests; ++i)
3423 re_node_set_free (dests_node + i);
3424
3425 if (dests_node_malloced)
3426 free (dests_node);
3427
3428 state->trtable = trtable;
3429 return trtable;
3430 }
3431
3432 /* Group all nodes belonging to STATE into several destinations.
3433 Then for all destinations, set the nodes belonging to the destination
3434 to DESTS_NODE[i] and set the characters accepted by the destination
3435 to DEST_CH[i]. This function return the number of destinations. */
3436
3437 static int
3438 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3439 re_dfa_t *dfa;
3440 const re_dfastate_t *state;
3441 re_node_set *dests_node;
3442 bitset *dests_ch;
3443 {
3444 reg_errcode_t err;
3445 int i, j, k;
3446 int ndests; /* Number of the destinations from `state'. */
3447 bitset accepts; /* Characters a node can accept. */
3448 const re_node_set *cur_nodes = &state->nodes;
3449 bitset_empty (accepts);
3450 ndests = 0;
3451
3452 /* For all the nodes belonging to `state', */
3453 for (i = 0; i < cur_nodes->nelem; ++i)
3454 {
3455 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3456 re_token_type_t type = node->type;
3457 unsigned int constraint = node->constraint;
3458
3459 /* Enumerate all single byte character this node can accept. */
3460 if (type == CHARACTER)
3461 bitset_set (accepts, node->opr.c);
3462 else if (type == SIMPLE_BRACKET)
3463 {
3464 bitset_merge (accepts, node->opr.sbcset);
3465 }
3466 else if (type == OP_PERIOD)
3467 {
3468 #ifdef RE_ENABLE_I18N
3469 if (dfa->mb_cur_max > 1)
3470 bitset_merge (accepts, dfa->sb_char);
3471 else
3472 #endif
3473 bitset_set_all (accepts);
3474 if (!(dfa->syntax & RE_DOT_NEWLINE))
3475 bitset_clear (accepts, '\n');
3476 if (dfa->syntax & RE_DOT_NOT_NULL)
3477 bitset_clear (accepts, '\0');
3478 }
3479 #ifdef RE_ENABLE_I18N
3480 else if (type == OP_UTF8_PERIOD)
3481 {
3482 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3483 if (!(dfa->syntax & RE_DOT_NEWLINE))
3484 bitset_clear (accepts, '\n');
3485 if (dfa->syntax & RE_DOT_NOT_NULL)
3486 bitset_clear (accepts, '\0');
3487 }
3488 #endif
3489 else
3490 continue;
3491
3492 /* Check the `accepts' and sift the characters which are not
3493 match it the context. */
3494 if (constraint)
3495 {
3496 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3497 {
3498 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3499 bitset_empty (accepts);
3500 if (accepts_newline)
3501 bitset_set (accepts, NEWLINE_CHAR);
3502 else
3503 continue;
3504 }
3505 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3506 {
3507 bitset_empty (accepts);
3508 continue;
3509 }
3510
3511 if (constraint & NEXT_WORD_CONSTRAINT)
3512 {
3513 unsigned int any_set = 0;
3514 if (type == CHARACTER && !node->word_char)
3515 {
3516 bitset_empty (accepts);
3517 continue;
3518 }
3519 #ifdef RE_ENABLE_I18N
3520 if (dfa->mb_cur_max > 1)
3521 for (j = 0; j < BITSET_UINTS; ++j)
3522 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3523 else
3524 #endif
3525 for (j = 0; j < BITSET_UINTS; ++j)
3526 any_set |= (accepts[j] &= dfa->word_char[j]);
3527 if (!any_set)
3528 continue;
3529 }
3530 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3531 {
3532 unsigned int any_set = 0;
3533 if (type == CHARACTER && node->word_char)
3534 {
3535 bitset_empty (accepts);
3536 continue;
3537 }
3538 #ifdef RE_ENABLE_I18N
3539 if (dfa->mb_cur_max > 1)
3540 for (j = 0; j < BITSET_UINTS; ++j)
3541 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3542 else
3543 #endif
3544 for (j = 0; j < BITSET_UINTS; ++j)
3545 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3546 if (!any_set)
3547 continue;
3548 }
3549 }
3550
3551 /* Then divide `accepts' into DFA states, or create a new
3552 state. Above, we make sure that accepts is not empty. */
3553 for (j = 0; j < ndests; ++j)
3554 {
3555 bitset intersec; /* Intersection sets, see below. */
3556 bitset remains;
3557 /* Flags, see below. */
3558 int has_intersec, not_subset, not_consumed;
3559
3560 /* Optimization, skip if this state doesn't accept the character. */
3561 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3562 continue;
3563
3564 /* Enumerate the intersection set of this state and `accepts'. */
3565 has_intersec = 0;
3566 for (k = 0; k < BITSET_UINTS; ++k)
3567 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3568 /* And skip if the intersection set is empty. */
3569 if (!has_intersec)
3570 continue;
3571
3572 /* Then check if this state is a subset of `accepts'. */
3573 not_subset = not_consumed = 0;
3574 for (k = 0; k < BITSET_UINTS; ++k)
3575 {
3576 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3577 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3578 }
3579
3580 /* If this state isn't a subset of `accepts', create a
3581 new group state, which has the `remains'. */
3582 if (not_subset)
3583 {
3584 bitset_copy (dests_ch[ndests], remains);
3585 bitset_copy (dests_ch[j], intersec);
3586 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3587 if (BE (err != REG_NOERROR, 0))
3588 goto error_return;
3589 ++ndests;
3590 }
3591
3592 /* Put the position in the current group. */
3593 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3594 if (BE (err < 0, 0))
3595 goto error_return;
3596
3597 /* If all characters are consumed, go to next node. */
3598 if (!not_consumed)
3599 break;
3600 }
3601 /* Some characters remain, create a new group. */
3602 if (j == ndests)
3603 {
3604 bitset_copy (dests_ch[ndests], accepts);
3605 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3606 if (BE (err != REG_NOERROR, 0))
3607 goto error_return;
3608 ++ndests;
3609 bitset_empty (accepts);
3610 }
3611 }
3612 return ndests;
3613 error_return:
3614 for (j = 0; j < ndests; ++j)
3615 re_node_set_free (dests_node + j);
3616 return -1;
3617 }
3618
3619 #ifdef RE_ENABLE_I18N
3620 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3621 Return the number of the bytes the node accepts.
3622 STR_IDX is the current index of the input string.
3623
3624 This function handles the nodes which can accept one character, or
3625 one collating element like '.', '[a-z]', opposite to the other nodes
3626 can only accept one byte. */
3627
3628 static int
3629 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3630 re_dfa_t *dfa;
3631 int node_idx, str_idx;
3632 const re_string_t *input;
3633 {
3634 const re_token_t *node = dfa->nodes + node_idx;
3635 int char_len, elem_len;
3636 int i;
3637
3638 if (BE (node->type == OP_UTF8_PERIOD, 0))
3639 {
3640 unsigned char c = re_string_byte_at (input, str_idx), d;
3641 if (BE (c < 0xc2, 1))
3642 return 0;
3643
3644 if (str_idx + 2 > input->len)
3645 return 0;
3646
3647 d = re_string_byte_at (input, str_idx + 1);
3648 if (c < 0xe0)
3649 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3650 else if (c < 0xf0)
3651 {
3652 char_len = 3;
3653 if (c == 0xe0 && d < 0xa0)
3654 return 0;
3655 }
3656 else if (c < 0xf8)
3657 {
3658 char_len = 4;
3659 if (c == 0xf0 && d < 0x90)
3660 return 0;
3661 }
3662 else if (c < 0xfc)
3663 {
3664 char_len = 5;
3665 if (c == 0xf8 && d < 0x88)
3666 return 0;
3667 }
3668 else if (c < 0xfe)
3669 {
3670 char_len = 6;
3671 if (c == 0xfc && d < 0x84)
3672 return 0;
3673 }
3674 else
3675 return 0;
3676
3677 if (str_idx + char_len > input->len)
3678 return 0;
3679
3680 for (i = 1; i < char_len; ++i)
3681 {
3682 d = re_string_byte_at (input, str_idx + i);
3683 if (d < 0x80 || d > 0xbf)
3684 return 0;
3685 }
3686 return char_len;
3687 }
3688
3689 char_len = re_string_char_size_at (input, str_idx);
3690 if (node->type == OP_PERIOD)
3691 {
3692 if (char_len <= 1)
3693 return 0;
3694 /* FIXME: I don't think this if is needed, as both '\n'
3695 and '\0' are char_len == 1. */
3696 /* '.' accepts any one character except the following two cases. */
3697 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3698 re_string_byte_at (input, str_idx) == '\n') ||
3699 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3700 re_string_byte_at (input, str_idx) == '\0'))
3701 return 0;
3702 return char_len;
3703 }
3704
3705 elem_len = re_string_elem_size_at (input, str_idx);
3706 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3707 return 0;
3708
3709 if (node->type == COMPLEX_BRACKET)
3710 {
3711 const re_charset_t *cset = node->opr.mbcset;
3712 # ifdef _LIBC
3713 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3714 + str_idx);
3715 int j;
3716 uint32_t nrules;
3717 # endif /* _LIBC */
3718 int match_len = 0;
3719 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3720 ? re_string_wchar_at (input, str_idx) : 0);
3721
3722 /* match with multibyte character? */
3723 for (i = 0; i < cset->nmbchars; ++i)
3724 if (wc == cset->mbchars[i])
3725 {
3726 match_len = char_len;
3727 goto check_node_accept_bytes_match;
3728 }
3729 /* match with character_class? */
3730 for (i = 0; i < cset->nchar_classes; ++i)
3731 {
3732 wctype_t wt = cset->char_classes[i];
3733 if (__iswctype (wc, wt))
3734 {
3735 match_len = char_len;
3736 goto check_node_accept_bytes_match;
3737 }
3738 }
3739
3740 # ifdef _LIBC
3741 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3742 if (nrules != 0)
3743 {
3744 unsigned int in_collseq = 0;
3745 const int32_t *table, *indirect;
3746 const unsigned char *weights, *extra;
3747 const char *collseqwc;
3748 int32_t idx;
3749 /* This #include defines a local function! */
3750 # include <locale/weight.h>
3751
3752 /* match with collating_symbol? */
3753 if (cset->ncoll_syms)
3754 extra = (const unsigned char *)
3755 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3756 for (i = 0; i < cset->ncoll_syms; ++i)
3757 {
3758 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3759 /* Compare the length of input collating element and
3760 the length of current collating element. */
3761 if (*coll_sym != elem_len)
3762 continue;
3763 /* Compare each bytes. */
3764 for (j = 0; j < *coll_sym; j++)
3765 if (pin[j] != coll_sym[1 + j])
3766 break;
3767 if (j == *coll_sym)
3768 {
3769 /* Match if every bytes is equal. */
3770 match_len = j;
3771 goto check_node_accept_bytes_match;
3772 }
3773 }
3774
3775 if (cset->nranges)
3776 {
3777 if (elem_len <= char_len)
3778 {
3779 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3780 in_collseq = __collseq_table_lookup (collseqwc, wc);
3781 }
3782 else
3783 in_collseq = find_collation_sequence_value (pin, elem_len);
3784 }
3785 /* match with range expression? */
3786 for (i = 0; i < cset->nranges; ++i)
3787 if (cset->range_starts[i] <= in_collseq
3788 && in_collseq <= cset->range_ends[i])
3789 {
3790 match_len = elem_len;
3791 goto check_node_accept_bytes_match;
3792 }
3793
3794 /* match with equivalence_class? */
3795 if (cset->nequiv_classes)
3796 {
3797 const unsigned char *cp = pin;
3798 table = (const int32_t *)
3799 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3800 weights = (const unsigned char *)
3801 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3802 extra = (const unsigned char *)
3803 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3804 indirect = (const int32_t *)
3805 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3806 idx = findidx (&cp);
3807 if (idx > 0)
3808 for (i = 0; i < cset->nequiv_classes; ++i)
3809 {
3810 int32_t equiv_class_idx = cset->equiv_classes[i];
3811 size_t weight_len = weights[idx];
3812 if (weight_len == weights[equiv_class_idx])
3813 {
3814 int cnt = 0;
3815 while (cnt <= weight_len
3816 && (weights[equiv_class_idx + 1 + cnt]
3817 == weights[idx + 1 + cnt]))
3818 ++cnt;
3819 if (cnt > weight_len)
3820 {
3821 match_len = elem_len;
3822 goto check_node_accept_bytes_match;
3823 }
3824 }
3825 }
3826 }
3827 }
3828 else
3829 # endif /* _LIBC */
3830 {
3831 /* match with range expression? */
3832 #if __GNUC__ >= 2
3833 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3834 #else
3835 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3836 cmp_buf[2] = wc;
3837 #endif
3838 for (i = 0; i < cset->nranges; ++i)
3839 {
3840 cmp_buf[0] = cset->range_starts[i];
3841 cmp_buf[4] = cset->range_ends[i];
3842 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3843 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3844 {
3845 match_len = char_len;
3846 goto check_node_accept_bytes_match;
3847 }
3848 }
3849 }
3850 check_node_accept_bytes_match:
3851 if (!cset->non_match)
3852 return match_len;
3853 else
3854 {
3855 if (match_len > 0)
3856 return 0;
3857 else
3858 return (elem_len > char_len) ? elem_len : char_len;
3859 }
3860 }
3861 return 0;
3862 }
3863
3864 # ifdef _LIBC
3865 static unsigned int
3866 find_collation_sequence_value (mbs, mbs_len)
3867 const unsigned char *mbs;
3868 size_t mbs_len;
3869 {
3870 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3871 if (nrules == 0)
3872 {
3873 if (mbs_len == 1)
3874 {
3875 /* No valid character. Match it as a single byte character. */
3876 const unsigned char *collseq = (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3878 return collseq[mbs[0]];
3879 }
3880 return UINT_MAX;
3881 }
3882 else
3883 {
3884 int32_t idx;
3885 const unsigned char *extra = (const unsigned char *)
3886 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3887 int32_t extrasize = (const unsigned char *)
3888 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3889
3890 for (idx = 0; idx < extrasize;)
3891 {
3892 int mbs_cnt, found = 0;
3893 int32_t elem_mbs_len;
3894 /* Skip the name of collating element name. */
3895 idx = idx + extra[idx] + 1;
3896 elem_mbs_len = extra[idx++];
3897 if (mbs_len == elem_mbs_len)
3898 {
3899 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3900 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3901 break;
3902 if (mbs_cnt == elem_mbs_len)
3903 /* Found the entry. */
3904 found = 1;
3905 }
3906 /* Skip the byte sequence of the collating element. */
3907 idx += elem_mbs_len;
3908 /* Adjust for the alignment. */
3909 idx = (idx + 3) & ~3;
3910 /* Skip the collation sequence value. */
3911 idx += sizeof (uint32_t);
3912 /* Skip the wide char sequence of the collating element. */
3913 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3914 /* If we found the entry, return the sequence value. */
3915 if (found)
3916 return *(uint32_t *) (extra + idx);
3917 /* Skip the collation sequence value. */
3918 idx += sizeof (uint32_t);
3919 }
3920 return UINT_MAX;
3921 }
3922 }
3923 # endif /* _LIBC */
3924 #endif /* RE_ENABLE_I18N */
3925
3926 /* Check whether the node accepts the byte which is IDX-th
3927 byte of the INPUT. */
3928
3929 static int
3930 check_node_accept (mctx, node, idx)
3931 const re_match_context_t *mctx;
3932 const re_token_t *node;
3933 int idx;
3934 {
3935 re_dfa_t *const dfa = mctx->dfa;
3936 unsigned char ch;
3937 if (node->constraint)
3938 {
3939 /* The node has constraints. Check whether the current context
3940 satisfies the constraints. */
3941 unsigned int context = re_string_context_at (&mctx->input, idx,
3942 mctx->eflags);
3943 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3944 return 0;
3945 }
3946 ch = re_string_byte_at (&mctx->input, idx);
3947 switch (node->type)
3948 {
3949 case CHARACTER:
3950 return node->opr.c == ch;
3951 case SIMPLE_BRACKET:
3952 return bitset_contain (node->opr.sbcset, ch);
3953 #ifdef RE_ENABLE_I18N
3954 case OP_UTF8_PERIOD:
3955 if (ch >= 0x80)
3956 return 0;
3957 /* FALLTHROUGH */
3958 #endif
3959 case OP_PERIOD:
3960 return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
3961 || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
3962 default:
3963 return 0;
3964 }
3965 }
3966
3967 /* Extend the buffers, if the buffers have run out. */
3968
3969 static reg_errcode_t
3970 extend_buffers (mctx)
3971 re_match_context_t *mctx;
3972 {
3973 reg_errcode_t ret;
3974 re_string_t *pstr = &mctx->input;
3975
3976 /* Double the lengthes of the buffers. */
3977 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3978 if (BE (ret != REG_NOERROR, 0))
3979 return ret;
3980
3981 if (mctx->state_log != NULL)
3982 {
3983 /* And double the length of state_log. */
3984 /* XXX We have no indication of the size of this buffer. If this
3985 allocation fail we have no indication that the state_log array
3986 does not have the right size. */
3987 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3988 pstr->bufs_len + 1);
3989 if (BE (new_array == NULL, 0))
3990 return REG_ESPACE;
3991 mctx->state_log = new_array;
3992 }
3993
3994 /* Then reconstruct the buffers. */
3995 if (pstr->icase)
3996 {
3997 #ifdef RE_ENABLE_I18N
3998 if (pstr->mb_cur_max > 1)
3999 {
4000 ret = build_wcs_upper_buffer (pstr);
4001 if (BE (ret != REG_NOERROR, 0))
4002 return ret;
4003 }
4004 else
4005 #endif /* RE_ENABLE_I18N */
4006 build_upper_buffer (pstr);
4007 }
4008 else
4009 {
4010 #ifdef RE_ENABLE_I18N
4011 if (pstr->mb_cur_max > 1)
4012 build_wcs_buffer (pstr);
4013 else
4014 #endif /* RE_ENABLE_I18N */
4015 {
4016 if (pstr->trans != NULL)
4017 re_string_translate_buffer (pstr);
4018 }
4019 }
4020 return REG_NOERROR;
4021 }
4022
4023 \f
4024 /* Functions for matching context. */
4025
4026 /* Initialize MCTX. */
4027
4028 static reg_errcode_t
4029 match_ctx_init (mctx, eflags, n)
4030 re_match_context_t *mctx;
4031 int eflags, n;
4032 {
4033 mctx->eflags = eflags;
4034 mctx->match_last = -1;
4035 if (n > 0)
4036 {
4037 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4038 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4039 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4040 return REG_ESPACE;
4041 }
4042 /* Already zero-ed by the caller.
4043 else
4044 mctx->bkref_ents = NULL;
4045 mctx->nbkref_ents = 0;
4046 mctx->nsub_tops = 0; */
4047 mctx->abkref_ents = n;
4048 mctx->max_mb_elem_len = 1;
4049 mctx->asub_tops = n;
4050 return REG_NOERROR;
4051 }
4052
4053 /* Clean the entries which depend on the current input in MCTX.
4054 This function must be invoked when the matcher changes the start index
4055 of the input, or changes the input string. */
4056
4057 static void
4058 match_ctx_clean (mctx)
4059 re_match_context_t *mctx;
4060 {
4061 match_ctx_free_subtops (mctx);
4062 mctx->nsub_tops = 0;
4063 mctx->nbkref_ents = 0;
4064 }
4065
4066 /* Free all the memory associated with MCTX. */
4067
4068 static void
4069 match_ctx_free (mctx)
4070 re_match_context_t *mctx;
4071 {
4072 match_ctx_free_subtops (mctx);
4073 re_free (mctx->sub_tops);
4074 re_free (mctx->bkref_ents);
4075 }
4076
4077 /* Free all the memory associated with MCTX->SUB_TOPS. */
4078
4079 static void
4080 match_ctx_free_subtops (mctx)
4081 re_match_context_t *mctx;
4082 {
4083 int st_idx;
4084 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4085 {
4086 int sl_idx;
4087 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4088 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4089 {
4090 re_sub_match_last_t *last = top->lasts[sl_idx];
4091 re_free (last->path.array);
4092 re_free (last);
4093 }
4094 re_free (top->lasts);
4095 if (top->path)
4096 {
4097 re_free (top->path->array);
4098 re_free (top->path);
4099 }
4100 free (top);
4101 }
4102 }
4103
4104 /* Add a new backreference entry to MCTX.
4105 Note that we assume that caller never call this function with duplicate
4106 entry, and call with STR_IDX which isn't smaller than any existing entry.
4107 */
4108
4109 static reg_errcode_t
4110 match_ctx_add_entry (mctx, node, str_idx, from, to)
4111 re_match_context_t *mctx;
4112 int node, str_idx, from, to;
4113 {
4114 if (mctx->nbkref_ents >= mctx->abkref_ents)
4115 {
4116 struct re_backref_cache_entry* new_entry;
4117 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4118 mctx->abkref_ents * 2);
4119 if (BE (new_entry == NULL, 0))
4120 {
4121 re_free (mctx->bkref_ents);
4122 return REG_ESPACE;
4123 }
4124 mctx->bkref_ents = new_entry;
4125 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4126 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4127 mctx->abkref_ents *= 2;
4128 }
4129 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4130 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4131 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4132 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4133 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
4134 if (mctx->max_mb_elem_len < to - from)
4135 mctx->max_mb_elem_len = to - from;
4136 return REG_NOERROR;
4137 }
4138
4139 /* Search for the first entry which has the same str_idx.
4140 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4141
4142 static int
4143 search_cur_bkref_entry (mctx, str_idx)
4144 re_match_context_t *mctx;
4145 int str_idx;
4146 {
4147 int left, right, mid;
4148 right = mctx->nbkref_ents;
4149 for (left = 0; left < right;)
4150 {
4151 mid = (left + right) / 2;
4152 if (mctx->bkref_ents[mid].str_idx < str_idx)
4153 left = mid + 1;
4154 else
4155 right = mid;
4156 }
4157 return left;
4158 }
4159
4160 static void
4161 match_ctx_clear_flag (mctx)
4162 re_match_context_t *mctx;
4163 {
4164 int i;
4165 for (i = 0; i < mctx->nbkref_ents; ++i)
4166 mctx->bkref_ents[i].flag = 0;
4167 }
4168
4169 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4170 at STR_IDX. */
4171
4172 static reg_errcode_t
4173 match_ctx_add_subtop (mctx, node, str_idx)
4174 re_match_context_t *mctx;
4175 int node, str_idx;
4176 {
4177 #ifdef DEBUG
4178 assert (mctx->sub_tops != NULL);
4179 assert (mctx->asub_tops > 0);
4180 #endif
4181 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4182 {
4183 int new_asub_tops = mctx->asub_tops * 2;
4184 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4185 re_sub_match_top_t *,
4186 new_asub_tops);
4187 if (BE (new_array == NULL, 0))
4188 return REG_ESPACE;
4189 mctx->sub_tops = new_array;
4190 mctx->asub_tops = new_asub_tops;
4191 }
4192 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4193 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4194 return REG_ESPACE;
4195 mctx->sub_tops[mctx->nsub_tops]->node = node;
4196 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4197 return REG_NOERROR;
4198 }
4199
4200 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4201 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4202
4203 static re_sub_match_last_t *
4204 match_ctx_add_sublast (subtop, node, str_idx)
4205 re_sub_match_top_t *subtop;
4206 int node, str_idx;
4207 {
4208 re_sub_match_last_t *new_entry;
4209 if (BE (subtop->nlasts == subtop->alasts, 0))
4210 {
4211 int new_alasts = 2 * subtop->alasts + 1;
4212 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4213 re_sub_match_last_t *,
4214 new_alasts);
4215 if (BE (new_array == NULL, 0))
4216 return NULL;
4217 subtop->lasts = new_array;
4218 subtop->alasts = new_alasts;
4219 }
4220 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4221 if (BE (new_entry != NULL, 1))
4222 {
4223 subtop->lasts[subtop->nlasts] = new_entry;
4224 new_entry->node = node;
4225 new_entry->str_idx = str_idx;
4226 ++subtop->nlasts;
4227 }
4228 return new_entry;
4229 }
4230
4231 static void
4232 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
4233 check_subexp)
4234 re_sift_context_t *sctx;
4235 re_dfastate_t **sifted_sts, **limited_sts;
4236 int last_node, last_str_idx, check_subexp;
4237 {
4238 sctx->sifted_states = sifted_sts;
4239 sctx->limited_states = limited_sts;
4240 sctx->last_node = last_node;
4241 sctx->last_str_idx = last_str_idx;
4242 sctx->check_subexp = check_subexp;
4243 sctx->cur_bkref = -1;
4244 sctx->cls_subexp_idx = -1;
4245 re_node_set_init_empty (&sctx->limits);
4246 }