]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regexec.c
Update.
[thirdparty/glibc.git] / posix / regexec.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 re_string_t *input, int n);
23 static void match_ctx_clean (re_match_context_t *mctx);
24 static void match_ctx_free (re_match_context_t *cache);
25 static void match_ctx_free_subtops (re_match_context_t *mctx);
26 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
27 int str_idx, int from, int to);
28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx);
29 static void match_ctx_clear_flag (re_match_context_t *mctx);
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx);
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx);
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, int last_node,
36 int last_str_idx, int check_subexp);
37 static reg_errcode_t re_search_internal (const regex_t *preg,
38 const char *string, int length,
39 int start, int range, int stop,
40 size_t nmatch, regmatch_t pmatch[],
41 int eflags);
42 static int re_search_2_stub (struct re_pattern_buffer *bufp,
43 const char *string1, int length1,
44 const char *string2, int length2,
45 int start, int range, struct re_registers *regs,
46 int stop, int ret_len);
47 static int re_search_stub (struct re_pattern_buffer *bufp,
48 const char *string, int length, int start,
49 int range, int stop, struct re_registers *regs,
50 int ret_len);
51 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
52 int nregs, int regs_allocated);
53 static re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
54 const regex_t *preg,
55 const re_match_context_t *mctx,
56 int idx);
57 static reg_errcode_t prune_impossible_nodes (const regex_t *preg,
58 re_match_context_t *mctx);
59 static int check_matching (const regex_t *preg, re_match_context_t *mctx,
60 int fl_search, int fl_longest_match);
61 static int check_halt_node_context (const re_dfa_t *dfa, int node,
62 unsigned int context);
63 static int check_halt_state_context (const regex_t *preg,
64 const re_dfastate_t *state,
65 const re_match_context_t *mctx, int idx);
66 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
67 int cur_idx, int nmatch);
68 static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs,
69 const re_match_context_t *mctx,
70 int *pidx, int node, re_node_set *eps_via_nodes,
71 struct re_fail_stack_t *fs);
72 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
73 int str_idx, int *dests, int nregs,
74 regmatch_t *regs,
75 re_node_set *eps_via_nodes);
76 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
77 regmatch_t *regs, re_node_set *eps_via_nodes);
78 static reg_errcode_t set_regs (const regex_t *preg,
79 const re_match_context_t *mctx,
80 size_t nmatch, regmatch_t *pmatch,
81 int fl_backtrack);
82 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
83
84 #ifdef RE_ENABLE_I18N
85 static int sift_states_iter_mb (const regex_t *preg,
86 const re_match_context_t *mctx,
87 re_sift_context_t *sctx,
88 int node_idx, int str_idx, int max_str_idx);
89 #endif /* RE_ENABLE_I18N */
90 static reg_errcode_t sift_states_backward (const regex_t *preg,
91 re_match_context_t *mctx,
92 re_sift_context_t *sctx);
93 static reg_errcode_t update_cur_sifted_state (const regex_t *preg,
94 re_match_context_t *mctx,
95 re_sift_context_t *sctx,
96 int str_idx,
97 re_node_set *dest_nodes);
98 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates);
101 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
102 re_node_set *dest_nodes,
103 const re_node_set *and_nodes);
104 static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits,
105 re_match_context_t *mctx, int dst_node,
106 int dst_idx, int src_node, int src_idx);
107 static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx,
108 int limit, re_node_set *eclosures,
109 int subexp_idx, int node, int str_idx);
110 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
111 re_node_set *dest_nodes,
112 const re_node_set *candidates,
113 re_node_set *limits,
114 struct re_backref_cache_entry *bkref_ents,
115 int str_idx);
116 static reg_errcode_t sift_states_bkref (const regex_t *preg,
117 re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, re_node_set *dest_nodes);
120 static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
121 int next_state_log_idx);
122 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
123 re_dfastate_t **src, int num);
124 static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
125 re_match_context_t *mctx,
126 re_dfastate_t *state, int fl_search);
127 static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa,
128 re_match_context_t *mctx,
129 re_node_set *cur_nodes,
130 int str_idx);
131 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
132 re_dfastate_t *pstate,
133 int fl_search,
134 re_match_context_t *mctx);
135 #ifdef RE_ENABLE_I18N
136 static reg_errcode_t transit_state_mb (const regex_t *preg,
137 re_dfastate_t *pstate,
138 re_match_context_t *mctx);
139 #endif /* RE_ENABLE_I18N */
140 static reg_errcode_t transit_state_bkref (const regex_t *preg,
141 re_node_set *nodes,
142 re_match_context_t *mctx);
143 static reg_errcode_t get_subexp (const regex_t *preg, re_match_context_t *mctx,
144 int bkref_node, int bkref_str_idx);
145 static reg_errcode_t get_subexp_sub (const regex_t *preg,
146 re_match_context_t *mctx,
147 re_sub_match_top_t *sub_top,
148 re_sub_match_last_t *sub_last,
149 int bkref_node, int bkref_str);
150 static int find_subexp_node (re_dfa_t *dfa, re_node_set *nodes,
151 int subexp_idx, int fl_open);
152 static reg_errcode_t check_arrival (const regex_t *preg,
153 re_match_context_t *mctx,
154 state_array_t *path, int top_node,
155 int top_str, int last_node, int last_str,
156 int fl_open);
157 static reg_errcode_t check_arrival_add_next_nodes (const regex_t *preg,
158 re_dfa_t *dfa,
159 re_match_context_t *mctx,
160 int str_idx,
161 re_node_set *cur_nodes,
162 re_node_set *next_nodes);
163 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
164 re_node_set *cur_nodes,
165 int ex_subexp, int fl_open);
166 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
167 re_node_set *dst_nodes,
168 int target, int ex_subexp,
169 int fl_open);
170 static reg_errcode_t expand_bkref_cache (const regex_t *preg,
171 re_match_context_t *mctx,
172 re_node_set *cur_nodes, int cur_str,
173 int last_str, int subexp_num,
174 int fl_open);
175 static re_dfastate_t **build_trtable (const regex_t *dfa,
176 const re_dfastate_t *state,
177 int fl_search);
178 #ifdef RE_ENABLE_I18N
179 static int check_node_accept_bytes (const regex_t *preg, int node_idx,
180 const re_string_t *input, int idx);
181 # ifdef _LIBC
182 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
183 size_t name_len);
184 # endif /* _LIBC */
185 #endif /* RE_ENABLE_I18N */
186 static int group_nodes_into_DFAstates (const regex_t *dfa,
187 const re_dfastate_t *state,
188 re_node_set *states_node,
189 bitset *states_ch);
190 static int check_node_accept (const regex_t *preg, const re_token_t *node,
191 const re_match_context_t *mctx, int idx);
192 static reg_errcode_t extend_buffers (re_match_context_t *mctx);
193 \f
194 /* Entry point for POSIX code. */
195
196 /* regexec searches for a given pattern, specified by PREG, in the
197 string STRING.
198
199 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
200 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
201 least NMATCH elements, and we set them to the offsets of the
202 corresponding matched substrings.
203
204 EFLAGS specifies `execution flags' which affect matching: if
205 REG_NOTBOL is set, then ^ does not match at the beginning of the
206 string; if REG_NOTEOL is set, then $ does not match at the end.
207
208 We return 0 if we find a match and REG_NOMATCH if not. */
209
210 int
211 regexec (preg, string, nmatch, pmatch, eflags)
212 const regex_t *__restrict preg;
213 const char *__restrict string;
214 size_t nmatch;
215 regmatch_t pmatch[];
216 int eflags;
217 {
218 reg_errcode_t err;
219 int length = strlen (string);
220 if (preg->no_sub)
221 err = re_search_internal (preg, string, length, 0, length, length, 0,
222 NULL, eflags);
223 else
224 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
225 pmatch, eflags);
226 return err != REG_NOERROR;
227 }
228 #ifdef _LIBC
229 weak_alias (__regexec, regexec)
230 #endif
231
232 /* Entry points for GNU code. */
233
234 /* re_match, re_search, re_match_2, re_search_2
235
236 The former two functions operate on STRING with length LENGTH,
237 while the later two operate on concatenation of STRING1 and STRING2
238 with lengths LENGTH1 and LENGTH2, respectively.
239
240 re_match() matches the compiled pattern in BUFP against the string,
241 starting at index START.
242
243 re_search() first tries matching at index START, then it tries to match
244 starting from index START + 1, and so on. The last start position tried
245 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
246 way as re_match().)
247
248 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
249 the first STOP characters of the concatenation of the strings should be
250 concerned.
251
252 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
253 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
254 computed relative to the concatenation, not relative to the individual
255 strings.)
256
257 On success, re_match* functions return the length of the match, re_search*
258 return the position of the start of the match. Return value -1 means no
259 match was found and -2 indicates an internal error. */
260
261 int
262 re_match (bufp, string, length, start, regs)
263 struct re_pattern_buffer *bufp;
264 const char *string;
265 int length, start;
266 struct re_registers *regs;
267 {
268 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
269 }
270 #ifdef _LIBC
271 weak_alias (__re_match, re_match)
272 #endif
273
274 int
275 re_search (bufp, string, length, start, range, regs)
276 struct re_pattern_buffer *bufp;
277 const char *string;
278 int length, start, range;
279 struct re_registers *regs;
280 {
281 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
282 }
283 #ifdef _LIBC
284 weak_alias (__re_search, re_search)
285 #endif
286
287 int
288 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
289 struct re_pattern_buffer *bufp;
290 const char *string1, *string2;
291 int length1, length2, start, stop;
292 struct re_registers *regs;
293 {
294 return re_search_2_stub (bufp, string1, length1, string2, length2,
295 start, 0, regs, stop, 1);
296 }
297 #ifdef _LIBC
298 weak_alias (__re_match_2, re_match_2)
299 #endif
300
301 int
302 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
303 struct re_pattern_buffer *bufp;
304 const char *string1, *string2;
305 int length1, length2, start, range, stop;
306 struct re_registers *regs;
307 {
308 return re_search_2_stub (bufp, string1, length1, string2, length2,
309 start, range, regs, stop, 0);
310 }
311 #ifdef _LIBC
312 weak_alias (__re_search_2, re_search_2)
313 #endif
314
315 static int
316 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
317 stop, ret_len)
318 struct re_pattern_buffer *bufp;
319 const char *string1, *string2;
320 int length1, length2, start, range, stop, ret_len;
321 struct re_registers *regs;
322 {
323 const char *str;
324 int rval;
325 int len = length1 + length2;
326 int free_str = 0;
327
328 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
329 return -2;
330
331 /* Concatenate the strings. */
332 if (length2 > 0)
333 if (length1 > 0)
334 {
335 char *s = re_malloc (char, len);
336
337 if (BE (s == NULL, 0))
338 return -2;
339 memcpy (s, string1, length1);
340 memcpy (s + length1, string2, length2);
341 str = s;
342 free_str = 1;
343 }
344 else
345 str = string2;
346 else
347 str = string1;
348
349 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
350 ret_len);
351 if (free_str)
352 re_free ((char *) str);
353 return rval;
354 }
355
356 /* The parameters have the same meaning as those of re_search.
357 Additional parameters:
358 If RET_LEN is nonzero the length of the match is returned (re_match style);
359 otherwise the position of the match is returned. */
360
361 static int
362 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
363 struct re_pattern_buffer *bufp;
364 const char *string;
365 int length, start, range, stop, ret_len;
366 struct re_registers *regs;
367 {
368 reg_errcode_t result;
369 regmatch_t *pmatch;
370 int nregs, rval;
371 int eflags = 0;
372
373 /* Check for out-of-range. */
374 if (BE (start < 0 || start > length, 0))
375 return -1;
376 if (BE (start + range > length, 0))
377 range = length - start;
378 else if (BE (start + range < 0, 0))
379 range = -start;
380
381 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
382 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
383
384 /* Compile fastmap if we haven't yet. */
385 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
386 re_compile_fastmap (bufp);
387
388 if (BE (bufp->no_sub, 0))
389 regs = NULL;
390
391 /* We need at least 1 register. */
392 if (regs == NULL)
393 nregs = 1;
394 else if (BE (bufp->regs_allocated == REGS_FIXED &&
395 regs->num_regs < bufp->re_nsub + 1, 0))
396 {
397 nregs = regs->num_regs;
398 if (BE (nregs < 1, 0))
399 {
400 /* Nothing can be copied to regs. */
401 regs = NULL;
402 nregs = 1;
403 }
404 }
405 else
406 nregs = bufp->re_nsub + 1;
407 pmatch = re_malloc (regmatch_t, nregs);
408 if (BE (pmatch == NULL, 0))
409 return -2;
410
411 result = re_search_internal (bufp, string, length, start, range, stop,
412 nregs, pmatch, eflags);
413
414 rval = 0;
415
416 /* I hope we needn't fill ther regs with -1's when no match was found. */
417 if (result != REG_NOERROR)
418 rval = -1;
419 else if (regs != NULL)
420 {
421 /* If caller wants register contents data back, copy them. */
422 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
423 bufp->regs_allocated);
424 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
425 rval = -2;
426 }
427
428 if (BE (rval == 0, 1))
429 {
430 if (ret_len)
431 {
432 assert (pmatch[0].rm_so == start);
433 rval = pmatch[0].rm_eo - start;
434 }
435 else
436 rval = pmatch[0].rm_so;
437 }
438 re_free (pmatch);
439 return rval;
440 }
441
442 static unsigned
443 re_copy_regs (regs, pmatch, nregs, regs_allocated)
444 struct re_registers *regs;
445 regmatch_t *pmatch;
446 int nregs, regs_allocated;
447 {
448 int rval = REGS_REALLOCATE;
449 int i;
450 int need_regs = nregs + 1;
451 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
452 uses. */
453
454 /* Have the register data arrays been allocated? */
455 if (regs_allocated == REGS_UNALLOCATED)
456 { /* No. So allocate them with malloc. */
457 regs->start = re_malloc (regoff_t, need_regs);
458 if (BE (regs->start == NULL, 0))
459 return REGS_UNALLOCATED;
460 regs->end = re_malloc (regoff_t, need_regs);
461 if (BE (regs->end == NULL, 0))
462 {
463 re_free (regs->start);
464 return REGS_UNALLOCATED;
465 }
466 regs->num_regs = need_regs;
467 }
468 else if (regs_allocated == REGS_REALLOCATE)
469 { /* Yes. If we need more elements than were already
470 allocated, reallocate them. If we need fewer, just
471 leave it alone. */
472 if (need_regs > regs->num_regs)
473 {
474 regs->start = re_realloc (regs->start, regoff_t, need_regs);
475 if (BE (regs->start == NULL, 0))
476 {
477 if (regs->end != NULL)
478 re_free (regs->end);
479 return REGS_UNALLOCATED;
480 }
481 regs->end = re_realloc (regs->end, regoff_t, need_regs);
482 if (BE (regs->end == NULL, 0))
483 {
484 re_free (regs->start);
485 return REGS_UNALLOCATED;
486 }
487 regs->num_regs = need_regs;
488 }
489 }
490 else
491 {
492 assert (regs_allocated == REGS_FIXED);
493 /* This function may not be called with REGS_FIXED and nregs too big. */
494 assert (regs->num_regs >= nregs);
495 rval = REGS_FIXED;
496 }
497
498 /* Copy the regs. */
499 for (i = 0; i < nregs; ++i)
500 {
501 regs->start[i] = pmatch[i].rm_so;
502 regs->end[i] = pmatch[i].rm_eo;
503 }
504 for ( ; i < regs->num_regs; ++i)
505 regs->start[i] = regs->end[i] = -1;
506
507 return rval;
508 }
509
510 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
511 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
512 this memory for recording register information. STARTS and ENDS
513 must be allocated using the malloc library routine, and must each
514 be at least NUM_REGS * sizeof (regoff_t) bytes long.
515
516 If NUM_REGS == 0, then subsequent matches should allocate their own
517 register data.
518
519 Unless this function is called, the first search or match using
520 PATTERN_BUFFER will allocate its own register data, without
521 freeing the old data. */
522
523 void
524 re_set_registers (bufp, regs, num_regs, starts, ends)
525 struct re_pattern_buffer *bufp;
526 struct re_registers *regs;
527 unsigned num_regs;
528 regoff_t *starts, *ends;
529 {
530 if (num_regs)
531 {
532 bufp->regs_allocated = REGS_REALLOCATE;
533 regs->num_regs = num_regs;
534 regs->start = starts;
535 regs->end = ends;
536 }
537 else
538 {
539 bufp->regs_allocated = REGS_UNALLOCATED;
540 regs->num_regs = 0;
541 regs->start = regs->end = (regoff_t *) 0;
542 }
543 }
544 #ifdef _LIBC
545 weak_alias (__re_set_registers, re_set_registers)
546 #endif
547 \f
548 /* Entry points compatible with 4.2 BSD regex library. We don't define
549 them unless specifically requested. */
550
551 #if defined _REGEX_RE_COMP || defined _LIBC
552 int
553 # ifdef _LIBC
554 weak_function
555 # endif
556 re_exec (s)
557 const char *s;
558 {
559 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
560 }
561 #endif /* _REGEX_RE_COMP */
562 \f
563 static re_node_set empty_set;
564
565 /* Internal entry point. */
566
567 /* Searches for a compiled pattern PREG in the string STRING, whose
568 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
569 mingings with regexec. START, and RANGE have the same meanings
570 with re_search.
571 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
572 otherwise return the error code.
573 Note: We assume front end functions already check ranges.
574 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
575
576 static reg_errcode_t
577 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
578 eflags)
579 const regex_t *preg;
580 const char *string;
581 int length, start, range, stop, eflags;
582 size_t nmatch;
583 regmatch_t pmatch[];
584 {
585 reg_errcode_t err;
586 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
587 re_string_t input;
588 int left_lim, right_lim, incr;
589 int fl_longest_match, match_first, match_last = -1;
590 int fast_translate, sb;
591 re_match_context_t mctx;
592 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
593 && range && !preg->can_be_null) ? preg->fastmap : NULL);
594
595 /* Check if the DFA haven't been compiled. */
596 if (BE (preg->used == 0 || dfa->init_state == NULL
597 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
598 || dfa->init_state_begbuf == NULL, 0))
599 return REG_NOMATCH;
600
601 re_node_set_init_empty (&empty_set);
602 memset (&mctx, '\0', sizeof (re_match_context_t));
603
604 /* We must check the longest matching, if nmatch > 0. */
605 fl_longest_match = (nmatch != 0 || dfa->nbackref);
606
607 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
608 preg->translate, preg->syntax & RE_ICASE, dfa);
609 if (BE (err != REG_NOERROR, 0))
610 goto free_return;
611 input.stop = stop;
612
613 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
614 if (BE (err != REG_NOERROR, 0))
615 goto free_return;
616
617 /* We will log all the DFA states through which the dfa pass,
618 if nmatch > 1, or this dfa has "multibyte node", which is a
619 back-reference or a node which can accept multibyte character or
620 multi character collating element. */
621 if (nmatch > 1 || dfa->has_mb_node)
622 {
623 mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
624 if (BE (mctx.state_log == NULL, 0))
625 {
626 err = REG_ESPACE;
627 goto free_return;
628 }
629 }
630 else
631 mctx.state_log = NULL;
632
633 #ifdef DEBUG
634 /* We assume front-end functions already check them. */
635 assert (start + range >= 0 && start + range <= length);
636 #endif
637
638 match_first = start;
639 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
640 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
641
642 /* Check incrementally whether of not the input string match. */
643 incr = (range < 0) ? -1 : 1;
644 left_lim = (range < 0) ? start + range : start;
645 right_lim = (range < 0) ? start : start + range;
646 sb = dfa->mb_cur_max == 1;
647 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
648
649 for (;;)
650 {
651 /* At first get the current byte from input string. */
652 if (fastmap)
653 {
654 if (BE (fast_translate, 1))
655 {
656 unsigned RE_TRANSLATE_TYPE t
657 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
658 if (BE (range >= 0, 1))
659 {
660 if (BE (t != NULL, 0))
661 {
662 while (BE (match_first < right_lim, 1)
663 && !fastmap[t[(unsigned char) string[match_first]]])
664 ++match_first;
665 }
666 else
667 {
668 while (BE (match_first < right_lim, 1)
669 && !fastmap[(unsigned char) string[match_first]])
670 ++match_first;
671 }
672 if (BE (match_first == right_lim, 0))
673 {
674 int ch = match_first >= length
675 ? 0 : (unsigned char) string[match_first];
676 if (!fastmap[t ? t[ch] : ch])
677 break;
678 }
679 }
680 else
681 {
682 while (match_first >= left_lim)
683 {
684 int ch = match_first >= length
685 ? 0 : (unsigned char) string[match_first];
686 if (fastmap[t ? t[ch] : ch])
687 break;
688 --match_first;
689 }
690 if (match_first < left_lim)
691 break;
692 }
693 }
694 else
695 {
696 int ch;
697
698 do
699 {
700 /* In this case, we can't determine easily the current byte,
701 since it might be a component byte of a multibyte
702 character. Then we use the constructed buffer
703 instead. */
704 /* If MATCH_FIRST is out of the valid range, reconstruct the
705 buffers. */
706 if (input.raw_mbs_idx + input.valid_len <= match_first
707 || match_first < input.raw_mbs_idx)
708 {
709 err = re_string_reconstruct (&input, match_first, eflags,
710 preg->newline_anchor);
711 if (BE (err != REG_NOERROR, 0))
712 goto free_return;
713 }
714 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
715 Note that MATCH_FIRST must not be smaller than 0. */
716 ch = ((match_first >= length) ? 0
717 : re_string_byte_at (&input,
718 match_first - input.raw_mbs_idx));
719 if (fastmap[ch])
720 break;
721 match_first += incr;
722 }
723 while (match_first >= left_lim && match_first <= right_lim);
724 if (! fastmap[ch])
725 break;
726 }
727 }
728
729 /* Reconstruct the buffers so that the matcher can assume that
730 the matching starts from the begining of the buffer. */
731 err = re_string_reconstruct (&input, match_first, eflags,
732 preg->newline_anchor);
733 if (BE (err != REG_NOERROR, 0))
734 goto free_return;
735 #ifdef RE_ENABLE_I18N
736 /* Eliminate it when it is a component of a multibyte character
737 and isn't the head of a multibyte character. */
738 if (sb || re_string_first_byte (&input, 0))
739 #endif
740 {
741 /* It seems to be appropriate one, then use the matcher. */
742 /* We assume that the matching starts from 0. */
743 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
744 match_last = check_matching (preg, &mctx, 0, fl_longest_match);
745 if (match_last != -1)
746 {
747 if (BE (match_last == -2, 0))
748 {
749 err = REG_ESPACE;
750 goto free_return;
751 }
752 else
753 {
754 mctx.match_last = match_last;
755 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
756 {
757 re_dfastate_t *pstate = mctx.state_log[match_last];
758 mctx.last_node = check_halt_state_context (preg, pstate,
759 &mctx, match_last);
760 }
761 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
762 || dfa->nbackref)
763 {
764 err = prune_impossible_nodes (preg, &mctx);
765 if (err == REG_NOERROR)
766 break;
767 if (BE (err != REG_NOMATCH, 0))
768 goto free_return;
769 }
770 else
771 break; /* We found a matching. */
772 }
773 }
774 match_ctx_clean (&mctx);
775 }
776 /* Update counter. */
777 match_first += incr;
778 if (match_first < left_lim || right_lim < match_first)
779 break;
780 }
781
782 /* Set pmatch[] if we need. */
783 if (match_last != -1 && nmatch > 0)
784 {
785 int reg_idx;
786
787 /* Initialize registers. */
788 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
789 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
790
791 /* Set the points where matching start/end. */
792 pmatch[0].rm_so = 0;
793 pmatch[0].rm_eo = mctx.match_last;
794
795 if (!preg->no_sub && nmatch > 1)
796 {
797 err = set_regs (preg, &mctx, nmatch, pmatch,
798 dfa->has_plural_match && dfa->nbackref > 0);
799 if (BE (err != REG_NOERROR, 0))
800 goto free_return;
801 }
802
803 /* At last, add the offset to the each registers, since we slided
804 the buffers so that We can assume that the matching starts from 0. */
805 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
806 if (pmatch[reg_idx].rm_so != -1)
807 {
808 pmatch[reg_idx].rm_so += match_first;
809 pmatch[reg_idx].rm_eo += match_first;
810 }
811 }
812 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
813 free_return:
814 re_free (mctx.state_log);
815 if (dfa->nbackref)
816 match_ctx_free (&mctx);
817 re_string_destruct (&input);
818 return err;
819 }
820
821 static reg_errcode_t
822 prune_impossible_nodes (preg, mctx)
823 const regex_t *preg;
824 re_match_context_t *mctx;
825 {
826 int halt_node, match_last;
827 reg_errcode_t ret;
828 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
829 re_dfastate_t **sifted_states;
830 re_dfastate_t **lim_states = NULL;
831 re_sift_context_t sctx;
832 #ifdef DEBUG
833 assert (mctx->state_log != NULL);
834 #endif
835 match_last = mctx->match_last;
836 halt_node = mctx->last_node;
837 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
838 if (BE (sifted_states == NULL, 0))
839 {
840 ret = REG_ESPACE;
841 goto free_return;
842 }
843 if (dfa->nbackref)
844 {
845 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
846 if (BE (lim_states == NULL, 0))
847 {
848 ret = REG_ESPACE;
849 goto free_return;
850 }
851 while (1)
852 {
853 memset (lim_states, '\0',
854 sizeof (re_dfastate_t *) * (match_last + 1));
855 match_ctx_clear_flag (mctx);
856 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
857 match_last, 0);
858 ret = sift_states_backward (preg, mctx, &sctx);
859 re_node_set_free (&sctx.limits);
860 if (BE (ret != REG_NOERROR, 0))
861 goto free_return;
862 if (sifted_states[0] != NULL || lim_states[0] != NULL)
863 break;
864 do
865 {
866 --match_last;
867 if (match_last < 0)
868 {
869 ret = REG_NOMATCH;
870 goto free_return;
871 }
872 } while (!mctx->state_log[match_last]->halt);
873 halt_node = check_halt_state_context (preg,
874 mctx->state_log[match_last],
875 mctx, match_last);
876 }
877 ret = merge_state_array (dfa, sifted_states, lim_states,
878 match_last + 1);
879 re_free (lim_states);
880 lim_states = NULL;
881 if (BE (ret != REG_NOERROR, 0))
882 goto free_return;
883 }
884 else
885 {
886 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
887 match_last, 0);
888 ret = sift_states_backward (preg, mctx, &sctx);
889 re_node_set_free (&sctx.limits);
890 if (BE (ret != REG_NOERROR, 0))
891 goto free_return;
892 }
893 re_free (mctx->state_log);
894 mctx->state_log = sifted_states;
895 sifted_states = NULL;
896 mctx->last_node = halt_node;
897 mctx->match_last = match_last;
898 ret = REG_NOERROR;
899 free_return:
900 re_free (sifted_states);
901 re_free (lim_states);
902 return ret;
903 }
904
905 /* Acquire an initial state and return it.
906 We must select appropriate initial state depending on the context,
907 since initial states may have constraints like "\<", "^", etc.. */
908
909 static re_dfastate_t *
910 acquire_init_state_context (err, preg, mctx, idx)
911 reg_errcode_t *err;
912 const regex_t *preg;
913 const re_match_context_t *mctx;
914 int idx;
915 {
916 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
917
918 *err = REG_NOERROR;
919 if (dfa->init_state->has_constraint)
920 {
921 unsigned int context;
922 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
923 preg->newline_anchor);
924 if (IS_WORD_CONTEXT (context))
925 return dfa->init_state_word;
926 else if (IS_ORDINARY_CONTEXT (context))
927 return dfa->init_state;
928 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
929 return dfa->init_state_begbuf;
930 else if (IS_NEWLINE_CONTEXT (context))
931 return dfa->init_state_nl;
932 else if (IS_BEGBUF_CONTEXT (context))
933 {
934 /* It is relatively rare case, then calculate on demand. */
935 return re_acquire_state_context (err, dfa,
936 dfa->init_state->entrance_nodes,
937 context);
938 }
939 else
940 /* Must not happen? */
941 return dfa->init_state;
942 }
943 else
944 return dfa->init_state;
945 }
946
947 /* Check whether the regular expression match input string INPUT or not,
948 and return the index where the matching end, return -1 if not match,
949 or return -2 in case of an error.
950 FL_SEARCH means we must search where the matching starts,
951 FL_LONGEST_MATCH means we want the POSIX longest matching.
952 Note that the matcher assume that the maching starts from the current
953 index of the buffer. */
954
955 static int
956 check_matching (preg, mctx, fl_search, fl_longest_match)
957 const regex_t *preg;
958 re_match_context_t *mctx;
959 int fl_search, fl_longest_match;
960 {
961 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
962 reg_errcode_t err;
963 int match = 0;
964 int match_last = -1;
965 int cur_str_idx = re_string_cur_idx (mctx->input);
966 re_dfastate_t *cur_state;
967
968 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
969 /* An initial state must not be NULL(invalid state). */
970 if (BE (cur_state == NULL, 0))
971 return -2;
972 if (mctx->state_log != NULL)
973 mctx->state_log[cur_str_idx] = cur_state;
974
975 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
976 later. E.g. Processing back references. */
977 if (dfa->nbackref)
978 {
979 err = check_subexp_matching_top (dfa, mctx, &cur_state->nodes, 0);
980 if (BE (err != REG_NOERROR, 0))
981 return err;
982 }
983
984 if (cur_state->has_backref)
985 {
986 err = transit_state_bkref (preg, &cur_state->nodes, mctx);
987 if (BE (err != REG_NOERROR, 0))
988 return err;
989 }
990
991 /* If the RE accepts NULL string. */
992 if (cur_state->halt)
993 {
994 if (!cur_state->has_constraint
995 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
996 {
997 if (!fl_longest_match)
998 return cur_str_idx;
999 else
1000 {
1001 match_last = cur_str_idx;
1002 match = 1;
1003 }
1004 }
1005 }
1006
1007 while (!re_string_eoi (mctx->input))
1008 {
1009 cur_state = transit_state (&err, preg, mctx, cur_state,
1010 fl_search && !match);
1011 if (cur_state == NULL) /* Reached at the invalid state or an error. */
1012 {
1013 cur_str_idx = re_string_cur_idx (mctx->input);
1014 if (BE (err != REG_NOERROR, 0))
1015 return -2;
1016 if (fl_search && !match)
1017 {
1018 /* Restart from initial state, since we are searching
1019 the point from where matching start. */
1020 #ifdef RE_ENABLE_I18N
1021 if (dfa->mb_cur_max == 1
1022 || re_string_first_byte (mctx->input, cur_str_idx))
1023 #endif /* RE_ENABLE_I18N */
1024 cur_state = acquire_init_state_context (&err, preg, mctx,
1025 cur_str_idx);
1026 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
1027 return -2;
1028 if (mctx->state_log != NULL)
1029 mctx->state_log[cur_str_idx] = cur_state;
1030 }
1031 else if (!fl_longest_match && match)
1032 break;
1033 else /* (fl_longest_match && match) || (!fl_search && !match) */
1034 {
1035 if (mctx->state_log == NULL)
1036 break;
1037 else
1038 {
1039 int max = mctx->state_log_top;
1040 for (; cur_str_idx <= max; ++cur_str_idx)
1041 if (mctx->state_log[cur_str_idx] != NULL)
1042 break;
1043 if (cur_str_idx > max)
1044 break;
1045 }
1046 }
1047 }
1048
1049 if (cur_state != NULL && cur_state->halt)
1050 {
1051 /* Reached at a halt state.
1052 Check the halt state can satisfy the current context. */
1053 if (!cur_state->has_constraint
1054 || check_halt_state_context (preg, cur_state, mctx,
1055 re_string_cur_idx (mctx->input)))
1056 {
1057 /* We found an appropriate halt state. */
1058 match_last = re_string_cur_idx (mctx->input);
1059 match = 1;
1060 if (!fl_longest_match)
1061 break;
1062 }
1063 }
1064 }
1065 return match_last;
1066 }
1067
1068 /* Check NODE match the current context. */
1069
1070 static int check_halt_node_context (dfa, node, context)
1071 const re_dfa_t *dfa;
1072 int node;
1073 unsigned int context;
1074 {
1075 re_token_type_t type = dfa->nodes[node].type;
1076 unsigned int constraint = dfa->nodes[node].constraint;
1077 if (type != END_OF_RE)
1078 return 0;
1079 if (!constraint)
1080 return 1;
1081 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1082 return 0;
1083 return 1;
1084 }
1085
1086 /* Check the halt state STATE match the current context.
1087 Return 0 if not match, if the node, STATE has, is a halt node and
1088 match the context, return the node. */
1089
1090 static int
1091 check_halt_state_context (preg, state, mctx, idx)
1092 const regex_t *preg;
1093 const re_dfastate_t *state;
1094 const re_match_context_t *mctx;
1095 int idx;
1096 {
1097 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1098 int i;
1099 unsigned int context;
1100 #ifdef DEBUG
1101 assert (state->halt);
1102 #endif
1103 context = re_string_context_at (mctx->input, idx, mctx->eflags,
1104 preg->newline_anchor);
1105 for (i = 0; i < state->nodes.nelem; ++i)
1106 if (check_halt_node_context (dfa, state->nodes.elems[i], context))
1107 return state->nodes.elems[i];
1108 return 0;
1109 }
1110
1111 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1112 corresponding to the DFA).
1113 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1114 of errors. */
1115
1116 static int
1117 proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs)
1118 const regex_t *preg;
1119 regmatch_t *regs;
1120 const re_match_context_t *mctx;
1121 int nregs, *pidx, node;
1122 re_node_set *eps_via_nodes;
1123 struct re_fail_stack_t *fs;
1124 {
1125 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1126 int i, err, dest_node;
1127 dest_node = -1;
1128 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1129 {
1130 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1131 int ndest, dest_nodes[2];
1132 err = re_node_set_insert (eps_via_nodes, node);
1133 if (BE (err < 0, 0))
1134 return -1;
1135 /* Pick up valid destinations. */
1136 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1137 {
1138 int candidate = dfa->edests[node].elems[i];
1139 if (!re_node_set_contains (cur_nodes, candidate))
1140 continue;
1141 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1142 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1143 ++ndest;
1144 }
1145 if (ndest <= 1)
1146 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1147 /* In order to avoid infinite loop like "(a*)*". */
1148 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1149 return dest_nodes[1];
1150 if (fs != NULL)
1151 push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes);
1152 return dest_nodes[0];
1153 }
1154 else
1155 {
1156 int naccepted = 0;
1157 re_token_type_t type = dfa->nodes[node].type;
1158
1159 #ifdef RE_ENABLE_I18N
1160 if (ACCEPT_MB_NODE (type))
1161 naccepted = check_node_accept_bytes (preg, node, mctx->input, *pidx);
1162 else
1163 #endif /* RE_ENABLE_I18N */
1164 if (type == OP_BACK_REF)
1165 {
1166 int subexp_idx = dfa->nodes[node].opr.idx;
1167 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1168 if (fs != NULL)
1169 {
1170 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1171 return -1;
1172 else if (naccepted)
1173 {
1174 char *buf = (char *) re_string_get_buffer (mctx->input);
1175 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1176 naccepted) != 0)
1177 return -1;
1178 }
1179 }
1180
1181 if (naccepted == 0)
1182 {
1183 err = re_node_set_insert (eps_via_nodes, node);
1184 if (BE (err < 0, 0))
1185 return -2;
1186 dest_node = dfa->edests[node].elems[0];
1187 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1188 dest_node))
1189 return dest_node;
1190 }
1191 }
1192
1193 if (naccepted != 0
1194 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
1195 {
1196 dest_node = dfa->nexts[node];
1197 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1198 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1199 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1200 dest_node)))
1201 return -1;
1202 re_node_set_empty (eps_via_nodes);
1203 return dest_node;
1204 }
1205 }
1206 return -1;
1207 }
1208
1209 static reg_errcode_t
1210 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1211 struct re_fail_stack_t *fs;
1212 int str_idx, *dests, nregs;
1213 regmatch_t *regs;
1214 re_node_set *eps_via_nodes;
1215 {
1216 reg_errcode_t err;
1217 int num = fs->num++;
1218 if (fs->num == fs->alloc)
1219 {
1220 struct re_fail_stack_ent_t *new_array;
1221 fs->alloc *= 2;
1222 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1223 * fs->alloc));
1224 if (new_array == NULL)
1225 return REG_ESPACE;
1226 fs->stack = new_array;
1227 }
1228 fs->stack[num].idx = str_idx;
1229 fs->stack[num].node = dests[1];
1230 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1231 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1232 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1233 return err;
1234 }
1235
1236 static int
1237 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1238 struct re_fail_stack_t *fs;
1239 int *pidx, nregs;
1240 regmatch_t *regs;
1241 re_node_set *eps_via_nodes;
1242 {
1243 int num = --fs->num;
1244 assert (num >= 0);
1245 *pidx = fs->stack[num].idx;
1246 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1247 re_node_set_free (eps_via_nodes);
1248 re_free (fs->stack[num].regs);
1249 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1250 return fs->stack[num].node;
1251 }
1252
1253 /* Set the positions where the subexpressions are starts/ends to registers
1254 PMATCH.
1255 Note: We assume that pmatch[0] is already set, and
1256 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1257
1258 static reg_errcode_t
1259 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1260 const regex_t *preg;
1261 const re_match_context_t *mctx;
1262 size_t nmatch;
1263 regmatch_t *pmatch;
1264 int fl_backtrack;
1265 {
1266 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1267 int idx, cur_node, real_nmatch;
1268 re_node_set eps_via_nodes;
1269 struct re_fail_stack_t *fs;
1270 struct re_fail_stack_t fs_body = {0, 2, NULL};
1271 #ifdef DEBUG
1272 assert (nmatch > 1);
1273 assert (mctx->state_log != NULL);
1274 #endif
1275 if (fl_backtrack)
1276 {
1277 fs = &fs_body;
1278 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1279 }
1280 else
1281 fs = NULL;
1282 cur_node = dfa->init_node;
1283 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1284 re_node_set_init_empty (&eps_via_nodes);
1285 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1286 {
1287 update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
1288 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1289 {
1290 int reg_idx;
1291 if (fs)
1292 {
1293 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1294 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1295 break;
1296 if (reg_idx == nmatch)
1297 {
1298 re_node_set_free (&eps_via_nodes);
1299 return free_fail_stack_return (fs);
1300 }
1301 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1302 &eps_via_nodes);
1303 }
1304 else
1305 {
1306 re_node_set_free (&eps_via_nodes);
1307 return REG_NOERROR;
1308 }
1309 }
1310
1311 /* Proceed to next node. */
1312 cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node,
1313 &eps_via_nodes, fs);
1314
1315 if (BE (cur_node < 0, 0))
1316 {
1317 if (cur_node == -2)
1318 return REG_ESPACE;
1319 if (fs)
1320 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1321 &eps_via_nodes);
1322 else
1323 {
1324 re_node_set_free (&eps_via_nodes);
1325 return REG_NOMATCH;
1326 }
1327 }
1328 }
1329 re_node_set_free (&eps_via_nodes);
1330 return free_fail_stack_return (fs);
1331 }
1332
1333 static reg_errcode_t
1334 free_fail_stack_return (fs)
1335 struct re_fail_stack_t *fs;
1336 {
1337 if (fs)
1338 {
1339 int fs_idx;
1340 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1341 {
1342 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1343 re_free (fs->stack[fs_idx].regs);
1344 }
1345 re_free (fs->stack);
1346 }
1347 return REG_NOERROR;
1348 }
1349
1350 static void
1351 update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1352 re_dfa_t *dfa;
1353 regmatch_t *pmatch;
1354 int cur_node, cur_idx, nmatch;
1355 {
1356 int type = dfa->nodes[cur_node].type;
1357 int reg_num;
1358 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1359 return;
1360 reg_num = dfa->nodes[cur_node].opr.idx + 1;
1361 if (reg_num >= nmatch)
1362 return;
1363 if (type == OP_OPEN_SUBEXP)
1364 {
1365 /* We are at the first node of this sub expression. */
1366 pmatch[reg_num].rm_so = cur_idx;
1367 pmatch[reg_num].rm_eo = -1;
1368 }
1369 else if (type == OP_CLOSE_SUBEXP)
1370 /* We are at the first node of this sub expression. */
1371 pmatch[reg_num].rm_eo = cur_idx;
1372 }
1373
1374 #define NUMBER_OF_STATE 1
1375
1376 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1377 and sift the nodes in each states according to the following rules.
1378 Updated state_log will be wrote to STATE_LOG.
1379
1380 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1381 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1382 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1383 the LAST_NODE, we throw away the node `a'.
1384 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1385 string `s' and transit to `b':
1386 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1387 away the node `a'.
1388 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1389 throwed away, we throw away the node `a'.
1390 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1391 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1392 node `a'.
1393 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1394 we throw away the node `a'. */
1395
1396 #define STATE_NODE_CONTAINS(state,node) \
1397 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1398
1399 static reg_errcode_t
1400 sift_states_backward (preg, mctx, sctx)
1401 const regex_t *preg;
1402 re_match_context_t *mctx;
1403 re_sift_context_t *sctx;
1404 {
1405 reg_errcode_t err;
1406 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1407 int null_cnt = 0;
1408 int str_idx = sctx->last_str_idx;
1409 re_node_set cur_dest;
1410 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
1411
1412 #ifdef DEBUG
1413 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1414 #endif
1415 cur_src = &mctx->state_log[str_idx]->nodes;
1416
1417 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1418 transit to the last_node and the last_node itself. */
1419 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1420 if (BE (err != REG_NOERROR, 0))
1421 return err;
1422 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1423 if (BE (err != REG_NOERROR, 0))
1424 goto free_return;
1425
1426 /* Then check each states in the state_log. */
1427 while (str_idx > 0)
1428 {
1429 int i, ret;
1430 /* Update counters. */
1431 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1432 if (null_cnt > mctx->max_mb_elem_len)
1433 {
1434 memset (sctx->sifted_states, '\0',
1435 sizeof (re_dfastate_t *) * str_idx);
1436 re_node_set_free (&cur_dest);
1437 return REG_NOERROR;
1438 }
1439 re_node_set_empty (&cur_dest);
1440 --str_idx;
1441 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1442 : &mctx->state_log[str_idx]->nodes);
1443
1444 /* Then build the next sifted state.
1445 We build the next sifted state on `cur_dest', and update
1446 `sifted_states[str_idx]' with `cur_dest'.
1447 Note:
1448 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1449 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1450 for (i = 0; i < cur_src->nelem; i++)
1451 {
1452 int prev_node = cur_src->elems[i];
1453 int naccepted = 0;
1454 re_token_type_t type = dfa->nodes[prev_node].type;
1455
1456 if (IS_EPSILON_NODE(type))
1457 continue;
1458 #ifdef RE_ENABLE_I18N
1459 /* If the node may accept `multi byte'. */
1460 if (ACCEPT_MB_NODE (type))
1461 naccepted = sift_states_iter_mb (preg, mctx, sctx, prev_node,
1462 str_idx, sctx->last_str_idx);
1463
1464 #endif /* RE_ENABLE_I18N */
1465 /* We don't check backreferences here.
1466 See update_cur_sifted_state(). */
1467
1468 if (!naccepted
1469 && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1470 str_idx)
1471 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1472 dfa->nexts[prev_node]))
1473 naccepted = 1;
1474
1475 if (naccepted == 0)
1476 continue;
1477
1478 if (sctx->limits.nelem)
1479 {
1480 int to_idx = str_idx + naccepted;
1481 if (check_dst_limits (dfa, &sctx->limits, mctx,
1482 dfa->nexts[prev_node], to_idx,
1483 prev_node, str_idx))
1484 continue;
1485 }
1486 ret = re_node_set_insert (&cur_dest, prev_node);
1487 if (BE (ret == -1, 0))
1488 {
1489 err = REG_ESPACE;
1490 goto free_return;
1491 }
1492 }
1493
1494 /* Add all the nodes which satisfy the following conditions:
1495 - It can epsilon transit to a node in CUR_DEST.
1496 - It is in CUR_SRC.
1497 And update state_log. */
1498 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1499 if (BE (err != REG_NOERROR, 0))
1500 goto free_return;
1501 }
1502 err = REG_NOERROR;
1503 free_return:
1504 re_node_set_free (&cur_dest);
1505 return err;
1506 }
1507
1508 /* Helper functions. */
1509
1510 static reg_errcode_t
1511 clean_state_log_if_need (mctx, next_state_log_idx)
1512 re_match_context_t *mctx;
1513 int next_state_log_idx;
1514 {
1515 int top = mctx->state_log_top;
1516
1517 if (next_state_log_idx >= mctx->input->bufs_len
1518 || (next_state_log_idx >= mctx->input->valid_len
1519 && mctx->input->valid_len < mctx->input->len))
1520 {
1521 reg_errcode_t err;
1522 err = extend_buffers (mctx);
1523 if (BE (err != REG_NOERROR, 0))
1524 return err;
1525 }
1526
1527 if (top < next_state_log_idx)
1528 {
1529 memset (mctx->state_log + top + 1, '\0',
1530 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1531 mctx->state_log_top = next_state_log_idx;
1532 }
1533 return REG_NOERROR;
1534 }
1535
1536 static reg_errcode_t
1537 merge_state_array (dfa, dst, src, num)
1538 re_dfa_t *dfa;
1539 re_dfastate_t **dst;
1540 re_dfastate_t **src;
1541 int num;
1542 {
1543 int st_idx;
1544 reg_errcode_t err;
1545 for (st_idx = 0; st_idx < num; ++st_idx)
1546 {
1547 if (dst[st_idx] == NULL)
1548 dst[st_idx] = src[st_idx];
1549 else if (src[st_idx] != NULL)
1550 {
1551 re_node_set merged_set;
1552 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1553 &src[st_idx]->nodes);
1554 if (BE (err != REG_NOERROR, 0))
1555 return err;
1556 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1557 re_node_set_free (&merged_set);
1558 if (BE (err != REG_NOERROR, 0))
1559 return err;
1560 }
1561 }
1562 return REG_NOERROR;
1563 }
1564
1565 static reg_errcode_t
1566 update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes)
1567 const regex_t *preg;
1568 re_match_context_t *mctx;
1569 re_sift_context_t *sctx;
1570 int str_idx;
1571 re_node_set *dest_nodes;
1572 {
1573 reg_errcode_t err;
1574 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1575 const re_node_set *candidates;
1576 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1577 : &mctx->state_log[str_idx]->nodes);
1578
1579 /* At first, add the nodes which can epsilon transit to a node in
1580 DEST_NODE. */
1581 if (dest_nodes->nelem)
1582 {
1583 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1584 if (BE (err != REG_NOERROR, 0))
1585 return err;
1586 }
1587
1588 /* Then, check the limitations in the current sift_context. */
1589 if (dest_nodes->nelem && sctx->limits.nelem)
1590 {
1591 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1592 mctx->bkref_ents, str_idx);
1593 if (BE (err != REG_NOERROR, 0))
1594 return err;
1595 }
1596
1597 /* Update state_log. */
1598 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1599 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1600 return err;
1601
1602 if ((mctx->state_log[str_idx] != NULL
1603 && mctx->state_log[str_idx]->has_backref))
1604 {
1605 err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes);
1606 if (BE (err != REG_NOERROR, 0))
1607 return err;
1608 }
1609 return REG_NOERROR;
1610 }
1611
1612 static reg_errcode_t
1613 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1614 re_dfa_t *dfa;
1615 re_node_set *dest_nodes;
1616 const re_node_set *candidates;
1617 {
1618 reg_errcode_t err;
1619 int src_idx;
1620 re_node_set src_copy;
1621
1622 err = re_node_set_init_copy (&src_copy, dest_nodes);
1623 if (BE (err != REG_NOERROR, 0))
1624 return err;
1625 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1626 {
1627 err = re_node_set_add_intersect (dest_nodes, candidates,
1628 dfa->inveclosures
1629 + src_copy.elems[src_idx]);
1630 if (BE (err != REG_NOERROR, 0))
1631 {
1632 re_node_set_free (&src_copy);
1633 return err;
1634 }
1635 }
1636 re_node_set_free (&src_copy);
1637 return REG_NOERROR;
1638 }
1639
1640 static reg_errcode_t
1641 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1642 re_dfa_t *dfa;
1643 int node;
1644 re_node_set *dest_nodes;
1645 const re_node_set *candidates;
1646 {
1647 int ecl_idx;
1648 reg_errcode_t err;
1649 re_node_set *inv_eclosure = dfa->inveclosures + node;
1650 re_node_set except_nodes;
1651 re_node_set_init_empty (&except_nodes);
1652 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1653 {
1654 int cur_node = inv_eclosure->elems[ecl_idx];
1655 if (cur_node == node)
1656 continue;
1657 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1658 {
1659 int edst1 = dfa->edests[cur_node].elems[0];
1660 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1661 ? dfa->edests[cur_node].elems[1] : -1);
1662 if ((!re_node_set_contains (inv_eclosure, edst1)
1663 && re_node_set_contains (dest_nodes, edst1))
1664 || (edst2 > 0
1665 && !re_node_set_contains (inv_eclosure, edst2)
1666 && re_node_set_contains (dest_nodes, edst2)))
1667 {
1668 err = re_node_set_add_intersect (&except_nodes, candidates,
1669 dfa->inveclosures + cur_node);
1670 if (BE (err != REG_NOERROR, 0))
1671 {
1672 re_node_set_free (&except_nodes);
1673 return err;
1674 }
1675 }
1676 }
1677 }
1678 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1679 {
1680 int cur_node = inv_eclosure->elems[ecl_idx];
1681 if (!re_node_set_contains (&except_nodes, cur_node))
1682 {
1683 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1684 re_node_set_remove_at (dest_nodes, idx);
1685 }
1686 }
1687 re_node_set_free (&except_nodes);
1688 return REG_NOERROR;
1689 }
1690
1691 static int
1692 check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx)
1693 re_dfa_t *dfa;
1694 re_node_set *limits;
1695 re_match_context_t *mctx;
1696 int dst_node, dst_idx, src_node, src_idx;
1697 {
1698 int lim_idx, src_pos, dst_pos;
1699
1700 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1701 {
1702 int subexp_idx;
1703 struct re_backref_cache_entry *ent;
1704 ent = mctx->bkref_ents + limits->elems[lim_idx];
1705 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1706
1707 dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1708 dfa->eclosures + dst_node,
1709 subexp_idx, dst_node, dst_idx);
1710 src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1711 dfa->eclosures + src_node,
1712 subexp_idx, src_node, src_idx);
1713
1714 /* In case of:
1715 <src> <dst> ( <subexp> )
1716 ( <subexp> ) <src> <dst>
1717 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1718 if (src_pos == dst_pos)
1719 continue; /* This is unrelated limitation. */
1720 else
1721 return 1;
1722 }
1723 return 0;
1724 }
1725
1726 static int
1727 check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, from_node,
1728 str_idx)
1729 re_dfa_t *dfa;
1730 re_match_context_t *mctx;
1731 re_node_set *eclosures;
1732 int limit, subexp_idx, from_node, str_idx;
1733 {
1734 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1735 int node_idx;
1736
1737 /* If we are outside the range of the subexpression, return -1 or 1. */
1738 if (str_idx < lim->subexp_from)
1739 return -1;
1740
1741 if (lim->subexp_to < str_idx)
1742 return 1;
1743
1744 /* If we are within the subexpression, return 0. */
1745 if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
1746 return 0;
1747
1748 /* Else, we are on the boundary: examine the nodes on the epsilon
1749 closure. */
1750 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1751 {
1752 int node = eclosures->elems[node_idx];
1753 switch (dfa->nodes[node].type)
1754 {
1755 case OP_BACK_REF:
1756 {
1757 int bi = search_cur_bkref_entry (mctx, str_idx);
1758 for (; bi < mctx->nbkref_ents; ++bi)
1759 {
1760 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
1761 int dst, cpos;
1762
1763 /* If this backreference goes beyond the point we're
1764 examining, don't go any further. */
1765 if (ent->str_idx > str_idx)
1766 break;
1767
1768 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1769 continue;
1770
1771 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1772 OP_CLOSE_SUBEXP cases below. But, if the
1773 destination node is the same node as the source
1774 node, don't recurse because it would cause an
1775 infinite loop: a regex that exhibits this behavior
1776 is ()\1*\1* */
1777 dst = dfa->edests[node].elems[0];
1778 if (dst == from_node)
1779 {
1780 if (str_idx == lim->subexp_from)
1781 return -1;
1782 else /* if (str_idx == lim->subexp_to) */
1783 return 0;
1784 }
1785
1786 cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
1787 dfa->eclosures + dst,
1788 subexp_idx, dst,
1789 str_idx);
1790
1791 if (cpos == -1 && str_idx == lim->subexp_from)
1792 return -1;
1793
1794 if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
1795 return 0;
1796 }
1797 break;
1798 }
1799
1800 case OP_OPEN_SUBEXP:
1801 if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
1802 return -1;
1803 break;
1804
1805 case OP_CLOSE_SUBEXP:
1806 if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
1807 return 0;
1808 break;
1809
1810 default:
1811 break;
1812 }
1813 }
1814
1815 if (str_idx == lim->subexp_to)
1816 return 1;
1817 else
1818 return 0;
1819 }
1820
1821 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1822 which are against limitations from DEST_NODES. */
1823
1824 static reg_errcode_t
1825 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1826 re_dfa_t *dfa;
1827 re_node_set *dest_nodes;
1828 const re_node_set *candidates;
1829 re_node_set *limits;
1830 struct re_backref_cache_entry *bkref_ents;
1831 int str_idx;
1832 {
1833 reg_errcode_t err;
1834 int node_idx, lim_idx;
1835
1836 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1837 {
1838 int subexp_idx;
1839 struct re_backref_cache_entry *ent;
1840 ent = bkref_ents + limits->elems[lim_idx];
1841
1842 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1843 continue; /* This is unrelated limitation. */
1844
1845 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1846 if (ent->subexp_to == str_idx)
1847 {
1848 int ops_node = -1;
1849 int cls_node = -1;
1850 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1851 {
1852 int node = dest_nodes->elems[node_idx];
1853 re_token_type_t type= dfa->nodes[node].type;
1854 if (type == OP_OPEN_SUBEXP
1855 && subexp_idx == dfa->nodes[node].opr.idx)
1856 ops_node = node;
1857 else if (type == OP_CLOSE_SUBEXP
1858 && subexp_idx == dfa->nodes[node].opr.idx)
1859 cls_node = node;
1860 }
1861
1862 /* Check the limitation of the open subexpression. */
1863 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1864 if (ops_node >= 0)
1865 {
1866 err = sub_epsilon_src_nodes(dfa, ops_node, dest_nodes,
1867 candidates);
1868 if (BE (err != REG_NOERROR, 0))
1869 return err;
1870 }
1871 /* Check the limitation of the close subexpression. */
1872 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1873 {
1874 int node = dest_nodes->elems[node_idx];
1875 if (!re_node_set_contains (dfa->inveclosures + node, cls_node)
1876 && !re_node_set_contains (dfa->eclosures + node, cls_node))
1877 {
1878 /* It is against this limitation.
1879 Remove it form the current sifted state. */
1880 err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1881 candidates);
1882 if (BE (err != REG_NOERROR, 0))
1883 return err;
1884 --node_idx;
1885 }
1886 }
1887 }
1888 else /* (ent->subexp_to != str_idx) */
1889 {
1890 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1891 {
1892 int node = dest_nodes->elems[node_idx];
1893 re_token_type_t type= dfa->nodes[node].type;
1894 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1895 {
1896 if (subexp_idx != dfa->nodes[node].opr.idx)
1897 continue;
1898 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1899 || (type == OP_OPEN_SUBEXP))
1900 {
1901 /* It is against this limitation.
1902 Remove it form the current sifted state. */
1903 err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1904 candidates);
1905 if (BE (err != REG_NOERROR, 0))
1906 return err;
1907 }
1908 }
1909 }
1910 }
1911 }
1912 return REG_NOERROR;
1913 }
1914
1915 static reg_errcode_t
1916 sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes)
1917 const regex_t *preg;
1918 re_match_context_t *mctx;
1919 re_sift_context_t *sctx;
1920 int str_idx;
1921 re_node_set *dest_nodes;
1922 {
1923 reg_errcode_t err;
1924 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1925 int node_idx, node;
1926 re_sift_context_t local_sctx;
1927 const re_node_set *candidates;
1928 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1929 : &mctx->state_log[str_idx]->nodes);
1930 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1931
1932 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
1933 {
1934 int cur_bkref_idx = re_string_cur_idx (mctx->input);
1935 re_token_type_t type;
1936 node = candidates->elems[node_idx];
1937 type = dfa->nodes[node].type;
1938 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
1939 continue;
1940 /* Avoid infinite loop for the REs like "()\1+". */
1941 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
1942 continue;
1943 if (type == OP_BACK_REF)
1944 {
1945 int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
1946 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
1947 {
1948 int disabled_idx, subexp_len, to_idx, dst_node;
1949 struct re_backref_cache_entry *entry;
1950 entry = mctx->bkref_ents + enabled_idx;
1951 if (entry->str_idx > str_idx)
1952 break;
1953 if (entry->node != node)
1954 continue;
1955 subexp_len = entry->subexp_to - entry->subexp_from;
1956 to_idx = str_idx + subexp_len;
1957 dst_node = (subexp_len ? dfa->nexts[node]
1958 : dfa->edests[node].elems[0]);
1959
1960 if (to_idx > sctx->last_str_idx
1961 || sctx->sifted_states[to_idx] == NULL
1962 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
1963 dst_node)
1964 || check_dst_limits (dfa, &sctx->limits, mctx, node,
1965 str_idx, dst_node, to_idx))
1966 continue;
1967 {
1968 re_dfastate_t *cur_state;
1969 entry->flag = 0;
1970 for (disabled_idx = enabled_idx + 1;
1971 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
1972 {
1973 struct re_backref_cache_entry *entry2;
1974 entry2 = mctx->bkref_ents + disabled_idx;
1975 if (entry2->str_idx > str_idx)
1976 break;
1977 entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
1978 }
1979
1980 if (local_sctx.sifted_states == NULL)
1981 {
1982 local_sctx = *sctx;
1983 err = re_node_set_init_copy (&local_sctx.limits,
1984 &sctx->limits);
1985 if (BE (err != REG_NOERROR, 0))
1986 goto free_return;
1987 }
1988 local_sctx.last_node = node;
1989 local_sctx.last_str_idx = str_idx;
1990 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
1991 if (BE (err < 0, 0))
1992 {
1993 err = REG_ESPACE;
1994 goto free_return;
1995 }
1996 cur_state = local_sctx.sifted_states[str_idx];
1997 err = sift_states_backward (preg, mctx, &local_sctx);
1998 if (BE (err != REG_NOERROR, 0))
1999 goto free_return;
2000 if (sctx->limited_states != NULL)
2001 {
2002 err = merge_state_array (dfa, sctx->limited_states,
2003 local_sctx.sifted_states,
2004 str_idx + 1);
2005 if (BE (err != REG_NOERROR, 0))
2006 goto free_return;
2007 }
2008 local_sctx.sifted_states[str_idx] = cur_state;
2009 re_node_set_remove (&local_sctx.limits, enabled_idx);
2010 /* We must not use the variable entry here, since
2011 mctx->bkref_ents might be realloced. */
2012 mctx->bkref_ents[enabled_idx].flag = 1;
2013 }
2014 }
2015 enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2016 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2017 {
2018 struct re_backref_cache_entry *entry;
2019 entry = mctx->bkref_ents + enabled_idx;
2020 if (entry->str_idx > str_idx)
2021 break;
2022 if (entry->node == node)
2023 entry->flag = 0;
2024 }
2025 }
2026 }
2027 err = REG_NOERROR;
2028 free_return:
2029 if (local_sctx.sifted_states != NULL)
2030 {
2031 re_node_set_free (&local_sctx.limits);
2032 }
2033
2034 return err;
2035 }
2036
2037
2038 #ifdef RE_ENABLE_I18N
2039 static int
2040 sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx)
2041 const regex_t *preg;
2042 const re_match_context_t *mctx;
2043 re_sift_context_t *sctx;
2044 int node_idx, str_idx, max_str_idx;
2045 {
2046 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2047 int naccepted;
2048 /* Check the node can accept `multi byte'. */
2049 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
2050 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2051 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2052 dfa->nexts[node_idx]))
2053 /* The node can't accept the `multi byte', or the
2054 destination was already throwed away, then the node
2055 could't accept the current input `multi byte'. */
2056 naccepted = 0;
2057 /* Otherwise, it is sure that the node could accept
2058 `naccepted' bytes input. */
2059 return naccepted;
2060 }
2061 #endif /* RE_ENABLE_I18N */
2062
2063 \f
2064 /* Functions for state transition. */
2065
2066 /* Return the next state to which the current state STATE will transit by
2067 accepting the current input byte, and update STATE_LOG if necessary.
2068 If STATE can accept a multibyte char/collating element/back reference
2069 update the destination of STATE_LOG. */
2070
2071 static re_dfastate_t *
2072 transit_state (err, preg, mctx, state, fl_search)
2073 reg_errcode_t *err;
2074 const regex_t *preg;
2075 re_match_context_t *mctx;
2076 re_dfastate_t *state;
2077 int fl_search;
2078 {
2079 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2080 re_dfastate_t **trtable, *next_state;
2081 unsigned char ch;
2082 int cur_idx;
2083
2084 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
2085 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
2086 && mctx->input->valid_len < mctx->input->len))
2087 {
2088 *err = extend_buffers (mctx);
2089 if (BE (*err != REG_NOERROR, 0))
2090 return NULL;
2091 }
2092
2093 *err = REG_NOERROR;
2094 if (state == NULL)
2095 {
2096 next_state = state;
2097 re_string_skip_bytes (mctx->input, 1);
2098 }
2099 else
2100 {
2101 #ifdef RE_ENABLE_I18N
2102 /* If the current state can accept multibyte. */
2103 if (state->accept_mb)
2104 {
2105 *err = transit_state_mb (preg, state, mctx);
2106 if (BE (*err != REG_NOERROR, 0))
2107 return NULL;
2108 }
2109 #endif /* RE_ENABLE_I18N */
2110
2111 /* Then decide the next state with the single byte. */
2112 if (1)
2113 {
2114 /* Use transition table */
2115 ch = re_string_fetch_byte (mctx->input);
2116 trtable = fl_search ? state->trtable_search : state->trtable;
2117 if (trtable == NULL)
2118 {
2119 trtable = build_trtable (preg, state, fl_search);
2120 if (fl_search)
2121 state->trtable_search = trtable;
2122 else
2123 state->trtable = trtable;
2124 }
2125 next_state = trtable[ch];
2126 }
2127 else
2128 {
2129 /* don't use transition table */
2130 next_state = transit_state_sb (err, preg, state, fl_search, mctx);
2131 if (BE (next_state == NULL && err != REG_NOERROR, 0))
2132 return NULL;
2133 }
2134 }
2135
2136 cur_idx = re_string_cur_idx (mctx->input);
2137 /* Update the state_log if we need. */
2138 if (mctx->state_log != NULL)
2139 {
2140 if (cur_idx > mctx->state_log_top)
2141 {
2142 mctx->state_log[cur_idx] = next_state;
2143 mctx->state_log_top = cur_idx;
2144 }
2145 else if (mctx->state_log[cur_idx] == 0)
2146 {
2147 mctx->state_log[cur_idx] = next_state;
2148 }
2149 else
2150 {
2151 re_dfastate_t *pstate;
2152 unsigned int context;
2153 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2154 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2155 the destination of a multibyte char/collating element/
2156 back reference. Then the next state is the union set of
2157 these destinations and the results of the transition table. */
2158 pstate = mctx->state_log[cur_idx];
2159 log_nodes = pstate->entrance_nodes;
2160 if (next_state != NULL)
2161 {
2162 table_nodes = next_state->entrance_nodes;
2163 *err = re_node_set_init_union (&next_nodes, table_nodes,
2164 log_nodes);
2165 if (BE (*err != REG_NOERROR, 0))
2166 return NULL;
2167 }
2168 else
2169 next_nodes = *log_nodes;
2170 /* Note: We already add the nodes of the initial state,
2171 then we don't need to add them here. */
2172
2173 context = re_string_context_at (mctx->input,
2174 re_string_cur_idx (mctx->input) - 1,
2175 mctx->eflags, preg->newline_anchor);
2176 next_state = mctx->state_log[cur_idx]
2177 = re_acquire_state_context (err, dfa, &next_nodes, context);
2178 /* We don't need to check errors here, since the return value of
2179 this function is next_state and ERR is already set. */
2180
2181 if (table_nodes != NULL)
2182 re_node_set_free (&next_nodes);
2183 }
2184 }
2185
2186 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2187 later. We must check them here, since the back references in the
2188 next state might use them. */
2189 if (dfa->nbackref && next_state/* && fl_process_bkref */)
2190 {
2191 *err = check_subexp_matching_top (dfa, mctx, &next_state->nodes,
2192 cur_idx);
2193 if (BE (*err != REG_NOERROR, 0))
2194 return NULL;
2195 }
2196
2197 /* If the next state has back references. */
2198 if (next_state != NULL && next_state->has_backref)
2199 {
2200 *err = transit_state_bkref (preg, &next_state->nodes, mctx);
2201 if (BE (*err != REG_NOERROR, 0))
2202 return NULL;
2203 next_state = mctx->state_log[cur_idx];
2204 }
2205 return next_state;
2206 }
2207
2208 /* Helper functions for transit_state. */
2209
2210 /* From the node set CUR_NODES, pick up the nodes whose types are
2211 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2212 expression. And register them to use them later for evaluating the
2213 correspoding back references. */
2214
2215 static reg_errcode_t
2216 check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx)
2217 re_dfa_t *dfa;
2218 re_match_context_t *mctx;
2219 re_node_set *cur_nodes;
2220 int str_idx;
2221 {
2222 int node_idx;
2223 reg_errcode_t err;
2224
2225 /* TODO: This isn't efficient.
2226 Because there might be more than one nodes whose types are
2227 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2228 nodes.
2229 E.g. RE: (a){2} */
2230 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2231 {
2232 int node = cur_nodes->elems[node_idx];
2233 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2234 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2235 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2236 {
2237 err = match_ctx_add_subtop (mctx, node, str_idx);
2238 if (BE (err != REG_NOERROR, 0))
2239 return err;
2240 }
2241 }
2242 return REG_NOERROR;
2243 }
2244
2245 /* Return the next state to which the current state STATE will transit by
2246 accepting the current input byte. */
2247
2248 static re_dfastate_t *
2249 transit_state_sb (err, preg, state, fl_search, mctx)
2250 reg_errcode_t *err;
2251 const regex_t *preg;
2252 re_dfastate_t *state;
2253 int fl_search;
2254 re_match_context_t *mctx;
2255 {
2256 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2257 re_node_set next_nodes;
2258 re_dfastate_t *next_state;
2259 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
2260 unsigned int context;
2261
2262 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2263 if (BE (*err != REG_NOERROR, 0))
2264 return NULL;
2265 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2266 {
2267 int cur_node = state->nodes.elems[node_cnt];
2268 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
2269 {
2270 *err = re_node_set_merge (&next_nodes,
2271 dfa->eclosures + dfa->nexts[cur_node]);
2272 if (BE (*err != REG_NOERROR, 0))
2273 {
2274 re_node_set_free (&next_nodes);
2275 return NULL;
2276 }
2277 }
2278 }
2279 if (fl_search)
2280 {
2281 #ifdef RE_ENABLE_I18N
2282 int not_initial = 0;
2283 if (dfa->mb_cur_max > 1)
2284 for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
2285 if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
2286 {
2287 not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
2288 break;
2289 }
2290 if (!not_initial)
2291 #endif
2292 {
2293 *err = re_node_set_merge (&next_nodes,
2294 dfa->init_state->entrance_nodes);
2295 if (BE (*err != REG_NOERROR, 0))
2296 {
2297 re_node_set_free (&next_nodes);
2298 return NULL;
2299 }
2300 }
2301 }
2302 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
2303 preg->newline_anchor);
2304 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2305 /* We don't need to check errors here, since the return value of
2306 this function is next_state and ERR is already set. */
2307
2308 re_node_set_free (&next_nodes);
2309 re_string_skip_bytes (mctx->input, 1);
2310 return next_state;
2311 }
2312
2313 #ifdef RE_ENABLE_I18N
2314 static reg_errcode_t
2315 transit_state_mb (preg, pstate, mctx)
2316 const regex_t *preg;
2317 re_dfastate_t *pstate;
2318 re_match_context_t *mctx;
2319 {
2320 reg_errcode_t err;
2321 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2322 int i;
2323
2324 for (i = 0; i < pstate->nodes.nelem; ++i)
2325 {
2326 re_node_set dest_nodes, *new_nodes;
2327 int cur_node_idx = pstate->nodes.elems[i];
2328 int naccepted = 0, dest_idx;
2329 unsigned int context;
2330 re_dfastate_t *dest_state;
2331
2332 if (dfa->nodes[cur_node_idx].constraint)
2333 {
2334 context = re_string_context_at (mctx->input,
2335 re_string_cur_idx (mctx->input),
2336 mctx->eflags, preg->newline_anchor);
2337 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2338 context))
2339 continue;
2340 }
2341
2342 /* How many bytes the node can accept? */
2343 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2344 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
2345 re_string_cur_idx (mctx->input));
2346 if (naccepted == 0)
2347 continue;
2348
2349 /* The node can accepts `naccepted' bytes. */
2350 dest_idx = re_string_cur_idx (mctx->input) + naccepted;
2351 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2352 : mctx->max_mb_elem_len);
2353 err = clean_state_log_if_need (mctx, dest_idx);
2354 if (BE (err != REG_NOERROR, 0))
2355 return err;
2356 #ifdef DEBUG
2357 assert (dfa->nexts[cur_node_idx] != -1);
2358 #endif
2359 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2360 then we use pstate->nodes.elems[i] instead. */
2361 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2362
2363 dest_state = mctx->state_log[dest_idx];
2364 if (dest_state == NULL)
2365 dest_nodes = *new_nodes;
2366 else
2367 {
2368 err = re_node_set_init_union (&dest_nodes,
2369 dest_state->entrance_nodes, new_nodes);
2370 if (BE (err != REG_NOERROR, 0))
2371 return err;
2372 }
2373 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
2374 preg->newline_anchor);
2375 mctx->state_log[dest_idx]
2376 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2377 if (dest_state != NULL)
2378 re_node_set_free (&dest_nodes);
2379 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2380 return err;
2381 }
2382 return REG_NOERROR;
2383 }
2384 #endif /* RE_ENABLE_I18N */
2385
2386 static reg_errcode_t
2387 transit_state_bkref (preg, nodes, mctx)
2388 const regex_t *preg;
2389 re_node_set *nodes;
2390 re_match_context_t *mctx;
2391 {
2392 reg_errcode_t err;
2393 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2394 int i;
2395 int cur_str_idx = re_string_cur_idx (mctx->input);
2396
2397 for (i = 0; i < nodes->nelem; ++i)
2398 {
2399 int dest_str_idx, prev_nelem, bkc_idx;
2400 int node_idx = nodes->elems[i];
2401 unsigned int context;
2402 re_token_t *node = dfa->nodes + node_idx;
2403 re_node_set *new_dest_nodes;
2404
2405 /* Check whether `node' is a backreference or not. */
2406 if (node->type != OP_BACK_REF)
2407 continue;
2408
2409 if (node->constraint)
2410 {
2411 context = re_string_context_at (mctx->input, cur_str_idx,
2412 mctx->eflags, preg->newline_anchor);
2413 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2414 continue;
2415 }
2416
2417 /* `node' is a backreference.
2418 Check the substring which the substring matched. */
2419 bkc_idx = mctx->nbkref_ents;
2420 err = get_subexp (preg, mctx, node_idx, cur_str_idx);
2421 if (BE (err != REG_NOERROR, 0))
2422 goto free_return;
2423
2424 /* And add the epsilon closures (which is `new_dest_nodes') of
2425 the backreference to appropriate state_log. */
2426 #ifdef DEBUG
2427 assert (dfa->nexts[node_idx] != -1);
2428 #endif
2429 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2430 {
2431 int subexp_len;
2432 re_dfastate_t *dest_state;
2433 struct re_backref_cache_entry *bkref_ent;
2434 bkref_ent = mctx->bkref_ents + bkc_idx;
2435 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2436 continue;
2437 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2438 new_dest_nodes = (subexp_len == 0
2439 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2440 : dfa->eclosures + dfa->nexts[node_idx]);
2441 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2442 - bkref_ent->subexp_from);
2443 context = re_string_context_at (mctx->input, dest_str_idx - 1,
2444 mctx->eflags, preg->newline_anchor);
2445 dest_state = mctx->state_log[dest_str_idx];
2446 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2447 : mctx->state_log[cur_str_idx]->nodes.nelem);
2448 /* Add `new_dest_node' to state_log. */
2449 if (dest_state == NULL)
2450 {
2451 mctx->state_log[dest_str_idx]
2452 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2453 context);
2454 if (BE (mctx->state_log[dest_str_idx] == NULL
2455 && err != REG_NOERROR, 0))
2456 goto free_return;
2457 }
2458 else
2459 {
2460 re_node_set dest_nodes;
2461 err = re_node_set_init_union (&dest_nodes,
2462 dest_state->entrance_nodes,
2463 new_dest_nodes);
2464 if (BE (err != REG_NOERROR, 0))
2465 {
2466 re_node_set_free (&dest_nodes);
2467 goto free_return;
2468 }
2469 mctx->state_log[dest_str_idx]
2470 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2471 re_node_set_free (&dest_nodes);
2472 if (BE (mctx->state_log[dest_str_idx] == NULL
2473 && err != REG_NOERROR, 0))
2474 goto free_return;
2475 }
2476 /* We need to check recursively if the backreference can epsilon
2477 transit. */
2478 if (subexp_len == 0
2479 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2480 {
2481 err = check_subexp_matching_top (dfa, mctx, new_dest_nodes,
2482 cur_str_idx);
2483 if (BE (err != REG_NOERROR, 0))
2484 goto free_return;
2485 err = transit_state_bkref (preg, new_dest_nodes, mctx);
2486 if (BE (err != REG_NOERROR, 0))
2487 goto free_return;
2488 }
2489 }
2490 }
2491 err = REG_NOERROR;
2492 free_return:
2493 return err;
2494 }
2495
2496 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2497 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2498 Note that we might collect inappropriate candidates here.
2499 However, the cost of checking them strictly here is too high, then we
2500 delay these checking for prune_impossible_nodes(). */
2501
2502 static reg_errcode_t
2503 get_subexp (preg, mctx, bkref_node, bkref_str_idx)
2504 const regex_t *preg;
2505 re_match_context_t *mctx;
2506 int bkref_node, bkref_str_idx;
2507 {
2508 int subexp_num, sub_top_idx;
2509 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2510 char *buf = (char *) re_string_get_buffer (mctx->input);
2511 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2512 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2513 for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2514 {
2515 struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2516 if (entry->str_idx > bkref_str_idx)
2517 break;
2518 if (entry->node == bkref_node)
2519 return REG_NOERROR; /* We already checked it. */
2520 }
2521 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2522
2523 /* For each sub expression */
2524 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2525 {
2526 reg_errcode_t err;
2527 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2528 re_sub_match_last_t *sub_last;
2529 int sub_last_idx, sl_str;
2530 char *bkref_str;
2531
2532 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2533 continue; /* It isn't related. */
2534
2535 sl_str = sub_top->str_idx;
2536 bkref_str = buf + bkref_str_idx;
2537 /* At first, check the last node of sub expressions we already
2538 evaluated. */
2539 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2540 {
2541 int sl_str_diff;
2542 sub_last = sub_top->lasts[sub_last_idx];
2543 sl_str_diff = sub_last->str_idx - sl_str;
2544 /* The matched string by the sub expression match with the substring
2545 at the back reference? */
2546 if (sl_str_diff > 0
2547 && memcmp (bkref_str, buf + sl_str, sl_str_diff) != 0)
2548 break; /* We don't need to search this sub expression any more. */
2549 bkref_str += sl_str_diff;
2550 sl_str += sl_str_diff;
2551 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2552 bkref_str_idx);
2553
2554 /* Reload buf and bkref_str, since the preceding call might
2555 have reallocated the buffer. */
2556 buf = (char *) re_string_get_buffer (mctx->input);
2557 bkref_str = buf + bkref_str_idx;
2558
2559 if (err == REG_NOMATCH)
2560 continue;
2561 if (BE (err != REG_NOERROR, 0))
2562 return err;
2563 }
2564 if (sub_last_idx < sub_top->nlasts)
2565 continue;
2566 if (sub_last_idx > 0)
2567 ++sl_str;
2568 /* Then, search for the other last nodes of the sub expression. */
2569 for (; sl_str <= bkref_str_idx; ++sl_str)
2570 {
2571 int cls_node, sl_str_off;
2572 re_node_set *nodes;
2573 sl_str_off = sl_str - sub_top->str_idx;
2574 /* The matched string by the sub expression match with the substring
2575 at the back reference? */
2576 if (sl_str_off > 0 && *bkref_str++ != buf[sl_str - 1])
2577 break; /* We don't need to search this sub expression any more. */
2578 if (mctx->state_log[sl_str] == NULL)
2579 continue;
2580 /* Does this state have a ')' of the sub expression? */
2581 nodes = &mctx->state_log[sl_str]->nodes;
2582 cls_node = find_subexp_node (dfa, nodes, subexp_num, 0);
2583 if (cls_node == -1)
2584 continue; /* No. */
2585 if (sub_top->path == NULL)
2586 {
2587 sub_top->path = calloc (sizeof (state_array_t),
2588 sl_str - sub_top->str_idx + 1);
2589 if (sub_top->path == NULL)
2590 return REG_ESPACE;
2591 }
2592 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2593 in the current context? */
2594 err = check_arrival (preg, mctx, sub_top->path, sub_top->node,
2595 sub_top->str_idx, cls_node, sl_str, 0);
2596 if (err == REG_NOMATCH)
2597 continue;
2598 if (BE (err != REG_NOERROR, 0))
2599 return err;
2600 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2601 if (BE (sub_last == NULL, 0))
2602 return REG_ESPACE;
2603 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2604 bkref_str_idx);
2605 if (err == REG_NOMATCH)
2606 continue;
2607 }
2608 }
2609 return REG_NOERROR;
2610 }
2611
2612 /* Helper functions for get_subexp(). */
2613
2614 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2615 If it can arrive, register the sub expression expressed with SUB_TOP
2616 and SUB_LAST. */
2617
2618 static reg_errcode_t
2619 get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, bkref_str)
2620 const regex_t *preg;
2621 re_match_context_t *mctx;
2622 re_sub_match_top_t *sub_top;
2623 re_sub_match_last_t *sub_last;
2624 int bkref_node, bkref_str;
2625 {
2626 reg_errcode_t err;
2627 int to_idx;
2628 /* Can the subexpression arrive the back reference? */
2629 err = check_arrival (preg, mctx, &sub_last->path, sub_last->node,
2630 sub_last->str_idx, bkref_node, bkref_str, 1);
2631 if (err != REG_NOERROR)
2632 return err;
2633 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2634 sub_last->str_idx);
2635 if (BE (err != REG_NOERROR, 0))
2636 return err;
2637 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2638 clean_state_log_if_need (mctx, to_idx);
2639 return REG_NOERROR;
2640 }
2641
2642 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2643 Search '(' if FL_OPEN, or search ')' otherwise.
2644 TODO: This function isn't efficient...
2645 Because there might be more than one nodes whose types are
2646 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2647 nodes.
2648 E.g. RE: (a){2} */
2649
2650 static int
2651 find_subexp_node (dfa, nodes, subexp_idx, fl_open)
2652 re_dfa_t *dfa;
2653 re_node_set *nodes;
2654 int subexp_idx, fl_open;
2655 {
2656 int cls_idx;
2657 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2658 {
2659 int cls_node = nodes->elems[cls_idx];
2660 re_token_t *node = dfa->nodes + cls_node;
2661 if (((fl_open && node->type == OP_OPEN_SUBEXP)
2662 || (!fl_open && node->type == OP_CLOSE_SUBEXP))
2663 && node->opr.idx == subexp_idx)
2664 return cls_node;
2665 }
2666 return -1;
2667 }
2668
2669 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2670 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2671 heavily reused.
2672 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2673
2674 static reg_errcode_t
2675 check_arrival (preg, mctx, path, top_node, top_str, last_node, last_str,
2676 fl_open)
2677 const regex_t *preg;
2678 re_match_context_t *mctx;
2679 state_array_t *path;
2680 int top_node, top_str, last_node, last_str, fl_open;
2681 {
2682 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2683 reg_errcode_t err;
2684 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2685 re_dfastate_t *cur_state = NULL;
2686 re_node_set *cur_nodes, next_nodes;
2687 re_dfastate_t **backup_state_log;
2688 unsigned int context;
2689
2690 subexp_num = dfa->nodes[top_node].opr.idx;
2691 /* Extend the buffer if we need. */
2692 if (path->alloc < last_str + mctx->max_mb_elem_len + 1)
2693 {
2694 re_dfastate_t **new_array;
2695 int old_alloc = path->alloc;
2696 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2697 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2698 if (new_array == NULL)
2699 return REG_ESPACE;
2700 path->array = new_array;
2701 memset (new_array + old_alloc, '\0',
2702 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2703 }
2704
2705 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2706
2707 /* Temporary modify MCTX. */
2708 backup_state_log = mctx->state_log;
2709 backup_cur_idx = mctx->input->cur_idx;
2710 mctx->state_log = path->array;
2711 mctx->input->cur_idx = str_idx;
2712
2713 /* Setup initial node set. */
2714 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2715 preg->newline_anchor);
2716 if (str_idx == top_str)
2717 {
2718 err = re_node_set_init_1 (&next_nodes, top_node);
2719 if (BE (err != REG_NOERROR, 0))
2720 return err;
2721 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, fl_open);
2722 if (BE (err != REG_NOERROR, 0))
2723 {
2724 re_node_set_free (&next_nodes);
2725 return err;
2726 }
2727 }
2728 else
2729 {
2730 cur_state = mctx->state_log[str_idx];
2731 if (cur_state && cur_state->has_backref)
2732 {
2733 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2734 if (BE ( err != REG_NOERROR, 0))
2735 return err;
2736 }
2737 else
2738 re_node_set_init_empty (&next_nodes);
2739 }
2740 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2741 {
2742 if (next_nodes.nelem)
2743 {
2744 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
2745 subexp_num, fl_open);
2746 if (BE ( err != REG_NOERROR, 0))
2747 {
2748 re_node_set_free (&next_nodes);
2749 return err;
2750 }
2751 }
2752 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2753 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2754 {
2755 re_node_set_free (&next_nodes);
2756 return err;
2757 }
2758 mctx->state_log[str_idx] = cur_state;
2759 }
2760
2761 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2762 {
2763 re_node_set_empty (&next_nodes);
2764 if (mctx->state_log[str_idx + 1])
2765 {
2766 err = re_node_set_merge (&next_nodes,
2767 &mctx->state_log[str_idx + 1]->nodes);
2768 if (BE (err != REG_NOERROR, 0))
2769 {
2770 re_node_set_free (&next_nodes);
2771 return err;
2772 }
2773 }
2774 if (cur_state)
2775 {
2776 err = check_arrival_add_next_nodes(preg, dfa, mctx, str_idx,
2777 &cur_state->nodes, &next_nodes);
2778 if (BE (err != REG_NOERROR, 0))
2779 {
2780 re_node_set_free (&next_nodes);
2781 return err;
2782 }
2783 }
2784 ++str_idx;
2785 if (next_nodes.nelem)
2786 {
2787 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num,
2788 fl_open);
2789 if (BE (err != REG_NOERROR, 0))
2790 {
2791 re_node_set_free (&next_nodes);
2792 return err;
2793 }
2794 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
2795 subexp_num, fl_open);
2796 if (BE ( err != REG_NOERROR, 0))
2797 {
2798 re_node_set_free (&next_nodes);
2799 return err;
2800 }
2801 }
2802 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2803 preg->newline_anchor);
2804 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2805 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2806 {
2807 re_node_set_free (&next_nodes);
2808 return err;
2809 }
2810 mctx->state_log[str_idx] = cur_state;
2811 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2812 }
2813 re_node_set_free (&next_nodes);
2814 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2815 : &mctx->state_log[last_str]->nodes);
2816 path->next_idx = str_idx;
2817
2818 /* Fix MCTX. */
2819 mctx->state_log = backup_state_log;
2820 mctx->input->cur_idx = backup_cur_idx;
2821
2822 if (cur_nodes == NULL)
2823 return REG_NOMATCH;
2824 /* Then check the current node set has the node LAST_NODE. */
2825 return (re_node_set_contains (cur_nodes, last_node)
2826 || re_node_set_contains (cur_nodes, last_node) ? REG_NOERROR
2827 : REG_NOMATCH);
2828 }
2829
2830 /* Helper functions for check_arrival. */
2831
2832 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2833 to NEXT_NODES.
2834 TODO: This function is similar to the functions transit_state*(),
2835 however this function has many additional works.
2836 Can't we unify them? */
2837
2838 static reg_errcode_t
2839 check_arrival_add_next_nodes (preg, dfa, mctx, str_idx, cur_nodes, next_nodes)
2840 const regex_t *preg;
2841 re_dfa_t *dfa;
2842 re_match_context_t *mctx;
2843 int str_idx;
2844 re_node_set *cur_nodes, *next_nodes;
2845 {
2846 int cur_idx;
2847 reg_errcode_t err;
2848 re_node_set union_set;
2849 re_node_set_init_empty (&union_set);
2850 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2851 {
2852 int naccepted = 0;
2853 int cur_node = cur_nodes->elems[cur_idx];
2854 re_token_type_t type = dfa->nodes[cur_node].type;
2855 if (IS_EPSILON_NODE(type))
2856 continue;
2857 #ifdef RE_ENABLE_I18N
2858 /* If the node may accept `multi byte'. */
2859 if (ACCEPT_MB_NODE (type))
2860 {
2861 naccepted = check_node_accept_bytes (preg, cur_node, mctx->input,
2862 str_idx);
2863 if (naccepted > 1)
2864 {
2865 re_dfastate_t *dest_state;
2866 int next_node = dfa->nexts[cur_node];
2867 int next_idx = str_idx + naccepted;
2868 dest_state = mctx->state_log[next_idx];
2869 re_node_set_empty (&union_set);
2870 if (dest_state)
2871 {
2872 err = re_node_set_merge (&union_set, &dest_state->nodes);
2873 if (BE (err != REG_NOERROR, 0))
2874 {
2875 re_node_set_free (&union_set);
2876 return err;
2877 }
2878 err = re_node_set_insert (&union_set, next_node);
2879 if (BE (err < 0, 0))
2880 {
2881 re_node_set_free (&union_set);
2882 return REG_ESPACE;
2883 }
2884 }
2885 else
2886 {
2887 err = re_node_set_insert (&union_set, next_node);
2888 if (BE (err < 0, 0))
2889 {
2890 re_node_set_free (&union_set);
2891 return REG_ESPACE;
2892 }
2893 }
2894 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2895 &union_set);
2896 if (BE (mctx->state_log[next_idx] == NULL
2897 && err != REG_NOERROR, 0))
2898 {
2899 re_node_set_free (&union_set);
2900 return err;
2901 }
2902 }
2903 }
2904 #endif /* RE_ENABLE_I18N */
2905 if (naccepted
2906 || check_node_accept (preg, dfa->nodes + cur_node, mctx,
2907 str_idx))
2908 {
2909 err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
2910 if (BE (err < 0, 0))
2911 {
2912 re_node_set_free (&union_set);
2913 return REG_ESPACE;
2914 }
2915 }
2916 }
2917 re_node_set_free (&union_set);
2918 return REG_NOERROR;
2919 }
2920
2921 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
2922 CUR_NODES, however exclude the nodes which are:
2923 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
2924 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
2925 */
2926
2927 static reg_errcode_t
2928 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, fl_open)
2929 re_dfa_t *dfa;
2930 re_node_set *cur_nodes;
2931 int ex_subexp, fl_open;
2932 {
2933 reg_errcode_t err;
2934 int idx, outside_node;
2935 re_node_set new_nodes;
2936 #ifdef DEBUG
2937 assert (cur_nodes->nelem);
2938 #endif
2939 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
2940 if (BE (err != REG_NOERROR, 0))
2941 return err;
2942 /* Create a new node set NEW_NODES with the nodes which are epsilon
2943 closures of the node in CUR_NODES. */
2944
2945 for (idx = 0; idx < cur_nodes->nelem; ++idx)
2946 {
2947 int cur_node = cur_nodes->elems[idx];
2948 re_node_set *eclosure = dfa->eclosures + cur_node;
2949 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, fl_open);
2950 if (outside_node == -1)
2951 {
2952 /* There are no problematic nodes, just merge them. */
2953 err = re_node_set_merge (&new_nodes, eclosure);
2954 if (BE (err != REG_NOERROR, 0))
2955 {
2956 re_node_set_free (&new_nodes);
2957 return err;
2958 }
2959 }
2960 else
2961 {
2962 /* There are problematic nodes, re-calculate incrementally. */
2963 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
2964 ex_subexp, fl_open);
2965 if (BE (err != REG_NOERROR, 0))
2966 {
2967 re_node_set_free (&new_nodes);
2968 return err;
2969 }
2970 }
2971 }
2972 re_node_set_free (cur_nodes);
2973 *cur_nodes = new_nodes;
2974 return REG_NOERROR;
2975 }
2976
2977 /* Helper function for check_arrival_expand_ecl.
2978 Check incrementally the epsilon closure of TARGET, and if it isn't
2979 problematic append it to DST_NODES. */
2980
2981 static reg_errcode_t
2982 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, fl_open)
2983 re_dfa_t *dfa;
2984 int target, ex_subexp, fl_open;
2985 re_node_set *dst_nodes;
2986 {
2987 int cur_node, type;
2988 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
2989 {
2990 int err;
2991 type = dfa->nodes[cur_node].type;
2992
2993 if (((type == OP_OPEN_SUBEXP && fl_open)
2994 || (type == OP_CLOSE_SUBEXP && !fl_open))
2995 && dfa->nodes[cur_node].opr.idx == ex_subexp)
2996 {
2997 if (!fl_open)
2998 {
2999 err = re_node_set_insert (dst_nodes, cur_node);
3000 if (BE (err == -1, 0))
3001 return REG_ESPACE;
3002 }
3003 break;
3004 }
3005 err = re_node_set_insert (dst_nodes, cur_node);
3006 if (BE (err == -1, 0))
3007 return REG_ESPACE;
3008 if (dfa->edests[cur_node].nelem == 0)
3009 break;
3010 if (dfa->edests[cur_node].nelem == 2)
3011 {
3012 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3013 dfa->edests[cur_node].elems[1],
3014 ex_subexp, fl_open);
3015 if (BE (err != REG_NOERROR, 0))
3016 return err;
3017 }
3018 cur_node = dfa->edests[cur_node].elems[0];
3019 }
3020 return REG_NOERROR;
3021 }
3022
3023
3024 /* For all the back references in the current state, calculate the
3025 destination of the back references by the appropriate entry
3026 in MCTX->BKREF_ENTS. */
3027
3028 static reg_errcode_t
3029 expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num,
3030 fl_open)
3031 const regex_t *preg;
3032 re_match_context_t *mctx;
3033 int cur_str, last_str, subexp_num, fl_open;
3034 re_node_set *cur_nodes;
3035 {
3036 reg_errcode_t err;
3037 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3038 int cache_idx, cache_idx_start;
3039 /* The current state. */
3040
3041 cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3042 for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3043 {
3044 int to_idx, next_node;
3045 struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3046 if (ent->str_idx > cur_str)
3047 break;
3048 /* Is this entry ENT is appropriate? */
3049 if (!re_node_set_contains (cur_nodes, ent->node))
3050 continue; /* No. */
3051
3052 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3053 /* Calculate the destination of the back reference, and append it
3054 to MCTX->STATE_LOG. */
3055 if (to_idx == cur_str)
3056 {
3057 /* The backreference did epsilon transit, we must re-check all the
3058 node in the current state. */
3059 re_node_set new_dests;
3060 reg_errcode_t err2, err3;
3061 next_node = dfa->edests[ent->node].elems[0];
3062 if (re_node_set_contains (cur_nodes, next_node))
3063 continue;
3064 err = re_node_set_init_1 (&new_dests, next_node);
3065 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num,
3066 fl_open);
3067 err3 = re_node_set_merge (cur_nodes, &new_dests);
3068 re_node_set_free (&new_dests);
3069 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3070 || err3 != REG_NOERROR, 0))
3071 {
3072 err = (err != REG_NOERROR ? err
3073 : (err2 != REG_NOERROR ? err2 : err3));
3074 return err;
3075 }
3076 /* TODO: It is still inefficient... */
3077 cache_idx = cache_idx_start - 1;
3078 continue;
3079 }
3080 else
3081 {
3082 re_node_set union_set;
3083 next_node = dfa->nexts[ent->node];
3084 if (mctx->state_log[to_idx])
3085 {
3086 int ret;
3087 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3088 next_node))
3089 continue;
3090 err = re_node_set_init_copy (&union_set,
3091 &mctx->state_log[to_idx]->nodes);
3092 ret = re_node_set_insert (&union_set, next_node);
3093 if (BE (err != REG_NOERROR || ret < 0, 0))
3094 {
3095 re_node_set_free (&union_set);
3096 err = err != REG_NOERROR ? err : REG_ESPACE;
3097 return err;
3098 }
3099 }
3100 else
3101 {
3102 err = re_node_set_init_1 (&union_set, next_node);
3103 if (BE (err != REG_NOERROR, 0))
3104 return err;
3105 }
3106 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3107 re_node_set_free (&union_set);
3108 if (BE (mctx->state_log[to_idx] == NULL
3109 && err != REG_NOERROR, 0))
3110 return err;
3111 }
3112 }
3113 return REG_NOERROR;
3114 }
3115
3116 /* Build transition table for the state.
3117 Return the new table if succeeded, otherwise return NULL. */
3118
3119 static re_dfastate_t **
3120 build_trtable (preg, state, fl_search)
3121 const regex_t *preg;
3122 const re_dfastate_t *state;
3123 int fl_search;
3124 {
3125 reg_errcode_t err;
3126 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3127 int i, j, k, ch;
3128 int dests_node_malloced = 0, dest_states_malloced = 0;
3129 int ndests; /* Number of the destination states from `state'. */
3130 re_dfastate_t **trtable;
3131 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3132 re_node_set follows, *dests_node;
3133 bitset *dests_ch;
3134 bitset acceptable;
3135
3136 /* We build DFA states which corresponds to the destination nodes
3137 from `state'. `dests_node[i]' represents the nodes which i-th
3138 destination state contains, and `dests_ch[i]' represents the
3139 characters which i-th destination state accepts. */
3140 #ifdef _LIBC
3141 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3142 dests_node = (re_node_set *)
3143 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3144 else
3145 #endif
3146 {
3147 dests_node = (re_node_set *)
3148 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3149 if (BE (dests_node == NULL, 0))
3150 return NULL;
3151 dests_node_malloced = 1;
3152 }
3153 dests_ch = (bitset *) (dests_node + SBC_MAX);
3154
3155 /* Initialize transiton table. */
3156 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3157 if (BE (trtable == NULL, 0))
3158 {
3159 if (dests_node_malloced)
3160 free (dests_node);
3161 return NULL;
3162 }
3163
3164 /* At first, group all nodes belonging to `state' into several
3165 destinations. */
3166 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
3167 if (BE (ndests <= 0, 0))
3168 {
3169 if (dests_node_malloced)
3170 free (dests_node);
3171 /* Return NULL in case of an error, trtable otherwise. */
3172 if (ndests == 0)
3173 return trtable;
3174 free (trtable);
3175 return NULL;
3176 }
3177
3178 err = re_node_set_alloc (&follows, ndests + 1);
3179 if (BE (err != REG_NOERROR, 0))
3180 goto out_free;
3181
3182 #ifdef _LIBC
3183 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3184 + ndests * 3 * sizeof (re_dfastate_t *)))
3185 dest_states = (re_dfastate_t **)
3186 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3187 else
3188 #endif
3189 {
3190 dest_states = (re_dfastate_t **)
3191 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3192 if (BE (dest_states == NULL, 0))
3193 {
3194 out_free:
3195 if (dest_states_malloced)
3196 free (dest_states);
3197 re_node_set_free (&follows);
3198 for (i = 0; i < ndests; ++i)
3199 re_node_set_free (dests_node + i);
3200 free (trtable);
3201 if (dests_node_malloced)
3202 free (dests_node);
3203 return NULL;
3204 }
3205 dest_states_malloced = 1;
3206 }
3207 dest_states_word = dest_states + ndests;
3208 dest_states_nl = dest_states_word + ndests;
3209 bitset_empty (acceptable);
3210
3211 /* Then build the states for all destinations. */
3212 for (i = 0; i < ndests; ++i)
3213 {
3214 int next_node;
3215 re_node_set_empty (&follows);
3216 /* Merge the follows of this destination states. */
3217 for (j = 0; j < dests_node[i].nelem; ++j)
3218 {
3219 next_node = dfa->nexts[dests_node[i].elems[j]];
3220 if (next_node != -1)
3221 {
3222 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3223 if (BE (err != REG_NOERROR, 0))
3224 goto out_free;
3225 }
3226 }
3227 /* If search flag is set, merge the initial state. */
3228 if (fl_search)
3229 {
3230 #ifdef RE_ENABLE_I18N
3231 int not_initial = 0;
3232 for (j = 0; j < follows.nelem; ++j)
3233 if (dfa->nodes[follows.elems[j]].type == CHARACTER)
3234 {
3235 not_initial = dfa->nodes[follows.elems[j]].mb_partial;
3236 break;
3237 }
3238 if (!not_initial)
3239 #endif
3240 {
3241 err = re_node_set_merge (&follows,
3242 dfa->init_state->entrance_nodes);
3243 if (BE (err != REG_NOERROR, 0))
3244 goto out_free;
3245 }
3246 }
3247 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3248 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3249 goto out_free;
3250 /* If the new state has context constraint,
3251 build appropriate states for these contexts. */
3252 if (dest_states[i]->has_constraint)
3253 {
3254 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3255 CONTEXT_WORD);
3256 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3257 goto out_free;
3258 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3259 CONTEXT_NEWLINE);
3260 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3261 goto out_free;
3262 }
3263 else
3264 {
3265 dest_states_word[i] = dest_states[i];
3266 dest_states_nl[i] = dest_states[i];
3267 }
3268 bitset_merge (acceptable, dests_ch[i]);
3269 }
3270
3271 /* Update the transition table. */
3272 /* For all characters ch...: */
3273 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
3274 for (j = 0; j < UINT_BITS; ++j, ++ch)
3275 if ((acceptable[i] >> j) & 1)
3276 {
3277 /* The current state accepts the character ch. */
3278 if (IS_WORD_CHAR (ch))
3279 {
3280 for (k = 0; k < ndests; ++k)
3281 if ((dests_ch[k][i] >> j) & 1)
3282 {
3283 /* k-th destination accepts the word character ch. */
3284 trtable[ch] = dest_states_word[k];
3285 /* There must be only one destination which accepts
3286 character ch. See group_nodes_into_DFAstates. */
3287 break;
3288 }
3289 }
3290 else /* not WORD_CHAR */
3291 {
3292 for (k = 0; k < ndests; ++k)
3293 if ((dests_ch[k][i] >> j) & 1)
3294 {
3295 /* k-th destination accepts the non-word character ch. */
3296 trtable[ch] = dest_states[k];
3297 /* There must be only one destination which accepts
3298 character ch. See group_nodes_into_DFAstates. */
3299 break;
3300 }
3301 }
3302 }
3303 /* new line */
3304 if (bitset_contain (acceptable, NEWLINE_CHAR))
3305 {
3306 /* The current state accepts newline character. */
3307 for (k = 0; k < ndests; ++k)
3308 if (bitset_contain (dests_ch[k], NEWLINE_CHAR))
3309 {
3310 /* k-th destination accepts newline character. */
3311 trtable[NEWLINE_CHAR] = dest_states_nl[k];
3312 /* There must be only one destination which accepts
3313 newline. See group_nodes_into_DFAstates. */
3314 break;
3315 }
3316 }
3317
3318 if (dest_states_malloced)
3319 free (dest_states);
3320
3321 re_node_set_free (&follows);
3322 for (i = 0; i < ndests; ++i)
3323 re_node_set_free (dests_node + i);
3324
3325 if (dests_node_malloced)
3326 free (dests_node);
3327
3328 return trtable;
3329 }
3330
3331 /* Group all nodes belonging to STATE into several destinations.
3332 Then for all destinations, set the nodes belonging to the destination
3333 to DESTS_NODE[i] and set the characters accepted by the destination
3334 to DEST_CH[i]. This function return the number of destinations. */
3335
3336 static int
3337 group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
3338 const regex_t *preg;
3339 const re_dfastate_t *state;
3340 re_node_set *dests_node;
3341 bitset *dests_ch;
3342 {
3343 reg_errcode_t err;
3344 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3345 int i, j, k;
3346 int ndests; /* Number of the destinations from `state'. */
3347 bitset accepts; /* Characters a node can accept. */
3348 const re_node_set *cur_nodes = &state->nodes;
3349 bitset_empty (accepts);
3350 ndests = 0;
3351
3352 /* For all the nodes belonging to `state', */
3353 for (i = 0; i < cur_nodes->nelem; ++i)
3354 {
3355 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3356 re_token_type_t type = node->type;
3357 unsigned int constraint = node->constraint;
3358
3359 /* Enumerate all single byte character this node can accept. */
3360 if (type == CHARACTER)
3361 bitset_set (accepts, node->opr.c);
3362 else if (type == SIMPLE_BRACKET)
3363 {
3364 bitset_merge (accepts, node->opr.sbcset);
3365 }
3366 else if (type == OP_PERIOD)
3367 {
3368 bitset_set_all (accepts);
3369 if (!(preg->syntax & RE_DOT_NEWLINE))
3370 bitset_clear (accepts, '\n');
3371 if (preg->syntax & RE_DOT_NOT_NULL)
3372 bitset_clear (accepts, '\0');
3373 }
3374 else if (type == OP_UTF8_PERIOD)
3375 {
3376 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3377 if (!(preg->syntax & RE_DOT_NEWLINE))
3378 bitset_clear (accepts, '\n');
3379 if (preg->syntax & RE_DOT_NOT_NULL)
3380 bitset_clear (accepts, '\0');
3381 }
3382 else
3383 continue;
3384
3385 /* Check the `accepts' and sift the characters which are not
3386 match it the context. */
3387 if (constraint)
3388 {
3389 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3390 {
3391 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3392 bitset_empty (accepts);
3393 if (accepts_newline)
3394 bitset_set (accepts, NEWLINE_CHAR);
3395 else
3396 continue;
3397 }
3398 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3399 {
3400 bitset_empty (accepts);
3401 continue;
3402 }
3403 if (constraint & NEXT_WORD_CONSTRAINT)
3404 for (j = 0; j < BITSET_UINTS; ++j)
3405 accepts[j] &= dfa->word_char[j];
3406 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3407 for (j = 0; j < BITSET_UINTS; ++j)
3408 accepts[j] &= ~dfa->word_char[j];
3409 }
3410
3411 /* Then divide `accepts' into DFA states, or create a new
3412 state. */
3413 for (j = 0; j < ndests; ++j)
3414 {
3415 bitset intersec; /* Intersection sets, see below. */
3416 bitset remains;
3417 /* Flags, see below. */
3418 int has_intersec, not_subset, not_consumed;
3419
3420 /* Optimization, skip if this state doesn't accept the character. */
3421 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3422 continue;
3423
3424 /* Enumerate the intersection set of this state and `accepts'. */
3425 has_intersec = 0;
3426 for (k = 0; k < BITSET_UINTS; ++k)
3427 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3428 /* And skip if the intersection set is empty. */
3429 if (!has_intersec)
3430 continue;
3431
3432 /* Then check if this state is a subset of `accepts'. */
3433 not_subset = not_consumed = 0;
3434 for (k = 0; k < BITSET_UINTS; ++k)
3435 {
3436 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3437 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3438 }
3439
3440 /* If this state isn't a subset of `accepts', create a
3441 new group state, which has the `remains'. */
3442 if (not_subset)
3443 {
3444 bitset_copy (dests_ch[ndests], remains);
3445 bitset_copy (dests_ch[j], intersec);
3446 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3447 if (BE (err != REG_NOERROR, 0))
3448 goto error_return;
3449 ++ndests;
3450 }
3451
3452 /* Put the position in the current group. */
3453 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3454 if (BE (err < 0, 0))
3455 goto error_return;
3456
3457 /* If all characters are consumed, go to next node. */
3458 if (!not_consumed)
3459 break;
3460 }
3461 /* Some characters remain, create a new group. */
3462 if (j == ndests)
3463 {
3464 bitset_copy (dests_ch[ndests], accepts);
3465 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3466 if (BE (err != REG_NOERROR, 0))
3467 goto error_return;
3468 ++ndests;
3469 bitset_empty (accepts);
3470 }
3471 }
3472 return ndests;
3473 error_return:
3474 for (j = 0; j < ndests; ++j)
3475 re_node_set_free (dests_node + j);
3476 return -1;
3477 }
3478
3479 #ifdef RE_ENABLE_I18N
3480 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3481 Return the number of the bytes the node accepts.
3482 STR_IDX is the current index of the input string.
3483
3484 This function handles the nodes which can accept one character, or
3485 one collating element like '.', '[a-z]', opposite to the other nodes
3486 can only accept one byte. */
3487
3488 static int
3489 check_node_accept_bytes (preg, node_idx, input, str_idx)
3490 const regex_t *preg;
3491 int node_idx, str_idx;
3492 const re_string_t *input;
3493 {
3494 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3495 const re_token_t *node = dfa->nodes + node_idx;
3496 int char_len, elem_len;
3497 int i;
3498
3499 if (BE (node->type == OP_UTF8_PERIOD, 0))
3500 {
3501 unsigned char c = re_string_byte_at (input, str_idx), d;
3502 if (BE (c < 0xc2, 1))
3503 return 0;
3504
3505 if (str_idx + 2 > input->len)
3506 return 0;
3507
3508 d = re_string_byte_at (input, str_idx + 1);
3509 if (c < 0xe0)
3510 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3511 else if (c < 0xf0)
3512 {
3513 char_len = 3;
3514 if (c == 0xe0 && d < 0xa0)
3515 return 0;
3516 }
3517 else if (c < 0xf8)
3518 {
3519 char_len = 4;
3520 if (c == 0xf0 && d < 0x90)
3521 return 0;
3522 }
3523 else if (c < 0xfc)
3524 {
3525 char_len = 5;
3526 if (c == 0xf8 && d < 0x88)
3527 return 0;
3528 }
3529 else if (c < 0xfe)
3530 {
3531 char_len = 6;
3532 if (c == 0xfc && d < 0x84)
3533 return 0;
3534 }
3535 else
3536 return 0;
3537
3538 if (str_idx + char_len > input->len)
3539 return 0;
3540
3541 for (i = 1; i < char_len; ++i)
3542 {
3543 d = re_string_byte_at (input, str_idx + i);
3544 if (d < 0x80 || d > 0xbf)
3545 return 0;
3546 }
3547 return char_len;
3548 }
3549
3550 char_len = re_string_char_size_at (input, str_idx);
3551 if (node->type == OP_PERIOD)
3552 {
3553 if (char_len <= 1)
3554 return 0;
3555 /* FIXME: I don't think this if is needed, as both '\n'
3556 and '\0' are char_len == 1. */
3557 /* '.' accepts any one character except the following two cases. */
3558 if ((!(preg->syntax & RE_DOT_NEWLINE) &&
3559 re_string_byte_at (input, str_idx) == '\n') ||
3560 ((preg->syntax & RE_DOT_NOT_NULL) &&
3561 re_string_byte_at (input, str_idx) == '\0'))
3562 return 0;
3563 return char_len;
3564 }
3565
3566 elem_len = re_string_elem_size_at (input, str_idx);
3567 if (elem_len <= 1 && char_len <= 1)
3568 return 0;
3569
3570 if (node->type == COMPLEX_BRACKET)
3571 {
3572 const re_charset_t *cset = node->opr.mbcset;
3573 # ifdef _LIBC
3574 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3575 + str_idx);
3576 int j;
3577 uint32_t nrules;
3578 # endif /* _LIBC */
3579 int match_len = 0;
3580 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3581 ? re_string_wchar_at (input, str_idx) : 0);
3582
3583 /* match with multibyte character? */
3584 for (i = 0; i < cset->nmbchars; ++i)
3585 if (wc == cset->mbchars[i])
3586 {
3587 match_len = char_len;
3588 goto check_node_accept_bytes_match;
3589 }
3590 /* match with character_class? */
3591 for (i = 0; i < cset->nchar_classes; ++i)
3592 {
3593 wctype_t wt = cset->char_classes[i];
3594 if (__iswctype (wc, wt))
3595 {
3596 match_len = char_len;
3597 goto check_node_accept_bytes_match;
3598 }
3599 }
3600
3601 # ifdef _LIBC
3602 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3603 if (nrules != 0)
3604 {
3605 unsigned int in_collseq = 0;
3606 const int32_t *table, *indirect;
3607 const unsigned char *weights, *extra;
3608 const char *collseqwc;
3609 int32_t idx;
3610 /* This #include defines a local function! */
3611 # include <locale/weight.h>
3612
3613 /* match with collating_symbol? */
3614 if (cset->ncoll_syms)
3615 extra = (const unsigned char *)
3616 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3617 for (i = 0; i < cset->ncoll_syms; ++i)
3618 {
3619 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3620 /* Compare the length of input collating element and
3621 the length of current collating element. */
3622 if (*coll_sym != elem_len)
3623 continue;
3624 /* Compare each bytes. */
3625 for (j = 0; j < *coll_sym; j++)
3626 if (pin[j] != coll_sym[1 + j])
3627 break;
3628 if (j == *coll_sym)
3629 {
3630 /* Match if every bytes is equal. */
3631 match_len = j;
3632 goto check_node_accept_bytes_match;
3633 }
3634 }
3635
3636 if (cset->nranges)
3637 {
3638 if (elem_len <= char_len)
3639 {
3640 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3641 in_collseq = __collseq_table_lookup (collseqwc, wc);
3642 }
3643 else
3644 in_collseq = find_collation_sequence_value (pin, elem_len);
3645 }
3646 /* match with range expression? */
3647 for (i = 0; i < cset->nranges; ++i)
3648 if (cset->range_starts[i] <= in_collseq
3649 && in_collseq <= cset->range_ends[i])
3650 {
3651 match_len = elem_len;
3652 goto check_node_accept_bytes_match;
3653 }
3654
3655 /* match with equivalence_class? */
3656 if (cset->nequiv_classes)
3657 {
3658 const unsigned char *cp = pin;
3659 table = (const int32_t *)
3660 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3661 weights = (const unsigned char *)
3662 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3663 extra = (const unsigned char *)
3664 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3665 indirect = (const int32_t *)
3666 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3667 idx = findidx (&cp);
3668 if (idx > 0)
3669 for (i = 0; i < cset->nequiv_classes; ++i)
3670 {
3671 int32_t equiv_class_idx = cset->equiv_classes[i];
3672 size_t weight_len = weights[idx];
3673 if (weight_len == weights[equiv_class_idx])
3674 {
3675 int cnt = 0;
3676 while (cnt <= weight_len
3677 && (weights[equiv_class_idx + 1 + cnt]
3678 == weights[idx + 1 + cnt]))
3679 ++cnt;
3680 if (cnt > weight_len)
3681 {
3682 match_len = elem_len;
3683 goto check_node_accept_bytes_match;
3684 }
3685 }
3686 }
3687 }
3688 }
3689 else
3690 # endif /* _LIBC */
3691 {
3692 /* match with range expression? */
3693 #if __GNUC__ >= 2
3694 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3695 #else
3696 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3697 cmp_buf[2] = wc;
3698 #endif
3699 for (i = 0; i < cset->nranges; ++i)
3700 {
3701 cmp_buf[0] = cset->range_starts[i];
3702 cmp_buf[4] = cset->range_ends[i];
3703 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3704 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3705 {
3706 match_len = char_len;
3707 goto check_node_accept_bytes_match;
3708 }
3709 }
3710 }
3711 check_node_accept_bytes_match:
3712 if (!cset->non_match)
3713 return match_len;
3714 else
3715 {
3716 if (match_len > 0)
3717 return 0;
3718 else
3719 return (elem_len > char_len) ? elem_len : char_len;
3720 }
3721 }
3722 return 0;
3723 }
3724
3725 # ifdef _LIBC
3726 static unsigned int
3727 find_collation_sequence_value (mbs, mbs_len)
3728 const unsigned char *mbs;
3729 size_t mbs_len;
3730 {
3731 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3732 if (nrules == 0)
3733 {
3734 if (mbs_len == 1)
3735 {
3736 /* No valid character. Match it as a single byte character. */
3737 const unsigned char *collseq = (const unsigned char *)
3738 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3739 return collseq[mbs[0]];
3740 }
3741 return UINT_MAX;
3742 }
3743 else
3744 {
3745 int32_t idx;
3746 const unsigned char *extra = (const unsigned char *)
3747 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3748
3749 for (idx = 0; ;)
3750 {
3751 int mbs_cnt, found = 0;
3752 int32_t elem_mbs_len;
3753 /* Skip the name of collating element name. */
3754 idx = idx + extra[idx] + 1;
3755 elem_mbs_len = extra[idx++];
3756 if (mbs_len == elem_mbs_len)
3757 {
3758 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3759 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3760 break;
3761 if (mbs_cnt == elem_mbs_len)
3762 /* Found the entry. */
3763 found = 1;
3764 }
3765 /* Skip the byte sequence of the collating element. */
3766 idx += elem_mbs_len;
3767 /* Adjust for the alignment. */
3768 idx = (idx + 3) & ~3;
3769 /* Skip the collation sequence value. */
3770 idx += sizeof (uint32_t);
3771 /* Skip the wide char sequence of the collating element. */
3772 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3773 /* If we found the entry, return the sequence value. */
3774 if (found)
3775 return *(uint32_t *) (extra + idx);
3776 /* Skip the collation sequence value. */
3777 idx += sizeof (uint32_t);
3778 }
3779 }
3780 }
3781 # endif /* _LIBC */
3782 #endif /* RE_ENABLE_I18N */
3783
3784 /* Check whether the node accepts the byte which is IDX-th
3785 byte of the INPUT. */
3786
3787 static int
3788 check_node_accept (preg, node, mctx, idx)
3789 const regex_t *preg;
3790 const re_token_t *node;
3791 const re_match_context_t *mctx;
3792 int idx;
3793 {
3794 unsigned char ch;
3795 if (node->constraint)
3796 {
3797 /* The node has constraints. Check whether the current context
3798 satisfies the constraints. */
3799 unsigned int context = re_string_context_at (mctx->input, idx,
3800 mctx->eflags,
3801 preg->newline_anchor);
3802 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3803 return 0;
3804 }
3805 ch = re_string_byte_at (mctx->input, idx);
3806 switch (node->type)
3807 {
3808 case CHARACTER:
3809 return node->opr.c == ch;
3810 case SIMPLE_BRACKET:
3811 return bitset_contain (node->opr.sbcset, ch);
3812 case OP_UTF8_PERIOD:
3813 if (ch >= 0x80)
3814 return 0;
3815 /* FALLTHROUGH */
3816 case OP_PERIOD:
3817 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
3818 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
3819 default:
3820 return 0;
3821 }
3822 }
3823
3824 /* Extend the buffers, if the buffers have run out. */
3825
3826 static reg_errcode_t
3827 extend_buffers (mctx)
3828 re_match_context_t *mctx;
3829 {
3830 reg_errcode_t ret;
3831 re_string_t *pstr = mctx->input;
3832
3833 /* Double the lengthes of the buffers. */
3834 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3835 if (BE (ret != REG_NOERROR, 0))
3836 return ret;
3837
3838 if (mctx->state_log != NULL)
3839 {
3840 /* And double the length of state_log. */
3841 re_dfastate_t **new_array;
3842 new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3843 pstr->bufs_len + 1);
3844 if (BE (new_array == NULL, 0))
3845 return REG_ESPACE;
3846 mctx->state_log = new_array;
3847 }
3848
3849 /* Then reconstruct the buffers. */
3850 if (pstr->icase)
3851 {
3852 #ifdef RE_ENABLE_I18N
3853 if (pstr->mb_cur_max > 1)
3854 build_wcs_upper_buffer (pstr);
3855 else
3856 #endif /* RE_ENABLE_I18N */
3857 build_upper_buffer (pstr);
3858 }
3859 else
3860 {
3861 #ifdef RE_ENABLE_I18N
3862 if (pstr->mb_cur_max > 1)
3863 build_wcs_buffer (pstr);
3864 else
3865 #endif /* RE_ENABLE_I18N */
3866 {
3867 if (pstr->trans != NULL)
3868 re_string_translate_buffer (pstr);
3869 }
3870 }
3871 return REG_NOERROR;
3872 }
3873
3874 \f
3875 /* Functions for matching context. */
3876
3877 /* Initialize MCTX. */
3878
3879 static reg_errcode_t
3880 match_ctx_init (mctx, eflags, input, n)
3881 re_match_context_t *mctx;
3882 int eflags, n;
3883 re_string_t *input;
3884 {
3885 mctx->eflags = eflags;
3886 mctx->input = input;
3887 mctx->match_last = -1;
3888 if (n > 0)
3889 {
3890 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
3891 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
3892 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
3893 return REG_ESPACE;
3894 }
3895 else
3896 mctx->bkref_ents = NULL;
3897 mctx->nbkref_ents = 0;
3898 mctx->abkref_ents = n;
3899 mctx->max_mb_elem_len = 1;
3900 mctx->nsub_tops = 0;
3901 mctx->asub_tops = n;
3902 return REG_NOERROR;
3903 }
3904
3905 /* Clean the entries which depend on the current input in MCTX.
3906 This function must be invoked when the matcher changes the start index
3907 of the input, or changes the input string. */
3908
3909 static void
3910 match_ctx_clean (mctx)
3911 re_match_context_t *mctx;
3912 {
3913 match_ctx_free_subtops (mctx);
3914 mctx->nsub_tops = 0;
3915 mctx->nbkref_ents = 0;
3916 }
3917
3918 /* Free all the memory associated with MCTX. */
3919
3920 static void
3921 match_ctx_free (mctx)
3922 re_match_context_t *mctx;
3923 {
3924 match_ctx_free_subtops (mctx);
3925 re_free (mctx->sub_tops);
3926 re_free (mctx->bkref_ents);
3927 }
3928
3929 /* Free all the memory associated with MCTX->SUB_TOPS. */
3930
3931 static void
3932 match_ctx_free_subtops (mctx)
3933 re_match_context_t *mctx;
3934 {
3935 int st_idx;
3936 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
3937 {
3938 int sl_idx;
3939 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
3940 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
3941 {
3942 re_sub_match_last_t *last = top->lasts[sl_idx];
3943 re_free (last->path.array);
3944 re_free (last);
3945 }
3946 re_free (top->lasts);
3947 if (top->path)
3948 {
3949 re_free (top->path->array);
3950 re_free (top->path);
3951 }
3952 free (top);
3953 }
3954 }
3955
3956 /* Add a new backreference entry to MCTX.
3957 Note that we assume that caller never call this function with duplicate
3958 entry, and call with STR_IDX which isn't smaller than any existing entry.
3959 */
3960
3961 static reg_errcode_t
3962 match_ctx_add_entry (mctx, node, str_idx, from, to)
3963 re_match_context_t *mctx;
3964 int node, str_idx, from, to;
3965 {
3966 if (mctx->nbkref_ents >= mctx->abkref_ents)
3967 {
3968 struct re_backref_cache_entry* new_entry;
3969 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
3970 mctx->abkref_ents * 2);
3971 if (BE (new_entry == NULL, 0))
3972 {
3973 re_free (mctx->bkref_ents);
3974 return REG_ESPACE;
3975 }
3976 mctx->bkref_ents = new_entry;
3977 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
3978 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3979 mctx->abkref_ents *= 2;
3980 }
3981 mctx->bkref_ents[mctx->nbkref_ents].node = node;
3982 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
3983 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
3984 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
3985 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
3986 if (mctx->max_mb_elem_len < to - from)
3987 mctx->max_mb_elem_len = to - from;
3988 return REG_NOERROR;
3989 }
3990
3991 /* Search for the first entry which has the same str_idx.
3992 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
3993
3994 static int
3995 search_cur_bkref_entry (mctx, str_idx)
3996 re_match_context_t *mctx;
3997 int str_idx;
3998 {
3999 int left, right, mid;
4000 right = mctx->nbkref_ents;
4001 for (left = 0; left < right;)
4002 {
4003 mid = (left + right) / 2;
4004 if (mctx->bkref_ents[mid].str_idx < str_idx)
4005 left = mid + 1;
4006 else
4007 right = mid;
4008 }
4009 return left;
4010 }
4011
4012 static void
4013 match_ctx_clear_flag (mctx)
4014 re_match_context_t *mctx;
4015 {
4016 int i;
4017 for (i = 0; i < mctx->nbkref_ents; ++i)
4018 {
4019 mctx->bkref_ents[i].flag = 0;
4020 }
4021 }
4022
4023 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4024 at STR_IDX. */
4025
4026 static reg_errcode_t
4027 match_ctx_add_subtop (mctx, node, str_idx)
4028 re_match_context_t *mctx;
4029 int node, str_idx;
4030 {
4031 #ifdef DEBUG
4032 assert (mctx->sub_tops != NULL);
4033 assert (mctx->asub_tops > 0);
4034 #endif
4035 if (mctx->nsub_tops == mctx->asub_tops)
4036 {
4037 re_sub_match_top_t **new_array;
4038 mctx->asub_tops *= 2;
4039 new_array = re_realloc (mctx->sub_tops, re_sub_match_top_t *,
4040 mctx->asub_tops);
4041 if (BE (new_array == NULL, 0))
4042 return REG_ESPACE;
4043 mctx->sub_tops = new_array;
4044 }
4045 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4046 if (mctx->sub_tops[mctx->nsub_tops] == NULL)
4047 return REG_ESPACE;
4048 mctx->sub_tops[mctx->nsub_tops]->node = node;
4049 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4050 return REG_NOERROR;
4051 }
4052
4053 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4054 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4055
4056 static re_sub_match_last_t *
4057 match_ctx_add_sublast (subtop, node, str_idx)
4058 re_sub_match_top_t *subtop;
4059 int node, str_idx;
4060 {
4061 re_sub_match_last_t *new_entry;
4062 if (subtop->nlasts == subtop->alasts)
4063 {
4064 re_sub_match_last_t **new_array;
4065 subtop->alasts = 2 * subtop->alasts + 1;
4066 new_array = re_realloc (subtop->lasts, re_sub_match_last_t *,
4067 subtop->alasts);
4068 if (BE (new_array == NULL, 0))
4069 return NULL;
4070 subtop->lasts = new_array;
4071 }
4072 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4073 if (BE (new_entry == NULL, 0))
4074 return NULL;
4075 subtop->lasts[subtop->nlasts] = new_entry;
4076 new_entry->node = node;
4077 new_entry->str_idx = str_idx;
4078 ++subtop->nlasts;
4079 return new_entry;
4080 }
4081
4082 static void
4083 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
4084 check_subexp)
4085 re_sift_context_t *sctx;
4086 re_dfastate_t **sifted_sts, **limited_sts;
4087 int last_node, last_str_idx, check_subexp;
4088 {
4089 sctx->sifted_states = sifted_sts;
4090 sctx->limited_states = limited_sts;
4091 sctx->last_node = last_node;
4092 sctx->last_str_idx = last_str_idx;
4093 sctx->check_subexp = check_subexp;
4094 sctx->cur_bkref = -1;
4095 sctx->cls_subexp_idx = -1;
4096 re_node_set_init_empty (&sctx->limits);
4097 }