]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regexec.c
Update.
[thirdparty/glibc.git] / posix / regexec.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 re_string_t *input, int n);
23 static void match_ctx_clean (re_match_context_t *mctx);
24 static void match_ctx_free (re_match_context_t *cache);
25 static void match_ctx_free_subtops (re_match_context_t *mctx);
26 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
27 int str_idx, int from, int to);
28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx);
29 static void match_ctx_clear_flag (re_match_context_t *mctx);
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx);
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx);
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, int last_node,
36 int last_str_idx, int check_subexp);
37 static reg_errcode_t re_search_internal (const regex_t *preg,
38 const char *string, int length,
39 int start, int range, int stop,
40 size_t nmatch, regmatch_t pmatch[],
41 int eflags);
42 static int re_search_2_stub (struct re_pattern_buffer *bufp,
43 const char *string1, int length1,
44 const char *string2, int length2,
45 int start, int range, struct re_registers *regs,
46 int stop, int ret_len);
47 static int re_search_stub (struct re_pattern_buffer *bufp,
48 const char *string, int length, int start,
49 int range, int stop, struct re_registers *regs,
50 int ret_len);
51 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
52 int nregs, int regs_allocated);
53 static inline re_dfastate_t *acquire_init_state_context
54 (reg_errcode_t *err, const regex_t *preg, const re_match_context_t *mctx,
55 int idx) __attribute ((always_inline));
56 static reg_errcode_t prune_impossible_nodes (const regex_t *preg,
57 re_match_context_t *mctx);
58 static int check_matching (const regex_t *preg, re_match_context_t *mctx,
59 int fl_longest_match);
60 static int check_halt_node_context (const re_dfa_t *dfa, int node,
61 unsigned int context);
62 static int check_halt_state_context (const regex_t *preg,
63 const re_dfastate_t *state,
64 const re_match_context_t *mctx, int idx);
65 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
66 int cur_idx, int nmatch);
67 static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs,
68 const re_match_context_t *mctx,
69 int *pidx, int node, re_node_set *eps_via_nodes,
70 struct re_fail_stack_t *fs);
71 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
72 int str_idx, int *dests, int nregs,
73 regmatch_t *regs,
74 re_node_set *eps_via_nodes);
75 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
76 regmatch_t *regs, re_node_set *eps_via_nodes);
77 static reg_errcode_t set_regs (const regex_t *preg,
78 const re_match_context_t *mctx,
79 size_t nmatch, regmatch_t *pmatch,
80 int fl_backtrack);
81 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
82
83 #ifdef RE_ENABLE_I18N
84 static int sift_states_iter_mb (const regex_t *preg,
85 const re_match_context_t *mctx,
86 re_sift_context_t *sctx,
87 int node_idx, int str_idx, int max_str_idx);
88 #endif /* RE_ENABLE_I18N */
89 static reg_errcode_t sift_states_backward (const regex_t *preg,
90 re_match_context_t *mctx,
91 re_sift_context_t *sctx);
92 static reg_errcode_t update_cur_sifted_state (const regex_t *preg,
93 re_match_context_t *mctx,
94 re_sift_context_t *sctx,
95 int str_idx,
96 re_node_set *dest_nodes);
97 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
98 re_node_set *dest_nodes,
99 const re_node_set *candidates);
100 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
101 re_node_set *dest_nodes,
102 const re_node_set *and_nodes);
103 static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits,
104 re_match_context_t *mctx, int dst_node,
105 int dst_idx, int src_node, int src_idx);
106 static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx,
107 int limit, re_node_set *eclosures,
108 int subexp_idx, int node, int str_idx);
109 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
110 re_node_set *dest_nodes,
111 const re_node_set *candidates,
112 re_node_set *limits,
113 struct re_backref_cache_entry *bkref_ents,
114 int str_idx);
115 static reg_errcode_t sift_states_bkref (const regex_t *preg,
116 re_match_context_t *mctx,
117 re_sift_context_t *sctx,
118 int str_idx, re_node_set *dest_nodes);
119 static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
120 int next_state_log_idx);
121 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
122 re_dfastate_t **src, int num);
123 static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
124 re_match_context_t *mctx,
125 re_dfastate_t *state);
126 static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa,
127 re_match_context_t *mctx,
128 re_node_set *cur_nodes,
129 int str_idx);
130 #if 0
131 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
132 re_dfastate_t *pstate,
133 re_match_context_t *mctx);
134 #endif
135 #ifdef RE_ENABLE_I18N
136 static reg_errcode_t transit_state_mb (const regex_t *preg,
137 re_dfastate_t *pstate,
138 re_match_context_t *mctx);
139 #endif /* RE_ENABLE_I18N */
140 static reg_errcode_t transit_state_bkref (const regex_t *preg,
141 const re_node_set *nodes,
142 re_match_context_t *mctx);
143 static reg_errcode_t get_subexp (const regex_t *preg, re_match_context_t *mctx,
144 int bkref_node, int bkref_str_idx);
145 static reg_errcode_t get_subexp_sub (const regex_t *preg,
146 re_match_context_t *mctx,
147 const re_sub_match_top_t *sub_top,
148 re_sub_match_last_t *sub_last,
149 int bkref_node, int bkref_str);
150 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
151 int subexp_idx, int type);
152 static reg_errcode_t check_arrival (const regex_t *preg,
153 re_match_context_t *mctx,
154 state_array_t *path, int top_node,
155 int top_str, int last_node, int last_str,
156 int type);
157 static reg_errcode_t check_arrival_add_next_nodes (const regex_t *preg,
158 re_dfa_t *dfa,
159 re_match_context_t *mctx,
160 int str_idx,
161 re_node_set *cur_nodes,
162 re_node_set *next_nodes);
163 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
164 re_node_set *cur_nodes,
165 int ex_subexp, int type);
166 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
167 re_node_set *dst_nodes,
168 int target, int ex_subexp,
169 int type);
170 static reg_errcode_t expand_bkref_cache (const regex_t *preg,
171 re_match_context_t *mctx,
172 re_node_set *cur_nodes, int cur_str,
173 int last_str, int subexp_num,
174 int type);
175 static re_dfastate_t **build_trtable (const regex_t *dfa,
176 re_dfastate_t *state);
177 #ifdef RE_ENABLE_I18N
178 static int check_node_accept_bytes (const regex_t *preg, int node_idx,
179 const re_string_t *input, int idx);
180 # ifdef _LIBC
181 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
182 size_t name_len);
183 # endif /* _LIBC */
184 #endif /* RE_ENABLE_I18N */
185 static int group_nodes_into_DFAstates (const regex_t *dfa,
186 const re_dfastate_t *state,
187 re_node_set *states_node,
188 bitset *states_ch);
189 static int check_node_accept (const regex_t *preg, const re_token_t *node,
190 const re_match_context_t *mctx, int idx);
191 static reg_errcode_t extend_buffers (re_match_context_t *mctx);
192 \f
193 /* Entry point for POSIX code. */
194
195 /* regexec searches for a given pattern, specified by PREG, in the
196 string STRING.
197
198 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
199 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
200 least NMATCH elements, and we set them to the offsets of the
201 corresponding matched substrings.
202
203 EFLAGS specifies `execution flags' which affect matching: if
204 REG_NOTBOL is set, then ^ does not match at the beginning of the
205 string; if REG_NOTEOL is set, then $ does not match at the end.
206
207 We return 0 if we find a match and REG_NOMATCH if not. */
208
209 int
210 regexec (preg, string, nmatch, pmatch, eflags)
211 const regex_t *__restrict preg;
212 const char *__restrict string;
213 size_t nmatch;
214 regmatch_t pmatch[];
215 int eflags;
216 {
217 reg_errcode_t err;
218 int length = strlen (string);
219 if (preg->no_sub)
220 err = re_search_internal (preg, string, length, 0, length, length, 0,
221 NULL, eflags);
222 else
223 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
224 pmatch, eflags);
225 return err != REG_NOERROR;
226 }
227 #ifdef _LIBC
228 weak_alias (__regexec, regexec)
229 #endif
230
231 /* Entry points for GNU code. */
232
233 /* re_match, re_search, re_match_2, re_search_2
234
235 The former two functions operate on STRING with length LENGTH,
236 while the later two operate on concatenation of STRING1 and STRING2
237 with lengths LENGTH1 and LENGTH2, respectively.
238
239 re_match() matches the compiled pattern in BUFP against the string,
240 starting at index START.
241
242 re_search() first tries matching at index START, then it tries to match
243 starting from index START + 1, and so on. The last start position tried
244 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
245 way as re_match().)
246
247 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
248 the first STOP characters of the concatenation of the strings should be
249 concerned.
250
251 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
252 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
253 computed relative to the concatenation, not relative to the individual
254 strings.)
255
256 On success, re_match* functions return the length of the match, re_search*
257 return the position of the start of the match. Return value -1 means no
258 match was found and -2 indicates an internal error. */
259
260 int
261 re_match (bufp, string, length, start, regs)
262 struct re_pattern_buffer *bufp;
263 const char *string;
264 int length, start;
265 struct re_registers *regs;
266 {
267 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
268 }
269 #ifdef _LIBC
270 weak_alias (__re_match, re_match)
271 #endif
272
273 int
274 re_search (bufp, string, length, start, range, regs)
275 struct re_pattern_buffer *bufp;
276 const char *string;
277 int length, start, range;
278 struct re_registers *regs;
279 {
280 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
281 }
282 #ifdef _LIBC
283 weak_alias (__re_search, re_search)
284 #endif
285
286 int
287 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
288 struct re_pattern_buffer *bufp;
289 const char *string1, *string2;
290 int length1, length2, start, stop;
291 struct re_registers *regs;
292 {
293 return re_search_2_stub (bufp, string1, length1, string2, length2,
294 start, 0, regs, stop, 1);
295 }
296 #ifdef _LIBC
297 weak_alias (__re_match_2, re_match_2)
298 #endif
299
300 int
301 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
302 struct re_pattern_buffer *bufp;
303 const char *string1, *string2;
304 int length1, length2, start, range, stop;
305 struct re_registers *regs;
306 {
307 return re_search_2_stub (bufp, string1, length1, string2, length2,
308 start, range, regs, stop, 0);
309 }
310 #ifdef _LIBC
311 weak_alias (__re_search_2, re_search_2)
312 #endif
313
314 static int
315 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
316 stop, ret_len)
317 struct re_pattern_buffer *bufp;
318 const char *string1, *string2;
319 int length1, length2, start, range, stop, ret_len;
320 struct re_registers *regs;
321 {
322 const char *str;
323 int rval;
324 int len = length1 + length2;
325 int free_str = 0;
326
327 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
328 return -2;
329
330 /* Concatenate the strings. */
331 if (length2 > 0)
332 if (length1 > 0)
333 {
334 char *s = re_malloc (char, len);
335
336 if (BE (s == NULL, 0))
337 return -2;
338 memcpy (s, string1, length1);
339 memcpy (s + length1, string2, length2);
340 str = s;
341 free_str = 1;
342 }
343 else
344 str = string2;
345 else
346 str = string1;
347
348 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
349 ret_len);
350 if (free_str)
351 re_free ((char *) str);
352 return rval;
353 }
354
355 /* The parameters have the same meaning as those of re_search.
356 Additional parameters:
357 If RET_LEN is nonzero the length of the match is returned (re_match style);
358 otherwise the position of the match is returned. */
359
360 static int
361 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
362 struct re_pattern_buffer *bufp;
363 const char *string;
364 int length, start, range, stop, ret_len;
365 struct re_registers *regs;
366 {
367 reg_errcode_t result;
368 regmatch_t *pmatch;
369 int nregs, rval;
370 int eflags = 0;
371
372 /* Check for out-of-range. */
373 if (BE (start < 0 || start > length, 0))
374 return -1;
375 if (BE (start + range > length, 0))
376 range = length - start;
377 else if (BE (start + range < 0, 0))
378 range = -start;
379
380 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
381 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
382
383 /* Compile fastmap if we haven't yet. */
384 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
385 re_compile_fastmap (bufp);
386
387 if (BE (bufp->no_sub, 0))
388 regs = NULL;
389
390 /* We need at least 1 register. */
391 if (regs == NULL)
392 nregs = 1;
393 else if (BE (bufp->regs_allocated == REGS_FIXED &&
394 regs->num_regs < bufp->re_nsub + 1, 0))
395 {
396 nregs = regs->num_regs;
397 if (BE (nregs < 1, 0))
398 {
399 /* Nothing can be copied to regs. */
400 regs = NULL;
401 nregs = 1;
402 }
403 }
404 else
405 nregs = bufp->re_nsub + 1;
406 pmatch = re_malloc (regmatch_t, nregs);
407 if (BE (pmatch == NULL, 0))
408 return -2;
409
410 result = re_search_internal (bufp, string, length, start, range, stop,
411 nregs, pmatch, eflags);
412
413 rval = 0;
414
415 /* I hope we needn't fill ther regs with -1's when no match was found. */
416 if (result != REG_NOERROR)
417 rval = -1;
418 else if (regs != NULL)
419 {
420 /* If caller wants register contents data back, copy them. */
421 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
422 bufp->regs_allocated);
423 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
424 rval = -2;
425 }
426
427 if (BE (rval == 0, 1))
428 {
429 if (ret_len)
430 {
431 assert (pmatch[0].rm_so == start);
432 rval = pmatch[0].rm_eo - start;
433 }
434 else
435 rval = pmatch[0].rm_so;
436 }
437 re_free (pmatch);
438 return rval;
439 }
440
441 static unsigned
442 re_copy_regs (regs, pmatch, nregs, regs_allocated)
443 struct re_registers *regs;
444 regmatch_t *pmatch;
445 int nregs, regs_allocated;
446 {
447 int rval = REGS_REALLOCATE;
448 int i;
449 int need_regs = nregs + 1;
450 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
451 uses. */
452
453 /* Have the register data arrays been allocated? */
454 if (regs_allocated == REGS_UNALLOCATED)
455 { /* No. So allocate them with malloc. */
456 regs->start = re_malloc (regoff_t, need_regs);
457 if (BE (regs->start == NULL, 0))
458 return REGS_UNALLOCATED;
459 regs->end = re_malloc (regoff_t, need_regs);
460 if (BE (regs->end == NULL, 0))
461 {
462 re_free (regs->start);
463 return REGS_UNALLOCATED;
464 }
465 regs->num_regs = need_regs;
466 }
467 else if (regs_allocated == REGS_REALLOCATE)
468 { /* Yes. If we need more elements than were already
469 allocated, reallocate them. If we need fewer, just
470 leave it alone. */
471 if (BE (need_regs > regs->num_regs, 0))
472 {
473 regs->start = re_realloc (regs->start, regoff_t, need_regs);
474 if (BE (regs->start == NULL, 0))
475 {
476 if (regs->end != NULL)
477 re_free (regs->end);
478 return REGS_UNALLOCATED;
479 }
480 regs->end = re_realloc (regs->end, regoff_t, need_regs);
481 if (BE (regs->end == NULL, 0))
482 {
483 re_free (regs->start);
484 return REGS_UNALLOCATED;
485 }
486 regs->num_regs = need_regs;
487 }
488 }
489 else
490 {
491 assert (regs_allocated == REGS_FIXED);
492 /* This function may not be called with REGS_FIXED and nregs too big. */
493 assert (regs->num_regs >= nregs);
494 rval = REGS_FIXED;
495 }
496
497 /* Copy the regs. */
498 for (i = 0; i < nregs; ++i)
499 {
500 regs->start[i] = pmatch[i].rm_so;
501 regs->end[i] = pmatch[i].rm_eo;
502 }
503 for ( ; i < regs->num_regs; ++i)
504 regs->start[i] = regs->end[i] = -1;
505
506 return rval;
507 }
508
509 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
510 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
511 this memory for recording register information. STARTS and ENDS
512 must be allocated using the malloc library routine, and must each
513 be at least NUM_REGS * sizeof (regoff_t) bytes long.
514
515 If NUM_REGS == 0, then subsequent matches should allocate their own
516 register data.
517
518 Unless this function is called, the first search or match using
519 PATTERN_BUFFER will allocate its own register data, without
520 freeing the old data. */
521
522 void
523 re_set_registers (bufp, regs, num_regs, starts, ends)
524 struct re_pattern_buffer *bufp;
525 struct re_registers *regs;
526 unsigned num_regs;
527 regoff_t *starts, *ends;
528 {
529 if (num_regs)
530 {
531 bufp->regs_allocated = REGS_REALLOCATE;
532 regs->num_regs = num_regs;
533 regs->start = starts;
534 regs->end = ends;
535 }
536 else
537 {
538 bufp->regs_allocated = REGS_UNALLOCATED;
539 regs->num_regs = 0;
540 regs->start = regs->end = (regoff_t *) 0;
541 }
542 }
543 #ifdef _LIBC
544 weak_alias (__re_set_registers, re_set_registers)
545 #endif
546 \f
547 /* Entry points compatible with 4.2 BSD regex library. We don't define
548 them unless specifically requested. */
549
550 #if defined _REGEX_RE_COMP || defined _LIBC
551 int
552 # ifdef _LIBC
553 weak_function
554 # endif
555 re_exec (s)
556 const char *s;
557 {
558 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
559 }
560 #endif /* _REGEX_RE_COMP */
561 \f
562 static re_node_set empty_set;
563
564 /* Internal entry point. */
565
566 /* Searches for a compiled pattern PREG in the string STRING, whose
567 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
568 mingings with regexec. START, and RANGE have the same meanings
569 with re_search.
570 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
571 otherwise return the error code.
572 Note: We assume front end functions already check ranges.
573 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
574
575 static reg_errcode_t
576 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
577 eflags)
578 const regex_t *preg;
579 const char *string;
580 int length, start, range, stop, eflags;
581 size_t nmatch;
582 regmatch_t pmatch[];
583 {
584 reg_errcode_t err;
585 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
586 re_string_t input;
587 int left_lim, right_lim, incr;
588 int fl_longest_match, match_first, match_last = -1;
589 int fast_translate, sb;
590 re_match_context_t mctx;
591 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
592 && range && !preg->can_be_null) ? preg->fastmap : NULL);
593
594 /* Check if the DFA haven't been compiled. */
595 if (BE (preg->used == 0 || dfa->init_state == NULL
596 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
597 || dfa->init_state_begbuf == NULL, 0))
598 return REG_NOMATCH;
599
600 #ifdef DEBUG
601 /* We assume front-end functions already check them. */
602 assert (start + range >= 0 && start + range <= length);
603 #endif
604
605 /* If initial states with non-begbuf contexts have no elements,
606 the regex must be anchored. If preg->newline_anchor is set,
607 we'll never use init_state_nl, so do not check it. */
608 if (dfa->init_state->nodes.nelem == 0
609 && dfa->init_state_word->nodes.nelem == 0
610 && (dfa->init_state_nl->nodes.nelem == 0
611 || !preg->newline_anchor))
612 {
613 if (start != 0 && start + range != 0)
614 return REG_NOMATCH;
615 start = range = 0;
616 }
617
618 re_node_set_init_empty (&empty_set);
619 memset (&mctx, '\0', sizeof (re_match_context_t));
620
621 /* We must check the longest matching, if nmatch > 0. */
622 fl_longest_match = (nmatch != 0 || dfa->nbackref);
623
624 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
625 preg->translate, preg->syntax & RE_ICASE, dfa);
626 if (BE (err != REG_NOERROR, 0))
627 goto free_return;
628 input.stop = stop;
629 input.raw_stop = stop;
630
631 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
632 if (BE (err != REG_NOERROR, 0))
633 goto free_return;
634
635 /* We will log all the DFA states through which the dfa pass,
636 if nmatch > 1, or this dfa has "multibyte node", which is a
637 back-reference or a node which can accept multibyte character or
638 multi character collating element. */
639 if (nmatch > 1 || dfa->has_mb_node)
640 {
641 mctx.state_log = re_malloc (re_dfastate_t *, input.bufs_len + 1);
642 if (BE (mctx.state_log == NULL, 0))
643 {
644 err = REG_ESPACE;
645 goto free_return;
646 }
647 }
648 else
649 mctx.state_log = NULL;
650
651 match_first = start;
652 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
653 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
654
655 /* Check incrementally whether of not the input string match. */
656 incr = (range < 0) ? -1 : 1;
657 left_lim = (range < 0) ? start + range : start;
658 right_lim = (range < 0) ? start : start + range;
659 sb = dfa->mb_cur_max == 1;
660 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
661
662 for (;;)
663 {
664 /* At first get the current byte from input string. */
665 if (fastmap)
666 {
667 if (BE (fast_translate, 1))
668 {
669 unsigned RE_TRANSLATE_TYPE t
670 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
671 if (BE (range >= 0, 1))
672 {
673 if (BE (t != NULL, 0))
674 {
675 while (BE (match_first < right_lim, 1)
676 && !fastmap[t[(unsigned char) string[match_first]]])
677 ++match_first;
678 }
679 else
680 {
681 while (BE (match_first < right_lim, 1)
682 && !fastmap[(unsigned char) string[match_first]])
683 ++match_first;
684 }
685 if (BE (match_first == right_lim, 0))
686 {
687 int ch = match_first >= length
688 ? 0 : (unsigned char) string[match_first];
689 if (!fastmap[t ? t[ch] : ch])
690 break;
691 }
692 }
693 else
694 {
695 while (match_first >= left_lim)
696 {
697 int ch = match_first >= length
698 ? 0 : (unsigned char) string[match_first];
699 if (fastmap[t ? t[ch] : ch])
700 break;
701 --match_first;
702 }
703 if (match_first < left_lim)
704 break;
705 }
706 }
707 else
708 {
709 int ch;
710
711 do
712 {
713 /* In this case, we can't determine easily the current byte,
714 since it might be a component byte of a multibyte
715 character. Then we use the constructed buffer
716 instead. */
717 /* If MATCH_FIRST is out of the valid range, reconstruct the
718 buffers. */
719 if (input.raw_mbs_idx + input.valid_raw_len <= match_first
720 || match_first < input.raw_mbs_idx)
721 {
722 err = re_string_reconstruct (&input, match_first, eflags,
723 preg->newline_anchor);
724 if (BE (err != REG_NOERROR, 0))
725 goto free_return;
726 }
727 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
728 Note that MATCH_FIRST must not be smaller than 0. */
729 ch = ((match_first >= length) ? 0
730 : re_string_byte_at (&input,
731 match_first - input.raw_mbs_idx));
732 if (fastmap[ch])
733 break;
734 match_first += incr;
735 }
736 while (match_first >= left_lim && match_first <= right_lim);
737 if (! fastmap[ch])
738 break;
739 }
740 }
741
742 /* Reconstruct the buffers so that the matcher can assume that
743 the matching starts from the beginning of the buffer. */
744 err = re_string_reconstruct (&input, match_first, eflags,
745 preg->newline_anchor);
746 if (BE (err != REG_NOERROR, 0))
747 goto free_return;
748 #ifdef RE_ENABLE_I18N
749 /* Eliminate it when it is a component of a multibyte character
750 and isn't the head of a multibyte character. */
751 if (sb || re_string_first_byte (&input, 0))
752 #endif
753 {
754 /* It seems to be appropriate one, then use the matcher. */
755 /* We assume that the matching starts from 0. */
756 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
757 match_last = check_matching (preg, &mctx, fl_longest_match);
758 if (match_last != -1)
759 {
760 if (BE (match_last == -2, 0))
761 {
762 err = REG_ESPACE;
763 goto free_return;
764 }
765 else
766 {
767 mctx.match_last = match_last;
768 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
769 {
770 re_dfastate_t *pstate = mctx.state_log[match_last];
771 mctx.last_node = check_halt_state_context (preg, pstate,
772 &mctx, match_last);
773 }
774 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
775 || dfa->nbackref)
776 {
777 err = prune_impossible_nodes (preg, &mctx);
778 if (err == REG_NOERROR)
779 break;
780 if (BE (err != REG_NOMATCH, 0))
781 goto free_return;
782 match_last = -1;
783 }
784 else
785 break; /* We found a match. */
786 }
787 }
788 match_ctx_clean (&mctx);
789 }
790 /* Update counter. */
791 match_first += incr;
792 if (match_first < left_lim || right_lim < match_first)
793 break;
794 }
795
796 /* Set pmatch[] if we need. */
797 if (match_last != -1 && nmatch > 0)
798 {
799 int reg_idx;
800
801 /* Initialize registers. */
802 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
803 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
804
805 /* Set the points where matching start/end. */
806 pmatch[0].rm_so = 0;
807 pmatch[0].rm_eo = mctx.match_last;
808
809 if (!preg->no_sub && nmatch > 1)
810 {
811 err = set_regs (preg, &mctx, nmatch, pmatch,
812 dfa->has_plural_match && dfa->nbackref > 0);
813 if (BE (err != REG_NOERROR, 0))
814 goto free_return;
815 }
816
817 /* At last, add the offset to the each registers, since we slided
818 the buffers so that we could assume that the matching starts
819 from 0. */
820 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
821 if (pmatch[reg_idx].rm_so != -1)
822 {
823 #ifdef RE_ENABLE_I18N
824 if (BE (input.offsets_needed != 0, 0))
825 {
826 if (pmatch[reg_idx].rm_so == input.valid_len)
827 pmatch[reg_idx].rm_so += input.valid_raw_len - input.valid_len;
828 else
829 pmatch[reg_idx].rm_so = input.offsets[pmatch[reg_idx].rm_so];
830 if (pmatch[reg_idx].rm_eo == input.valid_len)
831 pmatch[reg_idx].rm_eo += input.valid_raw_len - input.valid_len;
832 else
833 pmatch[reg_idx].rm_eo = input.offsets[pmatch[reg_idx].rm_eo];
834 }
835 #else
836 assert (input.offsets_needed == 0);
837 #endif
838 pmatch[reg_idx].rm_so += match_first;
839 pmatch[reg_idx].rm_eo += match_first;
840 }
841 }
842 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
843 free_return:
844 re_free (mctx.state_log);
845 if (dfa->nbackref)
846 match_ctx_free (&mctx);
847 re_string_destruct (&input);
848 return err;
849 }
850
851 static reg_errcode_t
852 prune_impossible_nodes (preg, mctx)
853 const regex_t *preg;
854 re_match_context_t *mctx;
855 {
856 int halt_node, match_last;
857 reg_errcode_t ret;
858 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
859 re_dfastate_t **sifted_states;
860 re_dfastate_t **lim_states = NULL;
861 re_sift_context_t sctx;
862 #ifdef DEBUG
863 assert (mctx->state_log != NULL);
864 #endif
865 match_last = mctx->match_last;
866 halt_node = mctx->last_node;
867 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
868 if (BE (sifted_states == NULL, 0))
869 {
870 ret = REG_ESPACE;
871 goto free_return;
872 }
873 if (dfa->nbackref)
874 {
875 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
876 if (BE (lim_states == NULL, 0))
877 {
878 ret = REG_ESPACE;
879 goto free_return;
880 }
881 while (1)
882 {
883 memset (lim_states, '\0',
884 sizeof (re_dfastate_t *) * (match_last + 1));
885 match_ctx_clear_flag (mctx);
886 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
887 match_last, 0);
888 ret = sift_states_backward (preg, mctx, &sctx);
889 re_node_set_free (&sctx.limits);
890 if (BE (ret != REG_NOERROR, 0))
891 goto free_return;
892 if (sifted_states[0] != NULL || lim_states[0] != NULL)
893 break;
894 do
895 {
896 --match_last;
897 if (match_last < 0)
898 {
899 ret = REG_NOMATCH;
900 goto free_return;
901 }
902 } while (mctx->state_log[match_last] == NULL
903 || !mctx->state_log[match_last]->halt);
904 halt_node = check_halt_state_context (preg,
905 mctx->state_log[match_last],
906 mctx, match_last);
907 }
908 ret = merge_state_array (dfa, sifted_states, lim_states,
909 match_last + 1);
910 re_free (lim_states);
911 lim_states = NULL;
912 if (BE (ret != REG_NOERROR, 0))
913 goto free_return;
914 }
915 else
916 {
917 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
918 match_last, 0);
919 ret = sift_states_backward (preg, mctx, &sctx);
920 re_node_set_free (&sctx.limits);
921 if (BE (ret != REG_NOERROR, 0))
922 goto free_return;
923 }
924 re_free (mctx->state_log);
925 mctx->state_log = sifted_states;
926 sifted_states = NULL;
927 mctx->last_node = halt_node;
928 mctx->match_last = match_last;
929 ret = REG_NOERROR;
930 free_return:
931 re_free (sifted_states);
932 re_free (lim_states);
933 return ret;
934 }
935
936 /* Acquire an initial state and return it.
937 We must select appropriate initial state depending on the context,
938 since initial states may have constraints like "\<", "^", etc.. */
939
940 static inline re_dfastate_t *
941 acquire_init_state_context (err, preg, mctx, idx)
942 reg_errcode_t *err;
943 const regex_t *preg;
944 const re_match_context_t *mctx;
945 int idx;
946 {
947 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
948
949 *err = REG_NOERROR;
950 if (dfa->init_state->has_constraint)
951 {
952 unsigned int context;
953 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
954 preg->newline_anchor);
955 if (IS_WORD_CONTEXT (context))
956 return dfa->init_state_word;
957 else if (IS_ORDINARY_CONTEXT (context))
958 return dfa->init_state;
959 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
960 return dfa->init_state_begbuf;
961 else if (IS_NEWLINE_CONTEXT (context))
962 return dfa->init_state_nl;
963 else if (IS_BEGBUF_CONTEXT (context))
964 {
965 /* It is relatively rare case, then calculate on demand. */
966 return re_acquire_state_context (err, dfa,
967 dfa->init_state->entrance_nodes,
968 context);
969 }
970 else
971 /* Must not happen? */
972 return dfa->init_state;
973 }
974 else
975 return dfa->init_state;
976 }
977
978 /* Check whether the regular expression match input string INPUT or not,
979 and return the index where the matching end, return -1 if not match,
980 or return -2 in case of an error.
981 FL_LONGEST_MATCH means we want the POSIX longest matching.
982 Note that the matcher assume that the maching starts from the current
983 index of the buffer. */
984
985 static int
986 check_matching (preg, mctx, fl_longest_match)
987 const regex_t *preg;
988 re_match_context_t *mctx;
989 int fl_longest_match;
990 {
991 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
992 reg_errcode_t err;
993 int match = 0;
994 int match_last = -1;
995 int cur_str_idx = re_string_cur_idx (mctx->input);
996 re_dfastate_t *cur_state;
997
998 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
999 /* An initial state must not be NULL(invalid state). */
1000 if (BE (cur_state == NULL, 0))
1001 return -2;
1002 if (mctx->state_log != NULL)
1003 mctx->state_log[cur_str_idx] = cur_state;
1004
1005 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1006 later. E.g. Processing back references. */
1007 if (BE (dfa->nbackref, 0))
1008 {
1009 err = check_subexp_matching_top (dfa, mctx, &cur_state->nodes, 0);
1010 if (BE (err != REG_NOERROR, 0))
1011 return err;
1012
1013 if (cur_state->has_backref)
1014 {
1015 err = transit_state_bkref (preg, &cur_state->nodes, mctx);
1016 if (BE (err != REG_NOERROR, 0))
1017 return err;
1018 }
1019 }
1020
1021 /* If the RE accepts NULL string. */
1022 if (BE (cur_state->halt, 0))
1023 {
1024 if (!cur_state->has_constraint
1025 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
1026 {
1027 if (!fl_longest_match)
1028 return cur_str_idx;
1029 else
1030 {
1031 match_last = cur_str_idx;
1032 match = 1;
1033 }
1034 }
1035 }
1036
1037 while (!re_string_eoi (mctx->input))
1038 {
1039 cur_state = transit_state (&err, preg, mctx, cur_state);
1040 if (cur_state == NULL) /* Reached at the invalid state or an error. */
1041 {
1042 cur_str_idx = re_string_cur_idx (mctx->input);
1043 if (BE (err != REG_NOERROR, 0))
1044 return -2;
1045 if (!fl_longest_match && match)
1046 break;
1047 else
1048 {
1049 if (mctx->state_log == NULL)
1050 break;
1051 else
1052 {
1053 int max = mctx->state_log_top;
1054 for (; cur_str_idx <= max; ++cur_str_idx)
1055 if (mctx->state_log[cur_str_idx] != NULL)
1056 break;
1057 if (cur_str_idx > max)
1058 break;
1059 }
1060 }
1061 }
1062
1063 if (cur_state != NULL && cur_state->halt)
1064 {
1065 /* Reached at a halt state.
1066 Check the halt state can satisfy the current context. */
1067 if (!cur_state->has_constraint
1068 || check_halt_state_context (preg, cur_state, mctx,
1069 re_string_cur_idx (mctx->input)))
1070 {
1071 /* We found an appropriate halt state. */
1072 match_last = re_string_cur_idx (mctx->input);
1073 match = 1;
1074 if (!fl_longest_match)
1075 break;
1076 }
1077 }
1078 }
1079 return match_last;
1080 }
1081
1082 /* Check NODE match the current context. */
1083
1084 static int check_halt_node_context (dfa, node, context)
1085 const re_dfa_t *dfa;
1086 int node;
1087 unsigned int context;
1088 {
1089 re_token_type_t type = dfa->nodes[node].type;
1090 unsigned int constraint = dfa->nodes[node].constraint;
1091 if (type != END_OF_RE)
1092 return 0;
1093 if (!constraint)
1094 return 1;
1095 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1096 return 0;
1097 return 1;
1098 }
1099
1100 /* Check the halt state STATE match the current context.
1101 Return 0 if not match, if the node, STATE has, is a halt node and
1102 match the context, return the node. */
1103
1104 static int
1105 check_halt_state_context (preg, state, mctx, idx)
1106 const regex_t *preg;
1107 const re_dfastate_t *state;
1108 const re_match_context_t *mctx;
1109 int idx;
1110 {
1111 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1112 int i;
1113 unsigned int context;
1114 #ifdef DEBUG
1115 assert (state->halt);
1116 #endif
1117 context = re_string_context_at (mctx->input, idx, mctx->eflags,
1118 preg->newline_anchor);
1119 for (i = 0; i < state->nodes.nelem; ++i)
1120 if (check_halt_node_context (dfa, state->nodes.elems[i], context))
1121 return state->nodes.elems[i];
1122 return 0;
1123 }
1124
1125 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1126 corresponding to the DFA).
1127 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1128 of errors. */
1129
1130 static int
1131 proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs)
1132 const regex_t *preg;
1133 regmatch_t *regs;
1134 const re_match_context_t *mctx;
1135 int nregs, *pidx, node;
1136 re_node_set *eps_via_nodes;
1137 struct re_fail_stack_t *fs;
1138 {
1139 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1140 int i, err, dest_node;
1141 dest_node = -1;
1142 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1143 {
1144 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1145 int ndest, dest_nodes[2];
1146 err = re_node_set_insert (eps_via_nodes, node);
1147 if (BE (err < 0, 0))
1148 return -1;
1149 /* Pick up valid destinations. */
1150 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1151 {
1152 int candidate = dfa->edests[node].elems[i];
1153 if (!re_node_set_contains (cur_nodes, candidate))
1154 continue;
1155 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1156 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1157 ++ndest;
1158 }
1159 if (ndest <= 1)
1160 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1161 /* In order to avoid infinite loop like "(a*)*". */
1162 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1163 return dest_nodes[1];
1164 if (fs != NULL)
1165 push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes);
1166 return dest_nodes[0];
1167 }
1168 else
1169 {
1170 int naccepted = 0;
1171 re_token_type_t type = dfa->nodes[node].type;
1172
1173 #ifdef RE_ENABLE_I18N
1174 if (ACCEPT_MB_NODE (type))
1175 naccepted = check_node_accept_bytes (preg, node, mctx->input, *pidx);
1176 else
1177 #endif /* RE_ENABLE_I18N */
1178 if (type == OP_BACK_REF)
1179 {
1180 int subexp_idx = dfa->nodes[node].opr.idx;
1181 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1182 if (fs != NULL)
1183 {
1184 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1185 return -1;
1186 else if (naccepted)
1187 {
1188 char *buf = (char *) re_string_get_buffer (mctx->input);
1189 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1190 naccepted) != 0)
1191 return -1;
1192 }
1193 }
1194
1195 if (naccepted == 0)
1196 {
1197 err = re_node_set_insert (eps_via_nodes, node);
1198 if (BE (err < 0, 0))
1199 return -2;
1200 dest_node = dfa->edests[node].elems[0];
1201 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1202 dest_node))
1203 return dest_node;
1204 }
1205 }
1206
1207 if (naccepted != 0
1208 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
1209 {
1210 dest_node = dfa->nexts[node];
1211 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1212 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1213 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1214 dest_node)))
1215 return -1;
1216 re_node_set_empty (eps_via_nodes);
1217 return dest_node;
1218 }
1219 }
1220 return -1;
1221 }
1222
1223 static reg_errcode_t
1224 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1225 struct re_fail_stack_t *fs;
1226 int str_idx, *dests, nregs;
1227 regmatch_t *regs;
1228 re_node_set *eps_via_nodes;
1229 {
1230 reg_errcode_t err;
1231 int num = fs->num++;
1232 if (fs->num == fs->alloc)
1233 {
1234 struct re_fail_stack_ent_t *new_array;
1235 fs->alloc *= 2;
1236 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1237 * fs->alloc));
1238 if (new_array == NULL)
1239 return REG_ESPACE;
1240 fs->stack = new_array;
1241 }
1242 fs->stack[num].idx = str_idx;
1243 fs->stack[num].node = dests[1];
1244 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1245 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1246 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1247 return err;
1248 }
1249
1250 static int
1251 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1252 struct re_fail_stack_t *fs;
1253 int *pidx, nregs;
1254 regmatch_t *regs;
1255 re_node_set *eps_via_nodes;
1256 {
1257 int num = --fs->num;
1258 assert (num >= 0);
1259 *pidx = fs->stack[num].idx;
1260 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1261 re_node_set_free (eps_via_nodes);
1262 re_free (fs->stack[num].regs);
1263 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1264 return fs->stack[num].node;
1265 }
1266
1267 /* Set the positions where the subexpressions are starts/ends to registers
1268 PMATCH.
1269 Note: We assume that pmatch[0] is already set, and
1270 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1271
1272 static reg_errcode_t
1273 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1274 const regex_t *preg;
1275 const re_match_context_t *mctx;
1276 size_t nmatch;
1277 regmatch_t *pmatch;
1278 int fl_backtrack;
1279 {
1280 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1281 int idx, cur_node, real_nmatch;
1282 re_node_set eps_via_nodes;
1283 struct re_fail_stack_t *fs;
1284 struct re_fail_stack_t fs_body = {0, 2, NULL};
1285 #ifdef DEBUG
1286 assert (nmatch > 1);
1287 assert (mctx->state_log != NULL);
1288 #endif
1289 if (fl_backtrack)
1290 {
1291 fs = &fs_body;
1292 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1293 }
1294 else
1295 fs = NULL;
1296 cur_node = dfa->init_node;
1297 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1298 re_node_set_init_empty (&eps_via_nodes);
1299 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1300 {
1301 update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
1302 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1303 {
1304 int reg_idx;
1305 if (fs)
1306 {
1307 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1308 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1309 break;
1310 if (reg_idx == nmatch)
1311 {
1312 re_node_set_free (&eps_via_nodes);
1313 return free_fail_stack_return (fs);
1314 }
1315 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1316 &eps_via_nodes);
1317 }
1318 else
1319 {
1320 re_node_set_free (&eps_via_nodes);
1321 return REG_NOERROR;
1322 }
1323 }
1324
1325 /* Proceed to next node. */
1326 cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node,
1327 &eps_via_nodes, fs);
1328
1329 if (BE (cur_node < 0, 0))
1330 {
1331 if (cur_node == -2)
1332 return REG_ESPACE;
1333 if (fs)
1334 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1335 &eps_via_nodes);
1336 else
1337 {
1338 re_node_set_free (&eps_via_nodes);
1339 return REG_NOMATCH;
1340 }
1341 }
1342 }
1343 re_node_set_free (&eps_via_nodes);
1344 return free_fail_stack_return (fs);
1345 }
1346
1347 static reg_errcode_t
1348 free_fail_stack_return (fs)
1349 struct re_fail_stack_t *fs;
1350 {
1351 if (fs)
1352 {
1353 int fs_idx;
1354 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1355 {
1356 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1357 re_free (fs->stack[fs_idx].regs);
1358 }
1359 re_free (fs->stack);
1360 }
1361 return REG_NOERROR;
1362 }
1363
1364 static void
1365 update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1366 re_dfa_t *dfa;
1367 regmatch_t *pmatch;
1368 int cur_node, cur_idx, nmatch;
1369 {
1370 int type = dfa->nodes[cur_node].type;
1371 int reg_num;
1372 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1373 return;
1374 reg_num = dfa->nodes[cur_node].opr.idx + 1;
1375 if (reg_num >= nmatch)
1376 return;
1377 if (type == OP_OPEN_SUBEXP)
1378 {
1379 /* We are at the first node of this sub expression. */
1380 pmatch[reg_num].rm_so = cur_idx;
1381 pmatch[reg_num].rm_eo = -1;
1382 }
1383 else if (type == OP_CLOSE_SUBEXP)
1384 /* We are at the first node of this sub expression. */
1385 pmatch[reg_num].rm_eo = cur_idx;
1386 }
1387
1388 #define NUMBER_OF_STATE 1
1389
1390 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1391 and sift the nodes in each states according to the following rules.
1392 Updated state_log will be wrote to STATE_LOG.
1393
1394 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1395 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1396 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1397 the LAST_NODE, we throw away the node `a'.
1398 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1399 string `s' and transit to `b':
1400 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1401 away the node `a'.
1402 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1403 thrown away, we throw away the node `a'.
1404 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1405 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1406 node `a'.
1407 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1408 we throw away the node `a'. */
1409
1410 #define STATE_NODE_CONTAINS(state,node) \
1411 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1412
1413 static reg_errcode_t
1414 sift_states_backward (preg, mctx, sctx)
1415 const regex_t *preg;
1416 re_match_context_t *mctx;
1417 re_sift_context_t *sctx;
1418 {
1419 reg_errcode_t err;
1420 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1421 int null_cnt = 0;
1422 int str_idx = sctx->last_str_idx;
1423 re_node_set cur_dest;
1424 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
1425
1426 #ifdef DEBUG
1427 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1428 #endif
1429 cur_src = &mctx->state_log[str_idx]->nodes;
1430
1431 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1432 transit to the last_node and the last_node itself. */
1433 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1434 if (BE (err != REG_NOERROR, 0))
1435 return err;
1436 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1437 if (BE (err != REG_NOERROR, 0))
1438 goto free_return;
1439
1440 /* Then check each states in the state_log. */
1441 while (str_idx > 0)
1442 {
1443 int i, ret;
1444 /* Update counters. */
1445 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1446 if (null_cnt > mctx->max_mb_elem_len)
1447 {
1448 memset (sctx->sifted_states, '\0',
1449 sizeof (re_dfastate_t *) * str_idx);
1450 re_node_set_free (&cur_dest);
1451 return REG_NOERROR;
1452 }
1453 re_node_set_empty (&cur_dest);
1454 --str_idx;
1455 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1456 : &mctx->state_log[str_idx]->nodes);
1457
1458 /* Then build the next sifted state.
1459 We build the next sifted state on `cur_dest', and update
1460 `sifted_states[str_idx]' with `cur_dest'.
1461 Note:
1462 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1463 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1464 for (i = 0; i < cur_src->nelem; i++)
1465 {
1466 int prev_node = cur_src->elems[i];
1467 int naccepted = 0;
1468 re_token_type_t type = dfa->nodes[prev_node].type;
1469
1470 if (IS_EPSILON_NODE (type))
1471 continue;
1472 #ifdef RE_ENABLE_I18N
1473 /* If the node may accept `multi byte'. */
1474 if (ACCEPT_MB_NODE (type))
1475 naccepted = sift_states_iter_mb (preg, mctx, sctx, prev_node,
1476 str_idx, sctx->last_str_idx);
1477
1478 #endif /* RE_ENABLE_I18N */
1479 /* We don't check backreferences here.
1480 See update_cur_sifted_state(). */
1481
1482 if (!naccepted
1483 && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1484 str_idx)
1485 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1486 dfa->nexts[prev_node]))
1487 naccepted = 1;
1488
1489 if (naccepted == 0)
1490 continue;
1491
1492 if (sctx->limits.nelem)
1493 {
1494 int to_idx = str_idx + naccepted;
1495 if (check_dst_limits (dfa, &sctx->limits, mctx,
1496 dfa->nexts[prev_node], to_idx,
1497 prev_node, str_idx))
1498 continue;
1499 }
1500 ret = re_node_set_insert (&cur_dest, prev_node);
1501 if (BE (ret == -1, 0))
1502 {
1503 err = REG_ESPACE;
1504 goto free_return;
1505 }
1506 }
1507
1508 /* Add all the nodes which satisfy the following conditions:
1509 - It can epsilon transit to a node in CUR_DEST.
1510 - It is in CUR_SRC.
1511 And update state_log. */
1512 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1513 if (BE (err != REG_NOERROR, 0))
1514 goto free_return;
1515 }
1516 err = REG_NOERROR;
1517 free_return:
1518 re_node_set_free (&cur_dest);
1519 return err;
1520 }
1521
1522 /* Helper functions. */
1523
1524 static reg_errcode_t
1525 clean_state_log_if_need (mctx, next_state_log_idx)
1526 re_match_context_t *mctx;
1527 int next_state_log_idx;
1528 {
1529 int top = mctx->state_log_top;
1530
1531 if (next_state_log_idx >= mctx->input->bufs_len
1532 || (next_state_log_idx >= mctx->input->valid_len
1533 && mctx->input->valid_len < mctx->input->len))
1534 {
1535 reg_errcode_t err;
1536 err = extend_buffers (mctx);
1537 if (BE (err != REG_NOERROR, 0))
1538 return err;
1539 }
1540
1541 if (top < next_state_log_idx)
1542 {
1543 memset (mctx->state_log + top + 1, '\0',
1544 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1545 mctx->state_log_top = next_state_log_idx;
1546 }
1547 return REG_NOERROR;
1548 }
1549
1550 static reg_errcode_t
1551 merge_state_array (dfa, dst, src, num)
1552 re_dfa_t *dfa;
1553 re_dfastate_t **dst;
1554 re_dfastate_t **src;
1555 int num;
1556 {
1557 int st_idx;
1558 reg_errcode_t err;
1559 for (st_idx = 0; st_idx < num; ++st_idx)
1560 {
1561 if (dst[st_idx] == NULL)
1562 dst[st_idx] = src[st_idx];
1563 else if (src[st_idx] != NULL)
1564 {
1565 re_node_set merged_set;
1566 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1567 &src[st_idx]->nodes);
1568 if (BE (err != REG_NOERROR, 0))
1569 return err;
1570 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1571 re_node_set_free (&merged_set);
1572 if (BE (err != REG_NOERROR, 0))
1573 return err;
1574 }
1575 }
1576 return REG_NOERROR;
1577 }
1578
1579 static reg_errcode_t
1580 update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes)
1581 const regex_t *preg;
1582 re_match_context_t *mctx;
1583 re_sift_context_t *sctx;
1584 int str_idx;
1585 re_node_set *dest_nodes;
1586 {
1587 reg_errcode_t err;
1588 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1589 const re_node_set *candidates;
1590 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1591 : &mctx->state_log[str_idx]->nodes);
1592
1593 /* At first, add the nodes which can epsilon transit to a node in
1594 DEST_NODE. */
1595 if (dest_nodes->nelem)
1596 {
1597 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1598 if (BE (err != REG_NOERROR, 0))
1599 return err;
1600 }
1601
1602 /* Then, check the limitations in the current sift_context. */
1603 if (dest_nodes->nelem && sctx->limits.nelem)
1604 {
1605 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1606 mctx->bkref_ents, str_idx);
1607 if (BE (err != REG_NOERROR, 0))
1608 return err;
1609 }
1610
1611 /* Update state_log. */
1612 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1613 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1614 return err;
1615
1616 if ((mctx->state_log[str_idx] != NULL
1617 && mctx->state_log[str_idx]->has_backref))
1618 {
1619 err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes);
1620 if (BE (err != REG_NOERROR, 0))
1621 return err;
1622 }
1623 return REG_NOERROR;
1624 }
1625
1626 static reg_errcode_t
1627 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1628 re_dfa_t *dfa;
1629 re_node_set *dest_nodes;
1630 const re_node_set *candidates;
1631 {
1632 reg_errcode_t err;
1633 int src_idx;
1634 re_node_set src_copy;
1635
1636 err = re_node_set_init_copy (&src_copy, dest_nodes);
1637 if (BE (err != REG_NOERROR, 0))
1638 return err;
1639 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1640 {
1641 err = re_node_set_add_intersect (dest_nodes, candidates,
1642 dfa->inveclosures
1643 + src_copy.elems[src_idx]);
1644 if (BE (err != REG_NOERROR, 0))
1645 {
1646 re_node_set_free (&src_copy);
1647 return err;
1648 }
1649 }
1650 re_node_set_free (&src_copy);
1651 return REG_NOERROR;
1652 }
1653
1654 static reg_errcode_t
1655 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1656 re_dfa_t *dfa;
1657 int node;
1658 re_node_set *dest_nodes;
1659 const re_node_set *candidates;
1660 {
1661 int ecl_idx;
1662 reg_errcode_t err;
1663 re_node_set *inv_eclosure = dfa->inveclosures + node;
1664 re_node_set except_nodes;
1665 re_node_set_init_empty (&except_nodes);
1666 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1667 {
1668 int cur_node = inv_eclosure->elems[ecl_idx];
1669 if (cur_node == node)
1670 continue;
1671 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1672 {
1673 int edst1 = dfa->edests[cur_node].elems[0];
1674 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1675 ? dfa->edests[cur_node].elems[1] : -1);
1676 if ((!re_node_set_contains (inv_eclosure, edst1)
1677 && re_node_set_contains (dest_nodes, edst1))
1678 || (edst2 > 0
1679 && !re_node_set_contains (inv_eclosure, edst2)
1680 && re_node_set_contains (dest_nodes, edst2)))
1681 {
1682 err = re_node_set_add_intersect (&except_nodes, candidates,
1683 dfa->inveclosures + cur_node);
1684 if (BE (err != REG_NOERROR, 0))
1685 {
1686 re_node_set_free (&except_nodes);
1687 return err;
1688 }
1689 }
1690 }
1691 }
1692 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1693 {
1694 int cur_node = inv_eclosure->elems[ecl_idx];
1695 if (!re_node_set_contains (&except_nodes, cur_node))
1696 {
1697 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1698 re_node_set_remove_at (dest_nodes, idx);
1699 }
1700 }
1701 re_node_set_free (&except_nodes);
1702 return REG_NOERROR;
1703 }
1704
1705 static int
1706 check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx)
1707 re_dfa_t *dfa;
1708 re_node_set *limits;
1709 re_match_context_t *mctx;
1710 int dst_node, dst_idx, src_node, src_idx;
1711 {
1712 int lim_idx, src_pos, dst_pos;
1713
1714 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1715 {
1716 int subexp_idx;
1717 struct re_backref_cache_entry *ent;
1718 ent = mctx->bkref_ents + limits->elems[lim_idx];
1719 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1720
1721 dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1722 dfa->eclosures + dst_node,
1723 subexp_idx, dst_node, dst_idx);
1724 src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1725 dfa->eclosures + src_node,
1726 subexp_idx, src_node, src_idx);
1727
1728 /* In case of:
1729 <src> <dst> ( <subexp> )
1730 ( <subexp> ) <src> <dst>
1731 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1732 if (src_pos == dst_pos)
1733 continue; /* This is unrelated limitation. */
1734 else
1735 return 1;
1736 }
1737 return 0;
1738 }
1739
1740 static int
1741 check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, from_node,
1742 str_idx)
1743 re_dfa_t *dfa;
1744 re_match_context_t *mctx;
1745 re_node_set *eclosures;
1746 int limit, subexp_idx, from_node, str_idx;
1747 {
1748 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1749 int node_idx;
1750
1751 /* If we are outside the range of the subexpression, return -1 or 1. */
1752 if (str_idx < lim->subexp_from)
1753 return -1;
1754
1755 if (lim->subexp_to < str_idx)
1756 return 1;
1757
1758 /* If we are within the subexpression, return 0. */
1759 if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
1760 return 0;
1761
1762 /* Else, we are on the boundary: examine the nodes on the epsilon
1763 closure. */
1764 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1765 {
1766 int node = eclosures->elems[node_idx];
1767 switch (dfa->nodes[node].type)
1768 {
1769 case OP_BACK_REF:
1770 {
1771 int bi = search_cur_bkref_entry (mctx, str_idx);
1772 for (; bi < mctx->nbkref_ents; ++bi)
1773 {
1774 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
1775 int dst, cpos;
1776
1777 /* If this backreference goes beyond the point we're
1778 examining, don't go any further. */
1779 if (ent->str_idx > str_idx)
1780 break;
1781
1782 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1783 continue;
1784
1785 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1786 OP_CLOSE_SUBEXP cases below. But, if the
1787 destination node is the same node as the source
1788 node, don't recurse because it would cause an
1789 infinite loop: a regex that exhibits this behavior
1790 is ()\1*\1* */
1791 dst = dfa->edests[node].elems[0];
1792 if (dst == from_node)
1793 {
1794 if (str_idx == lim->subexp_from)
1795 return -1;
1796 else /* if (str_idx == lim->subexp_to) */
1797 return 0;
1798 }
1799
1800 cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
1801 dfa->eclosures + dst,
1802 subexp_idx, dst,
1803 str_idx);
1804
1805 if (cpos == -1 && str_idx == lim->subexp_from)
1806 return -1;
1807
1808 if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
1809 return 0;
1810 }
1811 break;
1812 }
1813
1814 case OP_OPEN_SUBEXP:
1815 if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
1816 return -1;
1817 break;
1818
1819 case OP_CLOSE_SUBEXP:
1820 if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
1821 return 0;
1822 break;
1823
1824 default:
1825 break;
1826 }
1827 }
1828
1829 if (str_idx == lim->subexp_to)
1830 return 1;
1831 else
1832 return 0;
1833 }
1834
1835 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1836 which are against limitations from DEST_NODES. */
1837
1838 static reg_errcode_t
1839 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1840 re_dfa_t *dfa;
1841 re_node_set *dest_nodes;
1842 const re_node_set *candidates;
1843 re_node_set *limits;
1844 struct re_backref_cache_entry *bkref_ents;
1845 int str_idx;
1846 {
1847 reg_errcode_t err;
1848 int node_idx, lim_idx;
1849
1850 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1851 {
1852 int subexp_idx;
1853 struct re_backref_cache_entry *ent;
1854 ent = bkref_ents + limits->elems[lim_idx];
1855
1856 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1857 continue; /* This is unrelated limitation. */
1858
1859 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1860 if (ent->subexp_to == str_idx)
1861 {
1862 int ops_node = -1;
1863 int cls_node = -1;
1864 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1865 {
1866 int node = dest_nodes->elems[node_idx];
1867 re_token_type_t type = dfa->nodes[node].type;
1868 if (type == OP_OPEN_SUBEXP
1869 && subexp_idx == dfa->nodes[node].opr.idx)
1870 ops_node = node;
1871 else if (type == OP_CLOSE_SUBEXP
1872 && subexp_idx == dfa->nodes[node].opr.idx)
1873 cls_node = node;
1874 }
1875
1876 /* Check the limitation of the open subexpression. */
1877 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1878 if (ops_node >= 0)
1879 {
1880 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1881 candidates);
1882 if (BE (err != REG_NOERROR, 0))
1883 return err;
1884 }
1885
1886 /* Check the limitation of the close subexpression. */
1887 if (cls_node >= 0)
1888 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1889 {
1890 int node = dest_nodes->elems[node_idx];
1891 if (!re_node_set_contains (dfa->inveclosures + node,
1892 cls_node)
1893 && !re_node_set_contains (dfa->eclosures + node,
1894 cls_node))
1895 {
1896 /* It is against this limitation.
1897 Remove it form the current sifted state. */
1898 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1899 candidates);
1900 if (BE (err != REG_NOERROR, 0))
1901 return err;
1902 --node_idx;
1903 }
1904 }
1905 }
1906 else /* (ent->subexp_to != str_idx) */
1907 {
1908 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1909 {
1910 int node = dest_nodes->elems[node_idx];
1911 re_token_type_t type = dfa->nodes[node].type;
1912 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1913 {
1914 if (subexp_idx != dfa->nodes[node].opr.idx)
1915 continue;
1916 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1917 || (type == OP_OPEN_SUBEXP))
1918 {
1919 /* It is against this limitation.
1920 Remove it form the current sifted state. */
1921 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1922 candidates);
1923 if (BE (err != REG_NOERROR, 0))
1924 return err;
1925 }
1926 }
1927 }
1928 }
1929 }
1930 return REG_NOERROR;
1931 }
1932
1933 static reg_errcode_t
1934 sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes)
1935 const regex_t *preg;
1936 re_match_context_t *mctx;
1937 re_sift_context_t *sctx;
1938 int str_idx;
1939 re_node_set *dest_nodes;
1940 {
1941 reg_errcode_t err;
1942 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1943 int node_idx, node;
1944 re_sift_context_t local_sctx;
1945 const re_node_set *candidates;
1946 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1947 : &mctx->state_log[str_idx]->nodes);
1948 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1949
1950 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
1951 {
1952 int cur_bkref_idx = re_string_cur_idx (mctx->input);
1953 re_token_type_t type;
1954 node = candidates->elems[node_idx];
1955 type = dfa->nodes[node].type;
1956 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
1957 continue;
1958 /* Avoid infinite loop for the REs like "()\1+". */
1959 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
1960 continue;
1961 if (type == OP_BACK_REF)
1962 {
1963 int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
1964 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
1965 {
1966 int disabled_idx, subexp_len, to_idx, dst_node;
1967 struct re_backref_cache_entry *entry;
1968 entry = mctx->bkref_ents + enabled_idx;
1969 if (entry->str_idx > str_idx)
1970 break;
1971 if (entry->node != node)
1972 continue;
1973 subexp_len = entry->subexp_to - entry->subexp_from;
1974 to_idx = str_idx + subexp_len;
1975 dst_node = (subexp_len ? dfa->nexts[node]
1976 : dfa->edests[node].elems[0]);
1977
1978 if (to_idx > sctx->last_str_idx
1979 || sctx->sifted_states[to_idx] == NULL
1980 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
1981 dst_node)
1982 || check_dst_limits (dfa, &sctx->limits, mctx, node,
1983 str_idx, dst_node, to_idx))
1984 continue;
1985 {
1986 re_dfastate_t *cur_state;
1987 entry->flag = 0;
1988 for (disabled_idx = enabled_idx + 1;
1989 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
1990 {
1991 struct re_backref_cache_entry *entry2;
1992 entry2 = mctx->bkref_ents + disabled_idx;
1993 if (entry2->str_idx > str_idx)
1994 break;
1995 entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
1996 }
1997
1998 if (local_sctx.sifted_states == NULL)
1999 {
2000 local_sctx = *sctx;
2001 err = re_node_set_init_copy (&local_sctx.limits,
2002 &sctx->limits);
2003 if (BE (err != REG_NOERROR, 0))
2004 goto free_return;
2005 }
2006 local_sctx.last_node = node;
2007 local_sctx.last_str_idx = str_idx;
2008 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2009 if (BE (err < 0, 0))
2010 {
2011 err = REG_ESPACE;
2012 goto free_return;
2013 }
2014 cur_state = local_sctx.sifted_states[str_idx];
2015 err = sift_states_backward (preg, mctx, &local_sctx);
2016 if (BE (err != REG_NOERROR, 0))
2017 goto free_return;
2018 if (sctx->limited_states != NULL)
2019 {
2020 err = merge_state_array (dfa, sctx->limited_states,
2021 local_sctx.sifted_states,
2022 str_idx + 1);
2023 if (BE (err != REG_NOERROR, 0))
2024 goto free_return;
2025 }
2026 local_sctx.sifted_states[str_idx] = cur_state;
2027 re_node_set_remove (&local_sctx.limits, enabled_idx);
2028 /* We must not use the variable entry here, since
2029 mctx->bkref_ents might be realloced. */
2030 mctx->bkref_ents[enabled_idx].flag = 1;
2031 }
2032 }
2033 enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2034 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2035 {
2036 struct re_backref_cache_entry *entry;
2037 entry = mctx->bkref_ents + enabled_idx;
2038 if (entry->str_idx > str_idx)
2039 break;
2040 if (entry->node == node)
2041 entry->flag = 0;
2042 }
2043 }
2044 }
2045 err = REG_NOERROR;
2046 free_return:
2047 if (local_sctx.sifted_states != NULL)
2048 {
2049 re_node_set_free (&local_sctx.limits);
2050 }
2051
2052 return err;
2053 }
2054
2055
2056 #ifdef RE_ENABLE_I18N
2057 static int
2058 sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx)
2059 const regex_t *preg;
2060 const re_match_context_t *mctx;
2061 re_sift_context_t *sctx;
2062 int node_idx, str_idx, max_str_idx;
2063 {
2064 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2065 int naccepted;
2066 /* Check the node can accept `multi byte'. */
2067 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
2068 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2069 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2070 dfa->nexts[node_idx]))
2071 /* The node can't accept the `multi byte', or the
2072 destination was already thrown away, then the node
2073 could't accept the current input `multi byte'. */
2074 naccepted = 0;
2075 /* Otherwise, it is sure that the node could accept
2076 `naccepted' bytes input. */
2077 return naccepted;
2078 }
2079 #endif /* RE_ENABLE_I18N */
2080
2081 \f
2082 /* Functions for state transition. */
2083
2084 /* Return the next state to which the current state STATE will transit by
2085 accepting the current input byte, and update STATE_LOG if necessary.
2086 If STATE can accept a multibyte char/collating element/back reference
2087 update the destination of STATE_LOG. */
2088
2089 static re_dfastate_t *
2090 transit_state (err, preg, mctx, state)
2091 reg_errcode_t *err;
2092 const regex_t *preg;
2093 re_match_context_t *mctx;
2094 re_dfastate_t *state;
2095 {
2096 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2097 re_dfastate_t **trtable, *next_state;
2098 unsigned char ch;
2099 int cur_idx;
2100
2101 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
2102 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
2103 && mctx->input->valid_len < mctx->input->len))
2104 {
2105 *err = extend_buffers (mctx);
2106 if (BE (*err != REG_NOERROR, 0))
2107 return NULL;
2108 }
2109
2110 *err = REG_NOERROR;
2111 if (state == NULL)
2112 {
2113 next_state = state;
2114 re_string_skip_bytes (mctx->input, 1);
2115 }
2116 else
2117 {
2118 #ifdef RE_ENABLE_I18N
2119 /* If the current state can accept multibyte. */
2120 if (state->accept_mb)
2121 {
2122 *err = transit_state_mb (preg, state, mctx);
2123 if (BE (*err != REG_NOERROR, 0))
2124 return NULL;
2125 }
2126 #endif /* RE_ENABLE_I18N */
2127
2128 /* Then decide the next state with the single byte. */
2129 if (1)
2130 {
2131 /* Use transition table */
2132 ch = re_string_fetch_byte (mctx->input);
2133 trtable = state->trtable;
2134 if (trtable == NULL)
2135 {
2136 trtable = build_trtable (preg, state);
2137 if (trtable == NULL)
2138 {
2139 *err = REG_ESPACE;
2140 return NULL;
2141 }
2142 }
2143 if (BE (state->word_trtable, 0))
2144 {
2145 unsigned int context;
2146 context
2147 = re_string_context_at (mctx->input,
2148 re_string_cur_idx (mctx->input) - 1,
2149 mctx->eflags, preg->newline_anchor);
2150 if (IS_WORD_CONTEXT (context))
2151 next_state = trtable[ch + SBC_MAX];
2152 else
2153 next_state = trtable[ch];
2154 }
2155 else
2156 next_state = trtable[ch];
2157 }
2158 #if 0
2159 else
2160 {
2161 /* don't use transition table */
2162 next_state = transit_state_sb (err, preg, state, mctx);
2163 if (BE (next_state == NULL && err != REG_NOERROR, 0))
2164 return NULL;
2165 }
2166 #endif
2167 }
2168
2169 cur_idx = re_string_cur_idx (mctx->input);
2170 /* Update the state_log if we need. */
2171 if (mctx->state_log != NULL)
2172 {
2173 if (cur_idx > mctx->state_log_top)
2174 {
2175 mctx->state_log[cur_idx] = next_state;
2176 mctx->state_log_top = cur_idx;
2177 }
2178 else if (mctx->state_log[cur_idx] == 0)
2179 {
2180 mctx->state_log[cur_idx] = next_state;
2181 }
2182 else
2183 {
2184 re_dfastate_t *pstate;
2185 unsigned int context;
2186 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2187 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2188 the destination of a multibyte char/collating element/
2189 back reference. Then the next state is the union set of
2190 these destinations and the results of the transition table. */
2191 pstate = mctx->state_log[cur_idx];
2192 log_nodes = pstate->entrance_nodes;
2193 if (next_state != NULL)
2194 {
2195 table_nodes = next_state->entrance_nodes;
2196 *err = re_node_set_init_union (&next_nodes, table_nodes,
2197 log_nodes);
2198 if (BE (*err != REG_NOERROR, 0))
2199 return NULL;
2200 }
2201 else
2202 next_nodes = *log_nodes;
2203 /* Note: We already add the nodes of the initial state,
2204 then we don't need to add them here. */
2205
2206 context = re_string_context_at (mctx->input,
2207 re_string_cur_idx (mctx->input) - 1,
2208 mctx->eflags, preg->newline_anchor);
2209 next_state = mctx->state_log[cur_idx]
2210 = re_acquire_state_context (err, dfa, &next_nodes, context);
2211 /* We don't need to check errors here, since the return value of
2212 this function is next_state and ERR is already set. */
2213
2214 if (table_nodes != NULL)
2215 re_node_set_free (&next_nodes);
2216 }
2217 }
2218
2219 if (BE (dfa->nbackref, 0) && next_state != NULL)
2220 {
2221 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2222 later. We must check them here, since the back references in the
2223 next state might use them. */
2224 *err = check_subexp_matching_top (dfa, mctx, &next_state->nodes,
2225 cur_idx);
2226 if (BE (*err != REG_NOERROR, 0))
2227 return NULL;
2228
2229 /* If the next state has back references. */
2230 if (next_state->has_backref)
2231 {
2232 *err = transit_state_bkref (preg, &next_state->nodes, mctx);
2233 if (BE (*err != REG_NOERROR, 0))
2234 return NULL;
2235 next_state = mctx->state_log[cur_idx];
2236 }
2237 }
2238 return next_state;
2239 }
2240
2241 /* Helper functions for transit_state. */
2242
2243 /* From the node set CUR_NODES, pick up the nodes whose types are
2244 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2245 expression. And register them to use them later for evaluating the
2246 correspoding back references. */
2247
2248 static reg_errcode_t
2249 check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx)
2250 re_dfa_t *dfa;
2251 re_match_context_t *mctx;
2252 re_node_set *cur_nodes;
2253 int str_idx;
2254 {
2255 int node_idx;
2256 reg_errcode_t err;
2257
2258 /* TODO: This isn't efficient.
2259 Because there might be more than one nodes whose types are
2260 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2261 nodes.
2262 E.g. RE: (a){2} */
2263 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2264 {
2265 int node = cur_nodes->elems[node_idx];
2266 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2267 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2268 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2269 {
2270 err = match_ctx_add_subtop (mctx, node, str_idx);
2271 if (BE (err != REG_NOERROR, 0))
2272 return err;
2273 }
2274 }
2275 return REG_NOERROR;
2276 }
2277
2278 #if 0
2279 /* Return the next state to which the current state STATE will transit by
2280 accepting the current input byte. */
2281
2282 static re_dfastate_t *
2283 transit_state_sb (err, preg, state, mctx)
2284 reg_errcode_t *err;
2285 const regex_t *preg;
2286 re_dfastate_t *state;
2287 re_match_context_t *mctx;
2288 {
2289 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2290 re_node_set next_nodes;
2291 re_dfastate_t *next_state;
2292 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
2293 unsigned int context;
2294
2295 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2296 if (BE (*err != REG_NOERROR, 0))
2297 return NULL;
2298 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2299 {
2300 int cur_node = state->nodes.elems[node_cnt];
2301 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
2302 {
2303 *err = re_node_set_merge (&next_nodes,
2304 dfa->eclosures + dfa->nexts[cur_node]);
2305 if (BE (*err != REG_NOERROR, 0))
2306 {
2307 re_node_set_free (&next_nodes);
2308 return NULL;
2309 }
2310 }
2311 }
2312 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
2313 preg->newline_anchor);
2314 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2315 /* We don't need to check errors here, since the return value of
2316 this function is next_state and ERR is already set. */
2317
2318 re_node_set_free (&next_nodes);
2319 re_string_skip_bytes (mctx->input, 1);
2320 return next_state;
2321 }
2322 #endif
2323
2324 #ifdef RE_ENABLE_I18N
2325 static reg_errcode_t
2326 transit_state_mb (preg, pstate, mctx)
2327 const regex_t *preg;
2328 re_dfastate_t *pstate;
2329 re_match_context_t *mctx;
2330 {
2331 reg_errcode_t err;
2332 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2333 int i;
2334
2335 for (i = 0; i < pstate->nodes.nelem; ++i)
2336 {
2337 re_node_set dest_nodes, *new_nodes;
2338 int cur_node_idx = pstate->nodes.elems[i];
2339 int naccepted = 0, dest_idx;
2340 unsigned int context;
2341 re_dfastate_t *dest_state;
2342
2343 if (dfa->nodes[cur_node_idx].constraint)
2344 {
2345 context = re_string_context_at (mctx->input,
2346 re_string_cur_idx (mctx->input),
2347 mctx->eflags, preg->newline_anchor);
2348 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2349 context))
2350 continue;
2351 }
2352
2353 /* How many bytes the node can accept? */
2354 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2355 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
2356 re_string_cur_idx (mctx->input));
2357 if (naccepted == 0)
2358 continue;
2359
2360 /* The node can accepts `naccepted' bytes. */
2361 dest_idx = re_string_cur_idx (mctx->input) + naccepted;
2362 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2363 : mctx->max_mb_elem_len);
2364 err = clean_state_log_if_need (mctx, dest_idx);
2365 if (BE (err != REG_NOERROR, 0))
2366 return err;
2367 #ifdef DEBUG
2368 assert (dfa->nexts[cur_node_idx] != -1);
2369 #endif
2370 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2371 then we use pstate->nodes.elems[i] instead. */
2372 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2373
2374 dest_state = mctx->state_log[dest_idx];
2375 if (dest_state == NULL)
2376 dest_nodes = *new_nodes;
2377 else
2378 {
2379 err = re_node_set_init_union (&dest_nodes,
2380 dest_state->entrance_nodes, new_nodes);
2381 if (BE (err != REG_NOERROR, 0))
2382 return err;
2383 }
2384 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
2385 preg->newline_anchor);
2386 mctx->state_log[dest_idx]
2387 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2388 if (dest_state != NULL)
2389 re_node_set_free (&dest_nodes);
2390 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2391 return err;
2392 }
2393 return REG_NOERROR;
2394 }
2395 #endif /* RE_ENABLE_I18N */
2396
2397 static reg_errcode_t
2398 transit_state_bkref (preg, nodes, mctx)
2399 const regex_t *preg;
2400 const re_node_set *nodes;
2401 re_match_context_t *mctx;
2402 {
2403 reg_errcode_t err;
2404 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2405 int i;
2406 int cur_str_idx = re_string_cur_idx (mctx->input);
2407
2408 for (i = 0; i < nodes->nelem; ++i)
2409 {
2410 int dest_str_idx, prev_nelem, bkc_idx;
2411 int node_idx = nodes->elems[i];
2412 unsigned int context;
2413 const re_token_t *node = dfa->nodes + node_idx;
2414 re_node_set *new_dest_nodes;
2415
2416 /* Check whether `node' is a backreference or not. */
2417 if (node->type != OP_BACK_REF)
2418 continue;
2419
2420 if (node->constraint)
2421 {
2422 context = re_string_context_at (mctx->input, cur_str_idx,
2423 mctx->eflags, preg->newline_anchor);
2424 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2425 continue;
2426 }
2427
2428 /* `node' is a backreference.
2429 Check the substring which the substring matched. */
2430 bkc_idx = mctx->nbkref_ents;
2431 err = get_subexp (preg, mctx, node_idx, cur_str_idx);
2432 if (BE (err != REG_NOERROR, 0))
2433 goto free_return;
2434
2435 /* And add the epsilon closures (which is `new_dest_nodes') of
2436 the backreference to appropriate state_log. */
2437 #ifdef DEBUG
2438 assert (dfa->nexts[node_idx] != -1);
2439 #endif
2440 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2441 {
2442 int subexp_len;
2443 re_dfastate_t *dest_state;
2444 struct re_backref_cache_entry *bkref_ent;
2445 bkref_ent = mctx->bkref_ents + bkc_idx;
2446 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2447 continue;
2448 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2449 new_dest_nodes = (subexp_len == 0
2450 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2451 : dfa->eclosures + dfa->nexts[node_idx]);
2452 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2453 - bkref_ent->subexp_from);
2454 context = re_string_context_at (mctx->input, dest_str_idx - 1,
2455 mctx->eflags, preg->newline_anchor);
2456 dest_state = mctx->state_log[dest_str_idx];
2457 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2458 : mctx->state_log[cur_str_idx]->nodes.nelem);
2459 /* Add `new_dest_node' to state_log. */
2460 if (dest_state == NULL)
2461 {
2462 mctx->state_log[dest_str_idx]
2463 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2464 context);
2465 if (BE (mctx->state_log[dest_str_idx] == NULL
2466 && err != REG_NOERROR, 0))
2467 goto free_return;
2468 }
2469 else
2470 {
2471 re_node_set dest_nodes;
2472 err = re_node_set_init_union (&dest_nodes,
2473 dest_state->entrance_nodes,
2474 new_dest_nodes);
2475 if (BE (err != REG_NOERROR, 0))
2476 {
2477 re_node_set_free (&dest_nodes);
2478 goto free_return;
2479 }
2480 mctx->state_log[dest_str_idx]
2481 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2482 re_node_set_free (&dest_nodes);
2483 if (BE (mctx->state_log[dest_str_idx] == NULL
2484 && err != REG_NOERROR, 0))
2485 goto free_return;
2486 }
2487 /* We need to check recursively if the backreference can epsilon
2488 transit. */
2489 if (subexp_len == 0
2490 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2491 {
2492 err = check_subexp_matching_top (dfa, mctx, new_dest_nodes,
2493 cur_str_idx);
2494 if (BE (err != REG_NOERROR, 0))
2495 goto free_return;
2496 err = transit_state_bkref (preg, new_dest_nodes, mctx);
2497 if (BE (err != REG_NOERROR, 0))
2498 goto free_return;
2499 }
2500 }
2501 }
2502 err = REG_NOERROR;
2503 free_return:
2504 return err;
2505 }
2506
2507 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2508 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2509 Note that we might collect inappropriate candidates here.
2510 However, the cost of checking them strictly here is too high, then we
2511 delay these checking for prune_impossible_nodes(). */
2512
2513 static reg_errcode_t
2514 get_subexp (preg, mctx, bkref_node, bkref_str_idx)
2515 const regex_t *preg;
2516 re_match_context_t *mctx;
2517 int bkref_node, bkref_str_idx;
2518 {
2519 int subexp_num, sub_top_idx;
2520 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2521 const char *buf = (const char *) re_string_get_buffer (mctx->input);
2522 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2523 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2524 for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2525 {
2526 const struct re_backref_cache_entry *entry
2527 = &mctx->bkref_ents[cache_idx];
2528 if (entry->str_idx > bkref_str_idx)
2529 break;
2530 if (entry->node == bkref_node)
2531 return REG_NOERROR; /* We already checked it. */
2532 }
2533 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2534
2535 /* For each sub expression */
2536 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2537 {
2538 reg_errcode_t err;
2539 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2540 re_sub_match_last_t *sub_last;
2541 int sub_last_idx, sl_str;
2542 const char *bkref_str;
2543
2544 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2545 continue; /* It isn't related. */
2546
2547 sl_str = sub_top->str_idx;
2548 bkref_str = buf + bkref_str_idx;
2549 /* At first, check the last node of sub expressions we already
2550 evaluated. */
2551 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2552 {
2553 int sl_str_diff;
2554 sub_last = sub_top->lasts[sub_last_idx];
2555 sl_str_diff = sub_last->str_idx - sl_str;
2556 /* The matched string by the sub expression match with the substring
2557 at the back reference? */
2558 if (sl_str_diff > 0
2559 && memcmp (bkref_str, buf + sl_str, sl_str_diff) != 0)
2560 break; /* We don't need to search this sub expression any more. */
2561 bkref_str += sl_str_diff;
2562 sl_str += sl_str_diff;
2563 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2564 bkref_str_idx);
2565
2566 /* Reload buf and bkref_str, since the preceding call might
2567 have reallocated the buffer. */
2568 buf = (const char *) re_string_get_buffer (mctx->input);
2569 bkref_str = buf + bkref_str_idx;
2570
2571 if (err == REG_NOMATCH)
2572 continue;
2573 if (BE (err != REG_NOERROR, 0))
2574 return err;
2575 }
2576 if (sub_last_idx < sub_top->nlasts)
2577 continue;
2578 if (sub_last_idx > 0)
2579 ++sl_str;
2580 /* Then, search for the other last nodes of the sub expression. */
2581 for (; sl_str <= bkref_str_idx; ++sl_str)
2582 {
2583 int cls_node, sl_str_off;
2584 const re_node_set *nodes;
2585 sl_str_off = sl_str - sub_top->str_idx;
2586 /* The matched string by the sub expression match with the substring
2587 at the back reference? */
2588 if (sl_str_off > 0 && *bkref_str++ != buf[sl_str - 1])
2589 break; /* We don't need to search this sub expression any more. */
2590 if (mctx->state_log[sl_str] == NULL)
2591 continue;
2592 /* Does this state have a ')' of the sub expression? */
2593 nodes = &mctx->state_log[sl_str]->nodes;
2594 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2595 if (cls_node == -1)
2596 continue; /* No. */
2597 if (sub_top->path == NULL)
2598 {
2599 sub_top->path = calloc (sizeof (state_array_t),
2600 sl_str - sub_top->str_idx + 1);
2601 if (sub_top->path == NULL)
2602 return REG_ESPACE;
2603 }
2604 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2605 in the current context? */
2606 err = check_arrival (preg, mctx, sub_top->path, sub_top->node,
2607 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2608 if (err == REG_NOMATCH)
2609 continue;
2610 if (BE (err != REG_NOERROR, 0))
2611 return err;
2612 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2613 if (BE (sub_last == NULL, 0))
2614 return REG_ESPACE;
2615 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2616 bkref_str_idx);
2617 if (err == REG_NOMATCH)
2618 continue;
2619 }
2620 }
2621 return REG_NOERROR;
2622 }
2623
2624 /* Helper functions for get_subexp(). */
2625
2626 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2627 If it can arrive, register the sub expression expressed with SUB_TOP
2628 and SUB_LAST. */
2629
2630 static reg_errcode_t
2631 get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, bkref_str)
2632 const regex_t *preg;
2633 re_match_context_t *mctx;
2634 const re_sub_match_top_t *sub_top;
2635 re_sub_match_last_t *sub_last;
2636 int bkref_node, bkref_str;
2637 {
2638 reg_errcode_t err;
2639 int to_idx;
2640 /* Can the subexpression arrive the back reference? */
2641 err = check_arrival (preg, mctx, &sub_last->path, sub_last->node,
2642 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2643 if (err != REG_NOERROR)
2644 return err;
2645 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2646 sub_last->str_idx);
2647 if (BE (err != REG_NOERROR, 0))
2648 return err;
2649 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2650 clean_state_log_if_need (mctx, to_idx);
2651 return REG_NOERROR;
2652 }
2653
2654 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2655 Search '(' if FL_OPEN, or search ')' otherwise.
2656 TODO: This function isn't efficient...
2657 Because there might be more than one nodes whose types are
2658 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2659 nodes.
2660 E.g. RE: (a){2} */
2661
2662 static int
2663 find_subexp_node (dfa, nodes, subexp_idx, type)
2664 const re_dfa_t *dfa;
2665 const re_node_set *nodes;
2666 int subexp_idx, type;
2667 {
2668 int cls_idx;
2669 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2670 {
2671 int cls_node = nodes->elems[cls_idx];
2672 const re_token_t *node = dfa->nodes + cls_node;
2673 if (node->type == type
2674 && node->opr.idx == subexp_idx)
2675 return cls_node;
2676 }
2677 return -1;
2678 }
2679
2680 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2681 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2682 heavily reused.
2683 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2684
2685 static reg_errcode_t
2686 check_arrival (preg, mctx, path, top_node, top_str, last_node, last_str,
2687 type)
2688 const regex_t *preg;
2689 re_match_context_t *mctx;
2690 state_array_t *path;
2691 int top_node, top_str, last_node, last_str, type;
2692 {
2693 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2694 reg_errcode_t err;
2695 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2696 re_dfastate_t *cur_state = NULL;
2697 re_node_set *cur_nodes, next_nodes;
2698 re_dfastate_t **backup_state_log;
2699 unsigned int context;
2700
2701 subexp_num = dfa->nodes[top_node].opr.idx;
2702 /* Extend the buffer if we need. */
2703 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2704 {
2705 re_dfastate_t **new_array;
2706 int old_alloc = path->alloc;
2707 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2708 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2709 if (new_array == NULL)
2710 {
2711 path->alloc = old_alloc;
2712 return REG_ESPACE;
2713 }
2714 path->array = new_array;
2715 memset (new_array + old_alloc, '\0',
2716 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2717 }
2718
2719 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2720
2721 /* Temporary modify MCTX. */
2722 backup_state_log = mctx->state_log;
2723 backup_cur_idx = mctx->input->cur_idx;
2724 mctx->state_log = path->array;
2725 mctx->input->cur_idx = str_idx;
2726
2727 /* Setup initial node set. */
2728 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2729 preg->newline_anchor);
2730 if (str_idx == top_str)
2731 {
2732 err = re_node_set_init_1 (&next_nodes, top_node);
2733 if (BE (err != REG_NOERROR, 0))
2734 return err;
2735 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2736 if (BE (err != REG_NOERROR, 0))
2737 {
2738 re_node_set_free (&next_nodes);
2739 return err;
2740 }
2741 }
2742 else
2743 {
2744 cur_state = mctx->state_log[str_idx];
2745 if (cur_state && cur_state->has_backref)
2746 {
2747 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2748 if (BE ( err != REG_NOERROR, 0))
2749 return err;
2750 }
2751 else
2752 re_node_set_init_empty (&next_nodes);
2753 }
2754 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2755 {
2756 if (next_nodes.nelem)
2757 {
2758 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
2759 subexp_num, type);
2760 if (BE ( err != REG_NOERROR, 0))
2761 {
2762 re_node_set_free (&next_nodes);
2763 return err;
2764 }
2765 }
2766 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2767 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2768 {
2769 re_node_set_free (&next_nodes);
2770 return err;
2771 }
2772 mctx->state_log[str_idx] = cur_state;
2773 }
2774
2775 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2776 {
2777 re_node_set_empty (&next_nodes);
2778 if (mctx->state_log[str_idx + 1])
2779 {
2780 err = re_node_set_merge (&next_nodes,
2781 &mctx->state_log[str_idx + 1]->nodes);
2782 if (BE (err != REG_NOERROR, 0))
2783 {
2784 re_node_set_free (&next_nodes);
2785 return err;
2786 }
2787 }
2788 if (cur_state)
2789 {
2790 err = check_arrival_add_next_nodes (preg, dfa, mctx, str_idx,
2791 &cur_state->nodes, &next_nodes);
2792 if (BE (err != REG_NOERROR, 0))
2793 {
2794 re_node_set_free (&next_nodes);
2795 return err;
2796 }
2797 }
2798 ++str_idx;
2799 if (next_nodes.nelem)
2800 {
2801 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2802 if (BE (err != REG_NOERROR, 0))
2803 {
2804 re_node_set_free (&next_nodes);
2805 return err;
2806 }
2807 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
2808 subexp_num, type);
2809 if (BE ( err != REG_NOERROR, 0))
2810 {
2811 re_node_set_free (&next_nodes);
2812 return err;
2813 }
2814 }
2815 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2816 preg->newline_anchor);
2817 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2818 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2819 {
2820 re_node_set_free (&next_nodes);
2821 return err;
2822 }
2823 mctx->state_log[str_idx] = cur_state;
2824 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2825 }
2826 re_node_set_free (&next_nodes);
2827 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2828 : &mctx->state_log[last_str]->nodes);
2829 path->next_idx = str_idx;
2830
2831 /* Fix MCTX. */
2832 mctx->state_log = backup_state_log;
2833 mctx->input->cur_idx = backup_cur_idx;
2834
2835 /* Then check the current node set has the node LAST_NODE. */
2836 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2837 return REG_NOERROR;
2838
2839 return REG_NOMATCH;
2840 }
2841
2842 /* Helper functions for check_arrival. */
2843
2844 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2845 to NEXT_NODES.
2846 TODO: This function is similar to the functions transit_state*(),
2847 however this function has many additional works.
2848 Can't we unify them? */
2849
2850 static reg_errcode_t
2851 check_arrival_add_next_nodes (preg, dfa, mctx, str_idx, cur_nodes, next_nodes)
2852 const regex_t *preg;
2853 re_dfa_t *dfa;
2854 re_match_context_t *mctx;
2855 int str_idx;
2856 re_node_set *cur_nodes, *next_nodes;
2857 {
2858 int cur_idx;
2859 reg_errcode_t err;
2860 re_node_set union_set;
2861 re_node_set_init_empty (&union_set);
2862 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2863 {
2864 int naccepted = 0;
2865 int cur_node = cur_nodes->elems[cur_idx];
2866 re_token_type_t type = dfa->nodes[cur_node].type;
2867 if (IS_EPSILON_NODE (type))
2868 continue;
2869 #ifdef RE_ENABLE_I18N
2870 /* If the node may accept `multi byte'. */
2871 if (ACCEPT_MB_NODE (type))
2872 {
2873 naccepted = check_node_accept_bytes (preg, cur_node, mctx->input,
2874 str_idx);
2875 if (naccepted > 1)
2876 {
2877 re_dfastate_t *dest_state;
2878 int next_node = dfa->nexts[cur_node];
2879 int next_idx = str_idx + naccepted;
2880 dest_state = mctx->state_log[next_idx];
2881 re_node_set_empty (&union_set);
2882 if (dest_state)
2883 {
2884 err = re_node_set_merge (&union_set, &dest_state->nodes);
2885 if (BE (err != REG_NOERROR, 0))
2886 {
2887 re_node_set_free (&union_set);
2888 return err;
2889 }
2890 err = re_node_set_insert (&union_set, next_node);
2891 if (BE (err < 0, 0))
2892 {
2893 re_node_set_free (&union_set);
2894 return REG_ESPACE;
2895 }
2896 }
2897 else
2898 {
2899 err = re_node_set_insert (&union_set, next_node);
2900 if (BE (err < 0, 0))
2901 {
2902 re_node_set_free (&union_set);
2903 return REG_ESPACE;
2904 }
2905 }
2906 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2907 &union_set);
2908 if (BE (mctx->state_log[next_idx] == NULL
2909 && err != REG_NOERROR, 0))
2910 {
2911 re_node_set_free (&union_set);
2912 return err;
2913 }
2914 }
2915 }
2916 #endif /* RE_ENABLE_I18N */
2917 if (naccepted
2918 || check_node_accept (preg, dfa->nodes + cur_node, mctx,
2919 str_idx))
2920 {
2921 err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
2922 if (BE (err < 0, 0))
2923 {
2924 re_node_set_free (&union_set);
2925 return REG_ESPACE;
2926 }
2927 }
2928 }
2929 re_node_set_free (&union_set);
2930 return REG_NOERROR;
2931 }
2932
2933 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
2934 CUR_NODES, however exclude the nodes which are:
2935 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
2936 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
2937 */
2938
2939 static reg_errcode_t
2940 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
2941 re_dfa_t *dfa;
2942 re_node_set *cur_nodes;
2943 int ex_subexp, type;
2944 {
2945 reg_errcode_t err;
2946 int idx, outside_node;
2947 re_node_set new_nodes;
2948 #ifdef DEBUG
2949 assert (cur_nodes->nelem);
2950 #endif
2951 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
2952 if (BE (err != REG_NOERROR, 0))
2953 return err;
2954 /* Create a new node set NEW_NODES with the nodes which are epsilon
2955 closures of the node in CUR_NODES. */
2956
2957 for (idx = 0; idx < cur_nodes->nelem; ++idx)
2958 {
2959 int cur_node = cur_nodes->elems[idx];
2960 re_node_set *eclosure = dfa->eclosures + cur_node;
2961 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
2962 if (outside_node == -1)
2963 {
2964 /* There are no problematic nodes, just merge them. */
2965 err = re_node_set_merge (&new_nodes, eclosure);
2966 if (BE (err != REG_NOERROR, 0))
2967 {
2968 re_node_set_free (&new_nodes);
2969 return err;
2970 }
2971 }
2972 else
2973 {
2974 /* There are problematic nodes, re-calculate incrementally. */
2975 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
2976 ex_subexp, type);
2977 if (BE (err != REG_NOERROR, 0))
2978 {
2979 re_node_set_free (&new_nodes);
2980 return err;
2981 }
2982 }
2983 }
2984 re_node_set_free (cur_nodes);
2985 *cur_nodes = new_nodes;
2986 return REG_NOERROR;
2987 }
2988
2989 /* Helper function for check_arrival_expand_ecl.
2990 Check incrementally the epsilon closure of TARGET, and if it isn't
2991 problematic append it to DST_NODES. */
2992
2993 static reg_errcode_t
2994 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
2995 re_dfa_t *dfa;
2996 int target, ex_subexp, type;
2997 re_node_set *dst_nodes;
2998 {
2999 int cur_node;
3000 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3001 {
3002 int err;
3003
3004 if (dfa->nodes[cur_node].type == type
3005 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3006 {
3007 if (type == OP_CLOSE_SUBEXP)
3008 {
3009 err = re_node_set_insert (dst_nodes, cur_node);
3010 if (BE (err == -1, 0))
3011 return REG_ESPACE;
3012 }
3013 break;
3014 }
3015 err = re_node_set_insert (dst_nodes, cur_node);
3016 if (BE (err == -1, 0))
3017 return REG_ESPACE;
3018 if (dfa->edests[cur_node].nelem == 0)
3019 break;
3020 if (dfa->edests[cur_node].nelem == 2)
3021 {
3022 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3023 dfa->edests[cur_node].elems[1],
3024 ex_subexp, type);
3025 if (BE (err != REG_NOERROR, 0))
3026 return err;
3027 }
3028 cur_node = dfa->edests[cur_node].elems[0];
3029 }
3030 return REG_NOERROR;
3031 }
3032
3033
3034 /* For all the back references in the current state, calculate the
3035 destination of the back references by the appropriate entry
3036 in MCTX->BKREF_ENTS. */
3037
3038 static reg_errcode_t
3039 expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num,
3040 type)
3041 const regex_t *preg;
3042 re_match_context_t *mctx;
3043 int cur_str, last_str, subexp_num, type;
3044 re_node_set *cur_nodes;
3045 {
3046 reg_errcode_t err;
3047 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3048 int cache_idx, cache_idx_start;
3049 /* The current state. */
3050
3051 cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3052 for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3053 {
3054 int to_idx, next_node;
3055 struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3056 if (ent->str_idx > cur_str)
3057 break;
3058 /* Is this entry ENT is appropriate? */
3059 if (!re_node_set_contains (cur_nodes, ent->node))
3060 continue; /* No. */
3061
3062 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3063 /* Calculate the destination of the back reference, and append it
3064 to MCTX->STATE_LOG. */
3065 if (to_idx == cur_str)
3066 {
3067 /* The backreference did epsilon transit, we must re-check all the
3068 node in the current state. */
3069 re_node_set new_dests;
3070 reg_errcode_t err2, err3;
3071 next_node = dfa->edests[ent->node].elems[0];
3072 if (re_node_set_contains (cur_nodes, next_node))
3073 continue;
3074 err = re_node_set_init_1 (&new_dests, next_node);
3075 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3076 err3 = re_node_set_merge (cur_nodes, &new_dests);
3077 re_node_set_free (&new_dests);
3078 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3079 || err3 != REG_NOERROR, 0))
3080 {
3081 err = (err != REG_NOERROR ? err
3082 : (err2 != REG_NOERROR ? err2 : err3));
3083 return err;
3084 }
3085 /* TODO: It is still inefficient... */
3086 cache_idx = cache_idx_start - 1;
3087 continue;
3088 }
3089 else
3090 {
3091 re_node_set union_set;
3092 next_node = dfa->nexts[ent->node];
3093 if (mctx->state_log[to_idx])
3094 {
3095 int ret;
3096 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3097 next_node))
3098 continue;
3099 err = re_node_set_init_copy (&union_set,
3100 &mctx->state_log[to_idx]->nodes);
3101 ret = re_node_set_insert (&union_set, next_node);
3102 if (BE (err != REG_NOERROR || ret < 0, 0))
3103 {
3104 re_node_set_free (&union_set);
3105 err = err != REG_NOERROR ? err : REG_ESPACE;
3106 return err;
3107 }
3108 }
3109 else
3110 {
3111 err = re_node_set_init_1 (&union_set, next_node);
3112 if (BE (err != REG_NOERROR, 0))
3113 return err;
3114 }
3115 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3116 re_node_set_free (&union_set);
3117 if (BE (mctx->state_log[to_idx] == NULL
3118 && err != REG_NOERROR, 0))
3119 return err;
3120 }
3121 }
3122 return REG_NOERROR;
3123 }
3124
3125 /* Build transition table for the state.
3126 Return the new table if succeeded, otherwise return NULL. */
3127
3128 static re_dfastate_t **
3129 build_trtable (preg, state)
3130 const regex_t *preg;
3131 re_dfastate_t *state;
3132 {
3133 reg_errcode_t err;
3134 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3135 int i, j, k, ch;
3136 int dests_node_malloced = 0, dest_states_malloced = 0;
3137 int ndests; /* Number of the destination states from `state'. */
3138 re_dfastate_t **trtable;
3139 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3140 re_node_set follows, *dests_node;
3141 bitset *dests_ch;
3142 bitset acceptable;
3143
3144 /* We build DFA states which corresponds to the destination nodes
3145 from `state'. `dests_node[i]' represents the nodes which i-th
3146 destination state contains, and `dests_ch[i]' represents the
3147 characters which i-th destination state accepts. */
3148 #ifdef _LIBC
3149 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3150 dests_node = (re_node_set *)
3151 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3152 else
3153 #endif
3154 {
3155 dests_node = (re_node_set *)
3156 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3157 if (BE (dests_node == NULL, 0))
3158 return NULL;
3159 dests_node_malloced = 1;
3160 }
3161 dests_ch = (bitset *) (dests_node + SBC_MAX);
3162
3163 /* Initialize transiton table. */
3164 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3165 state->word_trtable = 0;
3166 if (BE (trtable == NULL, 0))
3167 {
3168 if (dests_node_malloced)
3169 free (dests_node);
3170 return NULL;
3171 }
3172
3173 /* At first, group all nodes belonging to `state' into several
3174 destinations. */
3175 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
3176 if (BE (ndests <= 0, 0))
3177 {
3178 if (dests_node_malloced)
3179 free (dests_node);
3180 /* Return NULL in case of an error, trtable otherwise. */
3181 if (ndests == 0)
3182 {
3183 state->trtable = trtable;
3184 return trtable;
3185 }
3186 free (trtable);
3187 return NULL;
3188 }
3189
3190 err = re_node_set_alloc (&follows, ndests + 1);
3191 if (BE (err != REG_NOERROR, 0))
3192 goto out_free;
3193
3194 #ifdef _LIBC
3195 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3196 + ndests * 3 * sizeof (re_dfastate_t *)))
3197 dest_states = (re_dfastate_t **)
3198 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3199 else
3200 #endif
3201 {
3202 dest_states = (re_dfastate_t **)
3203 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3204 if (BE (dest_states == NULL, 0))
3205 {
3206 out_free:
3207 if (dest_states_malloced)
3208 free (dest_states);
3209 re_node_set_free (&follows);
3210 for (i = 0; i < ndests; ++i)
3211 re_node_set_free (dests_node + i);
3212 free (trtable);
3213 if (dests_node_malloced)
3214 free (dests_node);
3215 return NULL;
3216 }
3217 dest_states_malloced = 1;
3218 }
3219 dest_states_word = dest_states + ndests;
3220 dest_states_nl = dest_states_word + ndests;
3221 bitset_empty (acceptable);
3222
3223 /* Then build the states for all destinations. */
3224 for (i = 0; i < ndests; ++i)
3225 {
3226 int next_node;
3227 re_node_set_empty (&follows);
3228 /* Merge the follows of this destination states. */
3229 for (j = 0; j < dests_node[i].nelem; ++j)
3230 {
3231 next_node = dfa->nexts[dests_node[i].elems[j]];
3232 if (next_node != -1)
3233 {
3234 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3235 if (BE (err != REG_NOERROR, 0))
3236 goto out_free;
3237 }
3238 }
3239 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3240 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3241 goto out_free;
3242 /* If the new state has context constraint,
3243 build appropriate states for these contexts. */
3244 if (dest_states[i]->has_constraint)
3245 {
3246 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3247 CONTEXT_WORD);
3248 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3249 goto out_free;
3250 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3251 CONTEXT_NEWLINE);
3252 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3253 goto out_free;
3254 }
3255 else
3256 {
3257 dest_states_word[i] = dest_states[i];
3258 dest_states_nl[i] = dest_states[i];
3259 }
3260 bitset_merge (acceptable, dests_ch[i]);
3261 }
3262
3263 /* Update the transition table. */
3264 /* For all characters ch...: */
3265 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
3266 for (j = 0; j < UINT_BITS; ++j, ++ch)
3267 if ((acceptable[i] >> j) & 1)
3268 {
3269 for (k = 0; k < ndests; ++k)
3270 if ((dests_ch[k][i] >> j) & 1)
3271 {
3272 /* k-th destination accepts the word character ch. */
3273 if (state->word_trtable)
3274 {
3275 trtable[ch] = dest_states[k];
3276 trtable[ch + SBC_MAX] = dest_states_word[k];
3277 }
3278 else if (dfa->mb_cur_max > 1
3279 && dest_states[k] != dest_states_word[k])
3280 {
3281 re_dfastate_t **new_trtable;
3282
3283 new_trtable = (re_dfastate_t **)
3284 realloc (trtable,
3285 sizeof (re_dfastate_t *)
3286 * 2 * SBC_MAX);
3287 if (BE (new_trtable == NULL, 0))
3288 goto out_free;
3289 memcpy (new_trtable + SBC_MAX, new_trtable,
3290 sizeof (re_dfastate_t *) * SBC_MAX);
3291 trtable = new_trtable;
3292 state->word_trtable = 1;
3293 trtable[ch] = dest_states[k];
3294 trtable[ch + SBC_MAX] = dest_states_word[k];
3295 }
3296 else if (IS_WORD_CHAR (ch))
3297 trtable[ch] = dest_states_word[k];
3298 else
3299 trtable[ch] = dest_states[k];
3300 /* There must be only one destination which accepts
3301 character ch. See group_nodes_into_DFAstates. */
3302 break;
3303 }
3304 }
3305 /* new line */
3306 if (bitset_contain (acceptable, NEWLINE_CHAR))
3307 {
3308 /* The current state accepts newline character. */
3309 for (k = 0; k < ndests; ++k)
3310 if (bitset_contain (dests_ch[k], NEWLINE_CHAR))
3311 {
3312 /* k-th destination accepts newline character. */
3313 trtable[NEWLINE_CHAR] = dest_states_nl[k];
3314 if (state->word_trtable)
3315 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[k];
3316 /* There must be only one destination which accepts
3317 newline. See group_nodes_into_DFAstates. */
3318 break;
3319 }
3320 }
3321
3322 if (dest_states_malloced)
3323 free (dest_states);
3324
3325 re_node_set_free (&follows);
3326 for (i = 0; i < ndests; ++i)
3327 re_node_set_free (dests_node + i);
3328
3329 if (dests_node_malloced)
3330 free (dests_node);
3331
3332 state->trtable = trtable;
3333 return trtable;
3334 }
3335
3336 /* Group all nodes belonging to STATE into several destinations.
3337 Then for all destinations, set the nodes belonging to the destination
3338 to DESTS_NODE[i] and set the characters accepted by the destination
3339 to DEST_CH[i]. This function return the number of destinations. */
3340
3341 static int
3342 group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
3343 const regex_t *preg;
3344 const re_dfastate_t *state;
3345 re_node_set *dests_node;
3346 bitset *dests_ch;
3347 {
3348 reg_errcode_t err;
3349 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3350 int i, j, k;
3351 int ndests; /* Number of the destinations from `state'. */
3352 bitset accepts; /* Characters a node can accept. */
3353 const re_node_set *cur_nodes = &state->nodes;
3354 bitset_empty (accepts);
3355 ndests = 0;
3356
3357 /* For all the nodes belonging to `state', */
3358 for (i = 0; i < cur_nodes->nelem; ++i)
3359 {
3360 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3361 re_token_type_t type = node->type;
3362 unsigned int constraint = node->constraint;
3363
3364 /* Enumerate all single byte character this node can accept. */
3365 if (type == CHARACTER)
3366 bitset_set (accepts, node->opr.c);
3367 else if (type == SIMPLE_BRACKET)
3368 {
3369 bitset_merge (accepts, node->opr.sbcset);
3370 }
3371 else if (type == OP_PERIOD)
3372 {
3373 #ifdef RE_ENABLE_I18N
3374 if (dfa->mb_cur_max > 1)
3375 bitset_merge (accepts, dfa->sb_char);
3376 else
3377 #endif
3378 bitset_set_all (accepts);
3379 if (!(preg->syntax & RE_DOT_NEWLINE))
3380 bitset_clear (accepts, '\n');
3381 if (preg->syntax & RE_DOT_NOT_NULL)
3382 bitset_clear (accepts, '\0');
3383 }
3384 #ifdef RE_ENABLE_I18N
3385 else if (type == OP_UTF8_PERIOD)
3386 {
3387 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3388 if (!(preg->syntax & RE_DOT_NEWLINE))
3389 bitset_clear (accepts, '\n');
3390 if (preg->syntax & RE_DOT_NOT_NULL)
3391 bitset_clear (accepts, '\0');
3392 }
3393 #endif
3394 else
3395 continue;
3396
3397 /* Check the `accepts' and sift the characters which are not
3398 match it the context. */
3399 if (constraint)
3400 {
3401 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3402 {
3403 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3404 bitset_empty (accepts);
3405 if (accepts_newline)
3406 bitset_set (accepts, NEWLINE_CHAR);
3407 else
3408 continue;
3409 }
3410 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3411 {
3412 bitset_empty (accepts);
3413 continue;
3414 }
3415
3416 if (constraint & NEXT_WORD_CONSTRAINT)
3417 {
3418 unsigned int any_set = 0;
3419 if (type == CHARACTER && !node->word_char)
3420 {
3421 bitset_empty (accepts);
3422 continue;
3423 }
3424 #ifdef RE_ENABLE_I18N
3425 if (dfa->mb_cur_max > 1)
3426 for (j = 0; j < BITSET_UINTS; ++j)
3427 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3428 else
3429 #endif
3430 for (j = 0; j < BITSET_UINTS; ++j)
3431 any_set |= (accepts[j] &= dfa->word_char[j]);
3432 if (!any_set)
3433 continue;
3434 }
3435 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3436 {
3437 unsigned int any_set = 0;
3438 if (type == CHARACTER && node->word_char)
3439 {
3440 bitset_empty (accepts);
3441 continue;
3442 }
3443 #ifdef RE_ENABLE_I18N
3444 if (dfa->mb_cur_max > 1)
3445 for (j = 0; j < BITSET_UINTS; ++j)
3446 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3447 else
3448 #endif
3449 for (j = 0; j < BITSET_UINTS; ++j)
3450 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3451 if (!any_set)
3452 continue;
3453 }
3454 }
3455
3456 /* Then divide `accepts' into DFA states, or create a new
3457 state. Above, we make sure that accepts is not empty. */
3458 for (j = 0; j < ndests; ++j)
3459 {
3460 bitset intersec; /* Intersection sets, see below. */
3461 bitset remains;
3462 /* Flags, see below. */
3463 int has_intersec, not_subset, not_consumed;
3464
3465 /* Optimization, skip if this state doesn't accept the character. */
3466 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3467 continue;
3468
3469 /* Enumerate the intersection set of this state and `accepts'. */
3470 has_intersec = 0;
3471 for (k = 0; k < BITSET_UINTS; ++k)
3472 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3473 /* And skip if the intersection set is empty. */
3474 if (!has_intersec)
3475 continue;
3476
3477 /* Then check if this state is a subset of `accepts'. */
3478 not_subset = not_consumed = 0;
3479 for (k = 0; k < BITSET_UINTS; ++k)
3480 {
3481 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3482 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3483 }
3484
3485 /* If this state isn't a subset of `accepts', create a
3486 new group state, which has the `remains'. */
3487 if (not_subset)
3488 {
3489 bitset_copy (dests_ch[ndests], remains);
3490 bitset_copy (dests_ch[j], intersec);
3491 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3492 if (BE (err != REG_NOERROR, 0))
3493 goto error_return;
3494 ++ndests;
3495 }
3496
3497 /* Put the position in the current group. */
3498 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3499 if (BE (err < 0, 0))
3500 goto error_return;
3501
3502 /* If all characters are consumed, go to next node. */
3503 if (!not_consumed)
3504 break;
3505 }
3506 /* Some characters remain, create a new group. */
3507 if (j == ndests)
3508 {
3509 bitset_copy (dests_ch[ndests], accepts);
3510 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3511 if (BE (err != REG_NOERROR, 0))
3512 goto error_return;
3513 ++ndests;
3514 bitset_empty (accepts);
3515 }
3516 }
3517 return ndests;
3518 error_return:
3519 for (j = 0; j < ndests; ++j)
3520 re_node_set_free (dests_node + j);
3521 return -1;
3522 }
3523
3524 #ifdef RE_ENABLE_I18N
3525 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3526 Return the number of the bytes the node accepts.
3527 STR_IDX is the current index of the input string.
3528
3529 This function handles the nodes which can accept one character, or
3530 one collating element like '.', '[a-z]', opposite to the other nodes
3531 can only accept one byte. */
3532
3533 static int
3534 check_node_accept_bytes (preg, node_idx, input, str_idx)
3535 const regex_t *preg;
3536 int node_idx, str_idx;
3537 const re_string_t *input;
3538 {
3539 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3540 const re_token_t *node = dfa->nodes + node_idx;
3541 int char_len, elem_len;
3542 int i;
3543
3544 if (BE (node->type == OP_UTF8_PERIOD, 0))
3545 {
3546 unsigned char c = re_string_byte_at (input, str_idx), d;
3547 if (BE (c < 0xc2, 1))
3548 return 0;
3549
3550 if (str_idx + 2 > input->len)
3551 return 0;
3552
3553 d = re_string_byte_at (input, str_idx + 1);
3554 if (c < 0xe0)
3555 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3556 else if (c < 0xf0)
3557 {
3558 char_len = 3;
3559 if (c == 0xe0 && d < 0xa0)
3560 return 0;
3561 }
3562 else if (c < 0xf8)
3563 {
3564 char_len = 4;
3565 if (c == 0xf0 && d < 0x90)
3566 return 0;
3567 }
3568 else if (c < 0xfc)
3569 {
3570 char_len = 5;
3571 if (c == 0xf8 && d < 0x88)
3572 return 0;
3573 }
3574 else if (c < 0xfe)
3575 {
3576 char_len = 6;
3577 if (c == 0xfc && d < 0x84)
3578 return 0;
3579 }
3580 else
3581 return 0;
3582
3583 if (str_idx + char_len > input->len)
3584 return 0;
3585
3586 for (i = 1; i < char_len; ++i)
3587 {
3588 d = re_string_byte_at (input, str_idx + i);
3589 if (d < 0x80 || d > 0xbf)
3590 return 0;
3591 }
3592 return char_len;
3593 }
3594
3595 char_len = re_string_char_size_at (input, str_idx);
3596 if (node->type == OP_PERIOD)
3597 {
3598 if (char_len <= 1)
3599 return 0;
3600 /* FIXME: I don't think this if is needed, as both '\n'
3601 and '\0' are char_len == 1. */
3602 /* '.' accepts any one character except the following two cases. */
3603 if ((!(preg->syntax & RE_DOT_NEWLINE) &&
3604 re_string_byte_at (input, str_idx) == '\n') ||
3605 ((preg->syntax & RE_DOT_NOT_NULL) &&
3606 re_string_byte_at (input, str_idx) == '\0'))
3607 return 0;
3608 return char_len;
3609 }
3610
3611 elem_len = re_string_elem_size_at (input, str_idx);
3612 if (elem_len <= 1 && char_len <= 1)
3613 return 0;
3614
3615 if (node->type == COMPLEX_BRACKET)
3616 {
3617 const re_charset_t *cset = node->opr.mbcset;
3618 # ifdef _LIBC
3619 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3620 + str_idx);
3621 int j;
3622 uint32_t nrules;
3623 # endif /* _LIBC */
3624 int match_len = 0;
3625 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3626 ? re_string_wchar_at (input, str_idx) : 0);
3627
3628 /* match with multibyte character? */
3629 for (i = 0; i < cset->nmbchars; ++i)
3630 if (wc == cset->mbchars[i])
3631 {
3632 match_len = char_len;
3633 goto check_node_accept_bytes_match;
3634 }
3635 /* match with character_class? */
3636 for (i = 0; i < cset->nchar_classes; ++i)
3637 {
3638 wctype_t wt = cset->char_classes[i];
3639 if (__iswctype (wc, wt))
3640 {
3641 match_len = char_len;
3642 goto check_node_accept_bytes_match;
3643 }
3644 }
3645
3646 # ifdef _LIBC
3647 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3648 if (nrules != 0)
3649 {
3650 unsigned int in_collseq = 0;
3651 const int32_t *table, *indirect;
3652 const unsigned char *weights, *extra;
3653 const char *collseqwc;
3654 int32_t idx;
3655 /* This #include defines a local function! */
3656 # include <locale/weight.h>
3657
3658 /* match with collating_symbol? */
3659 if (cset->ncoll_syms)
3660 extra = (const unsigned char *)
3661 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3662 for (i = 0; i < cset->ncoll_syms; ++i)
3663 {
3664 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3665 /* Compare the length of input collating element and
3666 the length of current collating element. */
3667 if (*coll_sym != elem_len)
3668 continue;
3669 /* Compare each bytes. */
3670 for (j = 0; j < *coll_sym; j++)
3671 if (pin[j] != coll_sym[1 + j])
3672 break;
3673 if (j == *coll_sym)
3674 {
3675 /* Match if every bytes is equal. */
3676 match_len = j;
3677 goto check_node_accept_bytes_match;
3678 }
3679 }
3680
3681 if (cset->nranges)
3682 {
3683 if (elem_len <= char_len)
3684 {
3685 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3686 in_collseq = __collseq_table_lookup (collseqwc, wc);
3687 }
3688 else
3689 in_collseq = find_collation_sequence_value (pin, elem_len);
3690 }
3691 /* match with range expression? */
3692 for (i = 0; i < cset->nranges; ++i)
3693 if (cset->range_starts[i] <= in_collseq
3694 && in_collseq <= cset->range_ends[i])
3695 {
3696 match_len = elem_len;
3697 goto check_node_accept_bytes_match;
3698 }
3699
3700 /* match with equivalence_class? */
3701 if (cset->nequiv_classes)
3702 {
3703 const unsigned char *cp = pin;
3704 table = (const int32_t *)
3705 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3706 weights = (const unsigned char *)
3707 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3708 extra = (const unsigned char *)
3709 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3710 indirect = (const int32_t *)
3711 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3712 idx = findidx (&cp);
3713 if (idx > 0)
3714 for (i = 0; i < cset->nequiv_classes; ++i)
3715 {
3716 int32_t equiv_class_idx = cset->equiv_classes[i];
3717 size_t weight_len = weights[idx];
3718 if (weight_len == weights[equiv_class_idx])
3719 {
3720 int cnt = 0;
3721 while (cnt <= weight_len
3722 && (weights[equiv_class_idx + 1 + cnt]
3723 == weights[idx + 1 + cnt]))
3724 ++cnt;
3725 if (cnt > weight_len)
3726 {
3727 match_len = elem_len;
3728 goto check_node_accept_bytes_match;
3729 }
3730 }
3731 }
3732 }
3733 }
3734 else
3735 # endif /* _LIBC */
3736 {
3737 /* match with range expression? */
3738 #if __GNUC__ >= 2
3739 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3740 #else
3741 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3742 cmp_buf[2] = wc;
3743 #endif
3744 for (i = 0; i < cset->nranges; ++i)
3745 {
3746 cmp_buf[0] = cset->range_starts[i];
3747 cmp_buf[4] = cset->range_ends[i];
3748 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3749 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3750 {
3751 match_len = char_len;
3752 goto check_node_accept_bytes_match;
3753 }
3754 }
3755 }
3756 check_node_accept_bytes_match:
3757 if (!cset->non_match)
3758 return match_len;
3759 else
3760 {
3761 if (match_len > 0)
3762 return 0;
3763 else
3764 return (elem_len > char_len) ? elem_len : char_len;
3765 }
3766 }
3767 return 0;
3768 }
3769
3770 # ifdef _LIBC
3771 static unsigned int
3772 find_collation_sequence_value (mbs, mbs_len)
3773 const unsigned char *mbs;
3774 size_t mbs_len;
3775 {
3776 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3777 if (nrules == 0)
3778 {
3779 if (mbs_len == 1)
3780 {
3781 /* No valid character. Match it as a single byte character. */
3782 const unsigned char *collseq = (const unsigned char *)
3783 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3784 return collseq[mbs[0]];
3785 }
3786 return UINT_MAX;
3787 }
3788 else
3789 {
3790 int32_t idx;
3791 const unsigned char *extra = (const unsigned char *)
3792 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3793
3794 for (idx = 0; ;)
3795 {
3796 int mbs_cnt, found = 0;
3797 int32_t elem_mbs_len;
3798 /* Skip the name of collating element name. */
3799 idx = idx + extra[idx] + 1;
3800 elem_mbs_len = extra[idx++];
3801 if (mbs_len == elem_mbs_len)
3802 {
3803 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3804 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3805 break;
3806 if (mbs_cnt == elem_mbs_len)
3807 /* Found the entry. */
3808 found = 1;
3809 }
3810 /* Skip the byte sequence of the collating element. */
3811 idx += elem_mbs_len;
3812 /* Adjust for the alignment. */
3813 idx = (idx + 3) & ~3;
3814 /* Skip the collation sequence value. */
3815 idx += sizeof (uint32_t);
3816 /* Skip the wide char sequence of the collating element. */
3817 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3818 /* If we found the entry, return the sequence value. */
3819 if (found)
3820 return *(uint32_t *) (extra + idx);
3821 /* Skip the collation sequence value. */
3822 idx += sizeof (uint32_t);
3823 }
3824 }
3825 }
3826 # endif /* _LIBC */
3827 #endif /* RE_ENABLE_I18N */
3828
3829 /* Check whether the node accepts the byte which is IDX-th
3830 byte of the INPUT. */
3831
3832 static int
3833 check_node_accept (preg, node, mctx, idx)
3834 const regex_t *preg;
3835 const re_token_t *node;
3836 const re_match_context_t *mctx;
3837 int idx;
3838 {
3839 unsigned char ch;
3840 if (node->constraint)
3841 {
3842 /* The node has constraints. Check whether the current context
3843 satisfies the constraints. */
3844 unsigned int context = re_string_context_at (mctx->input, idx,
3845 mctx->eflags,
3846 preg->newline_anchor);
3847 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3848 return 0;
3849 }
3850 ch = re_string_byte_at (mctx->input, idx);
3851 switch (node->type)
3852 {
3853 case CHARACTER:
3854 return node->opr.c == ch;
3855 case SIMPLE_BRACKET:
3856 return bitset_contain (node->opr.sbcset, ch);
3857 #ifdef RE_ENABLE_I18N
3858 case OP_UTF8_PERIOD:
3859 if (ch >= 0x80)
3860 return 0;
3861 /* FALLTHROUGH */
3862 #endif
3863 case OP_PERIOD:
3864 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
3865 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
3866 default:
3867 return 0;
3868 }
3869 }
3870
3871 /* Extend the buffers, if the buffers have run out. */
3872
3873 static reg_errcode_t
3874 extend_buffers (mctx)
3875 re_match_context_t *mctx;
3876 {
3877 reg_errcode_t ret;
3878 re_string_t *pstr = mctx->input;
3879
3880 /* Double the lengthes of the buffers. */
3881 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3882 if (BE (ret != REG_NOERROR, 0))
3883 return ret;
3884
3885 if (mctx->state_log != NULL)
3886 {
3887 /* And double the length of state_log. */
3888 /* XXX We have no indication of the size of this buffer. If this
3889 allocation fail we have no indication that the state_log array
3890 does not have the right size. */
3891 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3892 pstr->bufs_len + 1);
3893 if (BE (new_array == NULL, 0))
3894 return REG_ESPACE;
3895 mctx->state_log = new_array;
3896 }
3897
3898 /* Then reconstruct the buffers. */
3899 if (pstr->icase)
3900 {
3901 #ifdef RE_ENABLE_I18N
3902 if (pstr->mb_cur_max > 1)
3903 {
3904 ret = build_wcs_upper_buffer (pstr);
3905 if (BE (ret != REG_NOERROR, 0))
3906 return ret;
3907 }
3908 else
3909 #endif /* RE_ENABLE_I18N */
3910 build_upper_buffer (pstr);
3911 }
3912 else
3913 {
3914 #ifdef RE_ENABLE_I18N
3915 if (pstr->mb_cur_max > 1)
3916 build_wcs_buffer (pstr);
3917 else
3918 #endif /* RE_ENABLE_I18N */
3919 {
3920 if (pstr->trans != NULL)
3921 re_string_translate_buffer (pstr);
3922 }
3923 }
3924 return REG_NOERROR;
3925 }
3926
3927 \f
3928 /* Functions for matching context. */
3929
3930 /* Initialize MCTX. */
3931
3932 static reg_errcode_t
3933 match_ctx_init (mctx, eflags, input, n)
3934 re_match_context_t *mctx;
3935 int eflags, n;
3936 re_string_t *input;
3937 {
3938 mctx->eflags = eflags;
3939 mctx->input = input;
3940 mctx->match_last = -1;
3941 if (n > 0)
3942 {
3943 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
3944 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
3945 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
3946 return REG_ESPACE;
3947 }
3948 else
3949 mctx->bkref_ents = NULL;
3950 mctx->nbkref_ents = 0;
3951 mctx->abkref_ents = n;
3952 mctx->max_mb_elem_len = 1;
3953 mctx->nsub_tops = 0;
3954 mctx->asub_tops = n;
3955 return REG_NOERROR;
3956 }
3957
3958 /* Clean the entries which depend on the current input in MCTX.
3959 This function must be invoked when the matcher changes the start index
3960 of the input, or changes the input string. */
3961
3962 static void
3963 match_ctx_clean (mctx)
3964 re_match_context_t *mctx;
3965 {
3966 match_ctx_free_subtops (mctx);
3967 mctx->nsub_tops = 0;
3968 mctx->nbkref_ents = 0;
3969 }
3970
3971 /* Free all the memory associated with MCTX. */
3972
3973 static void
3974 match_ctx_free (mctx)
3975 re_match_context_t *mctx;
3976 {
3977 match_ctx_free_subtops (mctx);
3978 re_free (mctx->sub_tops);
3979 re_free (mctx->bkref_ents);
3980 }
3981
3982 /* Free all the memory associated with MCTX->SUB_TOPS. */
3983
3984 static void
3985 match_ctx_free_subtops (mctx)
3986 re_match_context_t *mctx;
3987 {
3988 int st_idx;
3989 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
3990 {
3991 int sl_idx;
3992 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
3993 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
3994 {
3995 re_sub_match_last_t *last = top->lasts[sl_idx];
3996 re_free (last->path.array);
3997 re_free (last);
3998 }
3999 re_free (top->lasts);
4000 if (top->path)
4001 {
4002 re_free (top->path->array);
4003 re_free (top->path);
4004 }
4005 free (top);
4006 }
4007 }
4008
4009 /* Add a new backreference entry to MCTX.
4010 Note that we assume that caller never call this function with duplicate
4011 entry, and call with STR_IDX which isn't smaller than any existing entry.
4012 */
4013
4014 static reg_errcode_t
4015 match_ctx_add_entry (mctx, node, str_idx, from, to)
4016 re_match_context_t *mctx;
4017 int node, str_idx, from, to;
4018 {
4019 if (mctx->nbkref_ents >= mctx->abkref_ents)
4020 {
4021 struct re_backref_cache_entry* new_entry;
4022 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4023 mctx->abkref_ents * 2);
4024 if (BE (new_entry == NULL, 0))
4025 {
4026 re_free (mctx->bkref_ents);
4027 return REG_ESPACE;
4028 }
4029 mctx->bkref_ents = new_entry;
4030 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4031 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4032 mctx->abkref_ents *= 2;
4033 }
4034 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4035 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4036 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4037 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4038 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
4039 if (mctx->max_mb_elem_len < to - from)
4040 mctx->max_mb_elem_len = to - from;
4041 return REG_NOERROR;
4042 }
4043
4044 /* Search for the first entry which has the same str_idx.
4045 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4046
4047 static int
4048 search_cur_bkref_entry (mctx, str_idx)
4049 re_match_context_t *mctx;
4050 int str_idx;
4051 {
4052 int left, right, mid;
4053 right = mctx->nbkref_ents;
4054 for (left = 0; left < right;)
4055 {
4056 mid = (left + right) / 2;
4057 if (mctx->bkref_ents[mid].str_idx < str_idx)
4058 left = mid + 1;
4059 else
4060 right = mid;
4061 }
4062 return left;
4063 }
4064
4065 static void
4066 match_ctx_clear_flag (mctx)
4067 re_match_context_t *mctx;
4068 {
4069 int i;
4070 for (i = 0; i < mctx->nbkref_ents; ++i)
4071 mctx->bkref_ents[i].flag = 0;
4072 }
4073
4074 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4075 at STR_IDX. */
4076
4077 static reg_errcode_t
4078 match_ctx_add_subtop (mctx, node, str_idx)
4079 re_match_context_t *mctx;
4080 int node, str_idx;
4081 {
4082 #ifdef DEBUG
4083 assert (mctx->sub_tops != NULL);
4084 assert (mctx->asub_tops > 0);
4085 #endif
4086 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4087 {
4088 int new_asub_tops = mctx->asub_tops * 2;
4089 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4090 re_sub_match_top_t *,
4091 new_asub_tops);
4092 if (BE (new_array == NULL, 0))
4093 return REG_ESPACE;
4094 mctx->sub_tops = new_array;
4095 mctx->asub_tops = new_asub_tops;
4096 }
4097 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4098 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4099 return REG_ESPACE;
4100 mctx->sub_tops[mctx->nsub_tops]->node = node;
4101 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4102 return REG_NOERROR;
4103 }
4104
4105 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4106 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4107
4108 static re_sub_match_last_t *
4109 match_ctx_add_sublast (subtop, node, str_idx)
4110 re_sub_match_top_t *subtop;
4111 int node, str_idx;
4112 {
4113 re_sub_match_last_t *new_entry;
4114 if (BE (subtop->nlasts == subtop->alasts, 0))
4115 {
4116 int new_alasts = 2 * subtop->alasts + 1;
4117 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4118 re_sub_match_last_t *,
4119 new_alasts);
4120 if (BE (new_array == NULL, 0))
4121 return NULL;
4122 subtop->lasts = new_array;
4123 subtop->alasts = new_alasts;
4124 }
4125 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4126 if (BE (new_entry != NULL, 1))
4127 {
4128 subtop->lasts[subtop->nlasts] = new_entry;
4129 new_entry->node = node;
4130 new_entry->str_idx = str_idx;
4131 ++subtop->nlasts;
4132 }
4133 return new_entry;
4134 }
4135
4136 static void
4137 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
4138 check_subexp)
4139 re_sift_context_t *sctx;
4140 re_dfastate_t **sifted_sts, **limited_sts;
4141 int last_node, last_str_idx, check_subexp;
4142 {
4143 sctx->sifted_states = sifted_sts;
4144 sctx->limited_states = limited_sts;
4145 sctx->last_node = last_node;
4146 sctx->last_str_idx = last_str_idx;
4147 sctx->check_subexp = check_subexp;
4148 sctx->cur_bkref = -1;
4149 sctx->cls_subexp_idx = -1;
4150 re_node_set_init_empty (&sctx->limits);
4151 }