]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
Fix memory leak in regexp compiler (BZ #17069)
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <stdint.h>
21
22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23 size_t length, reg_syntax_t syntax);
24 static void re_compile_fastmap_iter (regex_t *bufp,
25 const re_dfastate_t *init_state,
26 char *fastmap);
27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
39 void *extra);
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
42 void *extra);
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 bin_tree_t *node);
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
51 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
52 unsigned int constraint);
53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 int node, int root);
56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57 static int fetch_number (re_string_t *input, re_token_t *token,
58 reg_syntax_t syntax);
59 static int peek_token (re_token_t *token, re_string_t *input,
60 reg_syntax_t syntax) internal_function;
61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62 reg_syntax_t syntax, reg_errcode_t *err);
63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 int nest, reg_errcode_t *err);
66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 int nest, reg_errcode_t *err);
69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 int nest, reg_errcode_t *err);
72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76 re_dfa_t *dfa, re_token_t *token,
77 reg_syntax_t syntax, reg_errcode_t *err);
78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79 re_token_t *token, reg_syntax_t syntax,
80 reg_errcode_t *err);
81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_string_t *regexp,
83 re_token_t *token, int token_len,
84 re_dfa_t *dfa,
85 reg_syntax_t syntax,
86 int accept_hyphen);
87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88 re_string_t *regexp,
89 re_token_t *token);
90 #ifdef RE_ENABLE_I18N
91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 re_charset_t *mbcset,
93 int *equiv_class_alloc,
94 const unsigned char *name);
95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 bitset_t sbcset,
97 re_charset_t *mbcset,
98 int *char_class_alloc,
99 const unsigned char *class_name,
100 reg_syntax_t syntax);
101 #else /* not RE_ENABLE_I18N */
102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
103 const unsigned char *name);
104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 bitset_t sbcset,
106 const unsigned char *class_name,
107 reg_syntax_t syntax);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110 RE_TRANSLATE_TYPE trans,
111 const unsigned char *class_name,
112 const unsigned char *extra,
113 int non_match, reg_errcode_t *err);
114 static bin_tree_t *create_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 re_token_type_t type);
117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118 bin_tree_t *left, bin_tree_t *right,
119 const re_token_t *token);
120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121 static void free_token (re_token_t *node);
122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 \f
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
129
130 const char __re_error_msgid[] attribute_hidden =
131 {
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
134 "\0"
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
137 "\0"
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 "\0"
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 "\0"
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 "\0"
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 "\0"
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 "\0"
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 "\0"
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 "\0"
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 "\0"
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 "\0"
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 "\0"
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 "\0"
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 "\0"
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 "\0"
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 "\0"
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 };
183
184 const size_t __re_error_msgid_idx[] attribute_hidden =
185 {
186 REG_NOERROR_IDX,
187 REG_NOMATCH_IDX,
188 REG_BADPAT_IDX,
189 REG_ECOLLATE_IDX,
190 REG_ECTYPE_IDX,
191 REG_EESCAPE_IDX,
192 REG_ESUBREG_IDX,
193 REG_EBRACK_IDX,
194 REG_EPAREN_IDX,
195 REG_EBRACE_IDX,
196 REG_BADBR_IDX,
197 REG_ERANGE_IDX,
198 REG_ESPACE_IDX,
199 REG_BADRPT_IDX,
200 REG_EEND_IDX,
201 REG_ESIZE_IDX,
202 REG_ERPAREN_IDX
203 };
204 \f
205 /* Entry points for GNU code. */
206
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
210
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
213
214 const char *
215 re_compile_pattern (pattern, length, bufp)
216 const char *pattern;
217 size_t length;
218 struct re_pattern_buffer *bufp;
219 {
220 reg_errcode_t ret;
221
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
239
240 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
246
247
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
251
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
254
255 reg_syntax_t
256 re_set_syntax (syntax)
257 reg_syntax_t syntax;
258 {
259 reg_syntax_t ret = re_syntax_options;
260
261 re_syntax_options = syntax;
262 return ret;
263 }
264 #ifdef _LIBC
265 weak_alias (__re_set_syntax, re_set_syntax)
266 #endif
267
268 int
269 re_compile_fastmap (bufp)
270 struct re_pattern_buffer *bufp;
271 {
272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273 char *fastmap = bufp->fastmap;
274
275 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277 if (dfa->init_state != dfa->init_state_word)
278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279 if (dfa->init_state != dfa->init_state_nl)
280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281 if (dfa->init_state != dfa->init_state_begbuf)
282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283 bufp->fastmap_accurate = 1;
284 return 0;
285 }
286 #ifdef _LIBC
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
288 #endif
289
290 static inline void
291 __attribute ((always_inline))
292 re_set_fastmap (char *fastmap, int icase, int ch)
293 {
294 fastmap[ch] = 1;
295 if (icase)
296 fastmap[tolower (ch)] = 1;
297 }
298
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
301
302 static void
303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304 char *fastmap)
305 {
306 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307 int node_cnt;
308 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
310 {
311 int node = init_state->nodes.elems[node_cnt];
312 re_token_type_t type = dfa->nodes[node].type;
313
314 if (type == CHARACTER)
315 {
316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
319 {
320 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321 wchar_t wc;
322 mbstate_t state;
323
324 p = buf;
325 *p++ = dfa->nodes[node].opr.c;
326 while (++node < dfa->nodes_len
327 && dfa->nodes[node].type == CHARACTER
328 && dfa->nodes[node].mb_partial)
329 *p++ = dfa->nodes[node].opr.c;
330 memset (&state, '\0', sizeof (state));
331 if (__mbrtowc (&wc, (const char *) buf, p - buf,
332 &state) == p - buf
333 && (__wcrtomb ((char *) buf, towlower (wc), &state)
334 != (size_t) -1))
335 re_set_fastmap (fastmap, 0, buf[0]);
336 }
337 #endif
338 }
339 else if (type == SIMPLE_BRACKET)
340 {
341 int i, ch;
342 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343 {
344 int j;
345 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347 if (w & ((bitset_word_t) 1 << j))
348 re_set_fastmap (fastmap, icase, ch);
349 }
350 }
351 #ifdef RE_ENABLE_I18N
352 else if (type == COMPLEX_BRACKET)
353 {
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 int i;
356
357 # ifdef _LIBC
358 /* See if we have to try all bytes which start multiple collation
359 elements.
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365 && (cset->ncoll_syms || cset->nranges))
366 {
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
372 }
373 # endif /* _LIBC */
374
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa->mb_cur_max > 1
380 && (cset->nchar_classes || cset->non_match || cset->nranges
381 # ifdef _LIBC
382 || cset->nequiv_classes
383 # endif /* _LIBC */
384 ))
385 {
386 unsigned char c = 0;
387 do
388 {
389 mbstate_t mbs;
390 memset (&mbs, 0, sizeof (mbs));
391 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392 re_set_fastmap (fastmap, false, (int) c);
393 }
394 while (++c != 0);
395 }
396
397 else
398 {
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i = 0; i < cset->nmbchars; ++i)
401 {
402 char buf[256];
403 mbstate_t state;
404 memset (&state, '\0', sizeof (state));
405 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
408 {
409 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
410 != (size_t) -1)
411 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
412 }
413 }
414 }
415 }
416 #endif /* RE_ENABLE_I18N */
417 else if (type == OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type == OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type == END_OF_RE)
422 {
423 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424 if (type == END_OF_RE)
425 bufp->can_be_null = 1;
426 return;
427 }
428 }
429 }
430 \f
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
433
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
436
437 `buffer' to the compiled pattern;
438 `used' to the length of the compiled pattern;
439 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 `fastmap' to an allocated space for the fastmap;
444 `fastmap_accurate' to zero;
445 `re_nsub' to the number of subexpressions in PATTERN.
446
447 PATTERN is the address of the pattern string.
448
449 CFLAGS is a series of bits which affect compilation.
450
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
453
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
456
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
459
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
462 registers.
463
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
466
467 int
468 regcomp (preg, pattern, cflags)
469 regex_t *__restrict preg;
470 const char *__restrict pattern;
471 int cflags;
472 {
473 reg_errcode_t ret;
474 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
475 : RE_SYNTAX_POSIX_BASIC);
476
477 preg->buffer = NULL;
478 preg->allocated = 0;
479 preg->used = 0;
480
481 /* Try to allocate space for the fastmap. */
482 preg->fastmap = re_malloc (char, SBC_MAX);
483 if (BE (preg->fastmap == NULL, 0))
484 return REG_ESPACE;
485
486 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
487
488 /* If REG_NEWLINE is set, newlines are treated differently. */
489 if (cflags & REG_NEWLINE)
490 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
491 syntax &= ~RE_DOT_NEWLINE;
492 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
493 /* It also changes the matching behavior. */
494 preg->newline_anchor = 1;
495 }
496 else
497 preg->newline_anchor = 0;
498 preg->no_sub = !!(cflags & REG_NOSUB);
499 preg->translate = NULL;
500
501 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
502
503 /* POSIX doesn't distinguish between an unmatched open-group and an
504 unmatched close-group: both are REG_EPAREN. */
505 if (ret == REG_ERPAREN)
506 ret = REG_EPAREN;
507
508 /* We have already checked preg->fastmap != NULL. */
509 if (BE (ret == REG_NOERROR, 1))
510 /* Compute the fastmap now, since regexec cannot modify the pattern
511 buffer. This function never fails in this implementation. */
512 (void) re_compile_fastmap (preg);
513 else
514 {
515 /* Some error occurred while compiling the expression. */
516 re_free (preg->fastmap);
517 preg->fastmap = NULL;
518 }
519
520 return (int) ret;
521 }
522 #ifdef _LIBC
523 weak_alias (__regcomp, regcomp)
524 #endif
525
526 /* Returns a message corresponding to an error code, ERRCODE, returned
527 from either regcomp or regexec. We don't use PREG here. */
528
529 size_t
530 regerror (errcode, preg, errbuf, errbuf_size)
531 int errcode;
532 const regex_t *__restrict preg;
533 char *__restrict errbuf;
534 size_t errbuf_size;
535 {
536 const char *msg;
537 size_t msg_size;
538
539 if (BE (errcode < 0
540 || errcode >= (int) (sizeof (__re_error_msgid_idx)
541 / sizeof (__re_error_msgid_idx[0])), 0))
542 /* Only error codes returned by the rest of the code should be passed
543 to this routine. If we are given anything else, or if other regex
544 code generates an invalid error code, then the program has a bug.
545 Dump core so we can fix it. */
546 abort ();
547
548 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
549
550 msg_size = strlen (msg) + 1; /* Includes the null. */
551
552 if (BE (errbuf_size != 0, 1))
553 {
554 if (BE (msg_size > errbuf_size, 0))
555 {
556 #if defined HAVE_MEMPCPY || defined _LIBC
557 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
558 #else
559 memcpy (errbuf, msg, errbuf_size - 1);
560 errbuf[errbuf_size - 1] = 0;
561 #endif
562 }
563 else
564 memcpy (errbuf, msg, msg_size);
565 }
566
567 return msg_size;
568 }
569 #ifdef _LIBC
570 weak_alias (__regerror, regerror)
571 #endif
572
573
574 #ifdef RE_ENABLE_I18N
575 /* This static array is used for the map to single-byte characters when
576 UTF-8 is used. Otherwise we would allocate memory just to initialize
577 it the same all the time. UTF-8 is the preferred encoding so this is
578 a worthwhile optimization. */
579 static const bitset_t utf8_sb_map =
580 {
581 /* Set the first 128 bits. */
582 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
583 };
584 #endif
585
586
587 static void
588 free_dfa_content (re_dfa_t *dfa)
589 {
590 int i, j;
591
592 if (dfa->nodes)
593 for (i = 0; i < dfa->nodes_len; ++i)
594 free_token (dfa->nodes + i);
595 re_free (dfa->nexts);
596 for (i = 0; i < dfa->nodes_len; ++i)
597 {
598 if (dfa->eclosures != NULL)
599 re_node_set_free (dfa->eclosures + i);
600 if (dfa->inveclosures != NULL)
601 re_node_set_free (dfa->inveclosures + i);
602 if (dfa->edests != NULL)
603 re_node_set_free (dfa->edests + i);
604 }
605 re_free (dfa->edests);
606 re_free (dfa->eclosures);
607 re_free (dfa->inveclosures);
608 re_free (dfa->nodes);
609
610 if (dfa->state_table)
611 for (i = 0; i <= dfa->state_hash_mask; ++i)
612 {
613 struct re_state_table_entry *entry = dfa->state_table + i;
614 for (j = 0; j < entry->num; ++j)
615 {
616 re_dfastate_t *state = entry->array[j];
617 free_state (state);
618 }
619 re_free (entry->array);
620 }
621 re_free (dfa->state_table);
622 #ifdef RE_ENABLE_I18N
623 if (dfa->sb_char != utf8_sb_map)
624 re_free (dfa->sb_char);
625 #endif
626 re_free (dfa->subexp_map);
627 #ifdef DEBUG
628 re_free (dfa->re_str);
629 #endif
630
631 re_free (dfa);
632 }
633
634
635 /* Free dynamically allocated space used by PREG. */
636
637 void
638 regfree (preg)
639 regex_t *preg;
640 {
641 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
642 if (BE (dfa != NULL, 1))
643 free_dfa_content (dfa);
644 preg->buffer = NULL;
645 preg->allocated = 0;
646
647 re_free (preg->fastmap);
648 preg->fastmap = NULL;
649
650 re_free (preg->translate);
651 preg->translate = NULL;
652 }
653 #ifdef _LIBC
654 weak_alias (__regfree, regfree)
655 #endif
656 \f
657 /* Entry points compatible with 4.2 BSD regex library. We don't define
658 them unless specifically requested. */
659
660 #if defined _REGEX_RE_COMP || defined _LIBC
661
662 /* BSD has one and only one pattern buffer. */
663 static struct re_pattern_buffer re_comp_buf;
664
665 char *
666 # ifdef _LIBC
667 /* Make these definitions weak in libc, so POSIX programs can redefine
668 these names if they don't use our functions, and still use
669 regcomp/regexec above without link errors. */
670 weak_function
671 # endif
672 re_comp (s)
673 const char *s;
674 {
675 reg_errcode_t ret;
676 char *fastmap;
677
678 if (!s)
679 {
680 if (!re_comp_buf.buffer)
681 return gettext ("No previous regular expression");
682 return 0;
683 }
684
685 if (re_comp_buf.buffer)
686 {
687 fastmap = re_comp_buf.fastmap;
688 re_comp_buf.fastmap = NULL;
689 __regfree (&re_comp_buf);
690 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
691 re_comp_buf.fastmap = fastmap;
692 }
693
694 if (re_comp_buf.fastmap == NULL)
695 {
696 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
697 if (re_comp_buf.fastmap == NULL)
698 return (char *) gettext (__re_error_msgid
699 + __re_error_msgid_idx[(int) REG_ESPACE]);
700 }
701
702 /* Since `re_exec' always passes NULL for the `regs' argument, we
703 don't need to initialize the pattern buffer fields which affect it. */
704
705 /* Match anchors at newlines. */
706 re_comp_buf.newline_anchor = 1;
707
708 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
709
710 if (!ret)
711 return NULL;
712
713 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
714 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
715 }
716
717 #ifdef _LIBC
718 libc_freeres_fn (free_mem)
719 {
720 __regfree (&re_comp_buf);
721 }
722 #endif
723
724 #endif /* _REGEX_RE_COMP */
725 \f
726 /* Internal entry point.
727 Compile the regular expression PATTERN, whose length is LENGTH.
728 SYNTAX indicate regular expression's syntax. */
729
730 static reg_errcode_t
731 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
732 reg_syntax_t syntax)
733 {
734 reg_errcode_t err = REG_NOERROR;
735 re_dfa_t *dfa;
736 re_string_t regexp;
737
738 /* Initialize the pattern buffer. */
739 preg->fastmap_accurate = 0;
740 preg->syntax = syntax;
741 preg->not_bol = preg->not_eol = 0;
742 preg->used = 0;
743 preg->re_nsub = 0;
744 preg->can_be_null = 0;
745 preg->regs_allocated = REGS_UNALLOCATED;
746
747 /* Initialize the dfa. */
748 dfa = (re_dfa_t *) preg->buffer;
749 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
750 {
751 /* If zero allocated, but buffer is non-null, try to realloc
752 enough space. This loses if buffer's address is bogus, but
753 that is the user's responsibility. If ->buffer is NULL this
754 is a simple allocation. */
755 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756 if (dfa == NULL)
757 return REG_ESPACE;
758 preg->allocated = sizeof (re_dfa_t);
759 preg->buffer = (unsigned char *) dfa;
760 }
761 preg->used = sizeof (re_dfa_t);
762
763 err = init_dfa (dfa, length);
764 if (BE (err != REG_NOERROR, 0))
765 {
766 free_dfa_content (dfa);
767 preg->buffer = NULL;
768 preg->allocated = 0;
769 return err;
770 }
771 #ifdef DEBUG
772 /* Note: length+1 will not overflow since it is checked in init_dfa. */
773 dfa->re_str = re_malloc (char, length + 1);
774 strncpy (dfa->re_str, pattern, length + 1);
775 #endif
776
777 __libc_lock_init (dfa->lock);
778
779 err = re_string_construct (&regexp, pattern, length, preg->translate,
780 syntax & RE_ICASE, dfa);
781 if (BE (err != REG_NOERROR, 0))
782 {
783 re_compile_internal_free_return:
784 free_workarea_compile (preg);
785 re_string_destruct (&regexp);
786 free_dfa_content (dfa);
787 preg->buffer = NULL;
788 preg->allocated = 0;
789 return err;
790 }
791
792 /* Parse the regular expression, and build a structure tree. */
793 preg->re_nsub = 0;
794 dfa->str_tree = parse (&regexp, preg, syntax, &err);
795 if (BE (dfa->str_tree == NULL, 0))
796 goto re_compile_internal_free_return;
797
798 /* Analyze the tree and create the nfa. */
799 err = analyze (preg);
800 if (BE (err != REG_NOERROR, 0))
801 goto re_compile_internal_free_return;
802
803 #ifdef RE_ENABLE_I18N
804 /* If possible, do searching in single byte encoding to speed things up. */
805 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
806 optimize_utf8 (dfa);
807 #endif
808
809 /* Then create the initial state of the dfa. */
810 err = create_initial_state (dfa);
811
812 /* Release work areas. */
813 free_workarea_compile (preg);
814 re_string_destruct (&regexp);
815
816 if (BE (err != REG_NOERROR, 0))
817 {
818 free_dfa_content (dfa);
819 preg->buffer = NULL;
820 preg->allocated = 0;
821 }
822
823 return err;
824 }
825
826 /* Initialize DFA. We use the length of the regular expression PAT_LEN
827 as the initial length of some arrays. */
828
829 static reg_errcode_t
830 init_dfa (re_dfa_t *dfa, size_t pat_len)
831 {
832 unsigned int table_size;
833 #ifndef _LIBC
834 char *codeset_name;
835 #endif
836
837 memset (dfa, '\0', sizeof (re_dfa_t));
838
839 /* Force allocation of str_tree_storage the first time. */
840 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
841
842 /* Avoid overflows. */
843 if (pat_len == SIZE_MAX)
844 return REG_ESPACE;
845
846 dfa->nodes_alloc = pat_len + 1;
847 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
848
849 /* table_size = 2 ^ ceil(log pat_len) */
850 for (table_size = 1; ; table_size <<= 1)
851 if (table_size > pat_len)
852 break;
853
854 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
855 dfa->state_hash_mask = table_size - 1;
856
857 dfa->mb_cur_max = MB_CUR_MAX;
858 #ifdef _LIBC
859 if (dfa->mb_cur_max == 6
860 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
861 dfa->is_utf8 = 1;
862 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
863 != 0);
864 #else
865 # ifdef HAVE_LANGINFO_CODESET
866 codeset_name = nl_langinfo (CODESET);
867 # else
868 codeset_name = getenv ("LC_ALL");
869 if (codeset_name == NULL || codeset_name[0] == '\0')
870 codeset_name = getenv ("LC_CTYPE");
871 if (codeset_name == NULL || codeset_name[0] == '\0')
872 codeset_name = getenv ("LANG");
873 if (codeset_name == NULL)
874 codeset_name = "";
875 else if (strchr (codeset_name, '.') != NULL)
876 codeset_name = strchr (codeset_name, '.') + 1;
877 # endif
878
879 if (strcasecmp (codeset_name, "UTF-8") == 0
880 || strcasecmp (codeset_name, "UTF8") == 0)
881 dfa->is_utf8 = 1;
882
883 /* We check exhaustively in the loop below if this charset is a
884 superset of ASCII. */
885 dfa->map_notascii = 0;
886 #endif
887
888 #ifdef RE_ENABLE_I18N
889 if (dfa->mb_cur_max > 1)
890 {
891 if (dfa->is_utf8)
892 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
893 else
894 {
895 int i, j, ch;
896
897 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
898 if (BE (dfa->sb_char == NULL, 0))
899 return REG_ESPACE;
900
901 /* Set the bits corresponding to single byte chars. */
902 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
903 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
904 {
905 wint_t wch = __btowc (ch);
906 if (wch != WEOF)
907 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
908 # ifndef _LIBC
909 if (isascii (ch) && wch != ch)
910 dfa->map_notascii = 1;
911 # endif
912 }
913 }
914 }
915 #endif
916
917 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
918 return REG_ESPACE;
919 return REG_NOERROR;
920 }
921
922 /* Initialize WORD_CHAR table, which indicate which character is
923 "word". In this case "word" means that it is the word construction
924 character used by some operators like "\<", "\>", etc. */
925
926 static void
927 internal_function
928 init_word_char (re_dfa_t *dfa)
929 {
930 dfa->word_ops_used = 1;
931 int i = 0;
932 int ch = 0;
933 if (BE (dfa->map_notascii == 0, 1))
934 {
935 if (sizeof (dfa->word_char[0]) == 8)
936 {
937 /* The extra temporaries here avoid "implicitly truncated"
938 warnings in the case when this is dead code, i.e. 32-bit. */
939 const uint64_t wc0 = UINT64_C (0x03ff000000000000);
940 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
941 dfa->word_char[0] = wc0;
942 dfa->word_char[1] = wc1;
943 i = 2;
944 }
945 else if (sizeof (dfa->word_char[0]) == 4)
946 {
947 dfa->word_char[0] = UINT32_C (0x00000000);
948 dfa->word_char[1] = UINT32_C (0x03ff0000);
949 dfa->word_char[2] = UINT32_C (0x87fffffe);
950 dfa->word_char[3] = UINT32_C (0x07fffffe);
951 i = 4;
952 }
953 else
954 abort ();
955 ch = 128;
956
957 if (BE (dfa->is_utf8, 1))
958 {
959 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
960 return;
961 }
962 }
963
964 for (; i < BITSET_WORDS; ++i)
965 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
966 if (isalnum (ch) || ch == '_')
967 dfa->word_char[i] |= (bitset_word_t) 1 << j;
968 }
969
970 /* Free the work area which are only used while compiling. */
971
972 static void
973 free_workarea_compile (regex_t *preg)
974 {
975 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
976 bin_tree_storage_t *storage, *next;
977 for (storage = dfa->str_tree_storage; storage; storage = next)
978 {
979 next = storage->next;
980 re_free (storage);
981 }
982 dfa->str_tree_storage = NULL;
983 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
984 dfa->str_tree = NULL;
985 re_free (dfa->org_indices);
986 dfa->org_indices = NULL;
987 }
988
989 /* Create initial states for all contexts. */
990
991 static reg_errcode_t
992 create_initial_state (re_dfa_t *dfa)
993 {
994 int first, i;
995 reg_errcode_t err;
996 re_node_set init_nodes;
997
998 /* Initial states have the epsilon closure of the node which is
999 the first node of the regular expression. */
1000 first = dfa->str_tree->first->node_idx;
1001 dfa->init_node = first;
1002 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1003 if (BE (err != REG_NOERROR, 0))
1004 return err;
1005
1006 /* The back-references which are in initial states can epsilon transit,
1007 since in this case all of the subexpressions can be null.
1008 Then we add epsilon closures of the nodes which are the next nodes of
1009 the back-references. */
1010 if (dfa->nbackref > 0)
1011 for (i = 0; i < init_nodes.nelem; ++i)
1012 {
1013 int node_idx = init_nodes.elems[i];
1014 re_token_type_t type = dfa->nodes[node_idx].type;
1015
1016 int clexp_idx;
1017 if (type != OP_BACK_REF)
1018 continue;
1019 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1020 {
1021 re_token_t *clexp_node;
1022 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1023 if (clexp_node->type == OP_CLOSE_SUBEXP
1024 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1025 break;
1026 }
1027 if (clexp_idx == init_nodes.nelem)
1028 continue;
1029
1030 if (type == OP_BACK_REF)
1031 {
1032 int dest_idx = dfa->edests[node_idx].elems[0];
1033 if (!re_node_set_contains (&init_nodes, dest_idx))
1034 {
1035 reg_errcode_t err = re_node_set_merge (&init_nodes,
1036 dfa->eclosures
1037 + dest_idx);
1038 if (err != REG_NOERROR)
1039 return err;
1040 i = 0;
1041 }
1042 }
1043 }
1044
1045 /* It must be the first time to invoke acquire_state. */
1046 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1047 /* We don't check ERR here, since the initial state must not be NULL. */
1048 if (BE (dfa->init_state == NULL, 0))
1049 return err;
1050 if (dfa->init_state->has_constraint)
1051 {
1052 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1053 CONTEXT_WORD);
1054 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1055 CONTEXT_NEWLINE);
1056 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1057 &init_nodes,
1058 CONTEXT_NEWLINE
1059 | CONTEXT_BEGBUF);
1060 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1061 || dfa->init_state_begbuf == NULL, 0))
1062 return err;
1063 }
1064 else
1065 dfa->init_state_word = dfa->init_state_nl
1066 = dfa->init_state_begbuf = dfa->init_state;
1067
1068 re_node_set_free (&init_nodes);
1069 return REG_NOERROR;
1070 }
1071 \f
1072 #ifdef RE_ENABLE_I18N
1073 /* If it is possible to do searching in single byte encoding instead of UTF-8
1074 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1075 DFA nodes where needed. */
1076
1077 static void
1078 optimize_utf8 (re_dfa_t *dfa)
1079 {
1080 int node, i, mb_chars = 0, has_period = 0;
1081
1082 for (node = 0; node < dfa->nodes_len; ++node)
1083 switch (dfa->nodes[node].type)
1084 {
1085 case CHARACTER:
1086 if (dfa->nodes[node].opr.c >= 0x80)
1087 mb_chars = 1;
1088 break;
1089 case ANCHOR:
1090 switch (dfa->nodes[node].opr.ctx_type)
1091 {
1092 case LINE_FIRST:
1093 case LINE_LAST:
1094 case BUF_FIRST:
1095 case BUF_LAST:
1096 break;
1097 default:
1098 /* Word anchors etc. cannot be handled. It's okay to test
1099 opr.ctx_type since constraints (for all DFA nodes) are
1100 created by ORing one or more opr.ctx_type values. */
1101 return;
1102 }
1103 break;
1104 case OP_PERIOD:
1105 has_period = 1;
1106 break;
1107 case OP_BACK_REF:
1108 case OP_ALT:
1109 case END_OF_RE:
1110 case OP_DUP_ASTERISK:
1111 case OP_OPEN_SUBEXP:
1112 case OP_CLOSE_SUBEXP:
1113 break;
1114 case COMPLEX_BRACKET:
1115 return;
1116 case SIMPLE_BRACKET:
1117 /* Just double check. The non-ASCII range starts at 0x80. */
1118 assert (0x80 % BITSET_WORD_BITS == 0);
1119 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1120 if (dfa->nodes[node].opr.sbcset[i])
1121 return;
1122 break;
1123 default:
1124 abort ();
1125 }
1126
1127 if (mb_chars || has_period)
1128 for (node = 0; node < dfa->nodes_len; ++node)
1129 {
1130 if (dfa->nodes[node].type == CHARACTER
1131 && dfa->nodes[node].opr.c >= 0x80)
1132 dfa->nodes[node].mb_partial = 0;
1133 else if (dfa->nodes[node].type == OP_PERIOD)
1134 dfa->nodes[node].type = OP_UTF8_PERIOD;
1135 }
1136
1137 /* The search can be in single byte locale. */
1138 dfa->mb_cur_max = 1;
1139 dfa->is_utf8 = 0;
1140 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1141 }
1142 #endif
1143 \f
1144 /* Analyze the structure tree, and calculate "first", "next", "edest",
1145 "eclosure", and "inveclosure". */
1146
1147 static reg_errcode_t
1148 analyze (regex_t *preg)
1149 {
1150 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151 reg_errcode_t ret;
1152
1153 /* Allocate arrays. */
1154 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1155 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1156 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1157 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1158 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1159 || dfa->eclosures == NULL, 0))
1160 return REG_ESPACE;
1161
1162 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1163 if (dfa->subexp_map != NULL)
1164 {
1165 int i;
1166 for (i = 0; i < preg->re_nsub; i++)
1167 dfa->subexp_map[i] = i;
1168 preorder (dfa->str_tree, optimize_subexps, dfa);
1169 for (i = 0; i < preg->re_nsub; i++)
1170 if (dfa->subexp_map[i] != i)
1171 break;
1172 if (i == preg->re_nsub)
1173 {
1174 free (dfa->subexp_map);
1175 dfa->subexp_map = NULL;
1176 }
1177 }
1178
1179 ret = postorder (dfa->str_tree, lower_subexps, preg);
1180 if (BE (ret != REG_NOERROR, 0))
1181 return ret;
1182 ret = postorder (dfa->str_tree, calc_first, dfa);
1183 if (BE (ret != REG_NOERROR, 0))
1184 return ret;
1185 preorder (dfa->str_tree, calc_next, dfa);
1186 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1187 if (BE (ret != REG_NOERROR, 0))
1188 return ret;
1189 ret = calc_eclosure (dfa);
1190 if (BE (ret != REG_NOERROR, 0))
1191 return ret;
1192
1193 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1195 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196 || dfa->nbackref)
1197 {
1198 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1199 if (BE (dfa->inveclosures == NULL, 0))
1200 return REG_ESPACE;
1201 ret = calc_inveclosure (dfa);
1202 }
1203
1204 return ret;
1205 }
1206
1207 /* Our parse trees are very unbalanced, so we cannot use a stack to
1208 implement parse tree visits. Instead, we use parent pointers and
1209 some hairy code in these two functions. */
1210 static reg_errcode_t
1211 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212 void *extra)
1213 {
1214 bin_tree_t *node, *prev;
1215
1216 for (node = root; ; )
1217 {
1218 /* Descend down the tree, preferably to the left (or to the right
1219 if that's the only child). */
1220 while (node->left || node->right)
1221 if (node->left)
1222 node = node->left;
1223 else
1224 node = node->right;
1225
1226 do
1227 {
1228 reg_errcode_t err = fn (extra, node);
1229 if (BE (err != REG_NOERROR, 0))
1230 return err;
1231 if (node->parent == NULL)
1232 return REG_NOERROR;
1233 prev = node;
1234 node = node->parent;
1235 }
1236 /* Go up while we have a node that is reached from the right. */
1237 while (node->right == prev || node->right == NULL);
1238 node = node->right;
1239 }
1240 }
1241
1242 static reg_errcode_t
1243 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244 void *extra)
1245 {
1246 bin_tree_t *node;
1247
1248 for (node = root; ; )
1249 {
1250 reg_errcode_t err = fn (extra, node);
1251 if (BE (err != REG_NOERROR, 0))
1252 return err;
1253
1254 /* Go to the left node, or up and to the right. */
1255 if (node->left)
1256 node = node->left;
1257 else
1258 {
1259 bin_tree_t *prev = NULL;
1260 while (node->right == prev || node->right == NULL)
1261 {
1262 prev = node;
1263 node = node->parent;
1264 if (!node)
1265 return REG_NOERROR;
1266 }
1267 node = node->right;
1268 }
1269 }
1270 }
1271
1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1274 backreferences as well. Requires a preorder visit. */
1275 static reg_errcode_t
1276 optimize_subexps (void *extra, bin_tree_t *node)
1277 {
1278 re_dfa_t *dfa = (re_dfa_t *) extra;
1279
1280 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281 {
1282 int idx = node->token.opr.idx;
1283 node->token.opr.idx = dfa->subexp_map[idx];
1284 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285 }
1286
1287 else if (node->token.type == SUBEXP
1288 && node->left && node->left->token.type == SUBEXP)
1289 {
1290 int other_idx = node->left->token.opr.idx;
1291
1292 node->left = node->left->left;
1293 if (node->left)
1294 node->left->parent = node;
1295
1296 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1297 if (other_idx < BITSET_WORD_BITS)
1298 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299 }
1300
1301 return REG_NOERROR;
1302 }
1303
1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1306 static reg_errcode_t
1307 lower_subexps (void *extra, bin_tree_t *node)
1308 {
1309 regex_t *preg = (regex_t *) extra;
1310 reg_errcode_t err = REG_NOERROR;
1311
1312 if (node->left && node->left->token.type == SUBEXP)
1313 {
1314 node->left = lower_subexp (&err, preg, node->left);
1315 if (node->left)
1316 node->left->parent = node;
1317 }
1318 if (node->right && node->right->token.type == SUBEXP)
1319 {
1320 node->right = lower_subexp (&err, preg, node->right);
1321 if (node->right)
1322 node->right->parent = node;
1323 }
1324
1325 return err;
1326 }
1327
1328 static bin_tree_t *
1329 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330 {
1331 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1332 bin_tree_t *body = node->left;
1333 bin_tree_t *op, *cls, *tree1, *tree;
1334
1335 if (preg->no_sub
1336 /* We do not optimize empty subexpressions, because otherwise we may
1337 have bad CONCAT nodes with NULL children. This is obviously not
1338 very common, so we do not lose much. An example that triggers
1339 this case is the sed "script" /\(\)/x. */
1340 && node->left != NULL
1341 && (node->token.opr.idx >= BITSET_WORD_BITS
1342 || !(dfa->used_bkref_map
1343 & ((bitset_word_t) 1 << node->token.opr.idx))))
1344 return node->left;
1345
1346 /* Convert the SUBEXP node to the concatenation of an
1347 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1348 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1349 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1350 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1351 tree = create_tree (dfa, op, tree1, CONCAT);
1352 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1353 {
1354 *err = REG_ESPACE;
1355 return NULL;
1356 }
1357
1358 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1359 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1360 return tree;
1361 }
1362
1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364 nodes. Requires a postorder visit. */
1365 static reg_errcode_t
1366 calc_first (void *extra, bin_tree_t *node)
1367 {
1368 re_dfa_t *dfa = (re_dfa_t *) extra;
1369 if (node->token.type == CONCAT)
1370 {
1371 node->first = node->left->first;
1372 node->node_idx = node->left->node_idx;
1373 }
1374 else
1375 {
1376 node->first = node;
1377 node->node_idx = re_dfa_add_node (dfa, node->token);
1378 if (BE (node->node_idx == -1, 0))
1379 return REG_ESPACE;
1380 if (node->token.type == ANCHOR)
1381 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1382 }
1383 return REG_NOERROR;
1384 }
1385
1386 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1387 static reg_errcode_t
1388 calc_next (void *extra, bin_tree_t *node)
1389 {
1390 switch (node->token.type)
1391 {
1392 case OP_DUP_ASTERISK:
1393 node->left->next = node;
1394 break;
1395 case CONCAT:
1396 node->left->next = node->right->first;
1397 node->right->next = node->next;
1398 break;
1399 default:
1400 if (node->left)
1401 node->left->next = node->next;
1402 if (node->right)
1403 node->right->next = node->next;
1404 break;
1405 }
1406 return REG_NOERROR;
1407 }
1408
1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1410 static reg_errcode_t
1411 link_nfa_nodes (void *extra, bin_tree_t *node)
1412 {
1413 re_dfa_t *dfa = (re_dfa_t *) extra;
1414 int idx = node->node_idx;
1415 reg_errcode_t err = REG_NOERROR;
1416
1417 switch (node->token.type)
1418 {
1419 case CONCAT:
1420 break;
1421
1422 case END_OF_RE:
1423 assert (node->next == NULL);
1424 break;
1425
1426 case OP_DUP_ASTERISK:
1427 case OP_ALT:
1428 {
1429 int left, right;
1430 dfa->has_plural_match = 1;
1431 if (node->left != NULL)
1432 left = node->left->first->node_idx;
1433 else
1434 left = node->next->node_idx;
1435 if (node->right != NULL)
1436 right = node->right->first->node_idx;
1437 else
1438 right = node->next->node_idx;
1439 assert (left > -1);
1440 assert (right > -1);
1441 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1442 }
1443 break;
1444
1445 case ANCHOR:
1446 case OP_OPEN_SUBEXP:
1447 case OP_CLOSE_SUBEXP:
1448 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1449 break;
1450
1451 case OP_BACK_REF:
1452 dfa->nexts[idx] = node->next->node_idx;
1453 if (node->token.type == OP_BACK_REF)
1454 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1455 break;
1456
1457 default:
1458 assert (!IS_EPSILON_NODE (node->token.type));
1459 dfa->nexts[idx] = node->next->node_idx;
1460 break;
1461 }
1462
1463 return err;
1464 }
1465
1466 /* Duplicate the epsilon closure of the node ROOT_NODE.
1467 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468 to their own constraint. */
1469
1470 static reg_errcode_t
1471 internal_function
1472 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1473 int root_node, unsigned int init_constraint)
1474 {
1475 int org_node, clone_node, ret;
1476 unsigned int constraint = init_constraint;
1477 for (org_node = top_org_node, clone_node = top_clone_node;;)
1478 {
1479 int org_dest, clone_dest;
1480 if (dfa->nodes[org_node].type == OP_BACK_REF)
1481 {
1482 /* If the back reference epsilon-transit, its destination must
1483 also have the constraint. Then duplicate the epsilon closure
1484 of the destination of the back reference, and store it in
1485 edests of the back reference. */
1486 org_dest = dfa->nexts[org_node];
1487 re_node_set_empty (dfa->edests + clone_node);
1488 clone_dest = duplicate_node (dfa, org_dest, constraint);
1489 if (BE (clone_dest == -1, 0))
1490 return REG_ESPACE;
1491 dfa->nexts[clone_node] = dfa->nexts[org_node];
1492 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493 if (BE (ret < 0, 0))
1494 return REG_ESPACE;
1495 }
1496 else if (dfa->edests[org_node].nelem == 0)
1497 {
1498 /* In case of the node can't epsilon-transit, don't duplicate the
1499 destination and store the original destination as the
1500 destination of the node. */
1501 dfa->nexts[clone_node] = dfa->nexts[org_node];
1502 break;
1503 }
1504 else if (dfa->edests[org_node].nelem == 1)
1505 {
1506 /* In case of the node can epsilon-transit, and it has only one
1507 destination. */
1508 org_dest = dfa->edests[org_node].elems[0];
1509 re_node_set_empty (dfa->edests + clone_node);
1510 /* If the node is root_node itself, it means the epsilon clsoure
1511 has a loop. Then tie it to the destination of the root_node. */
1512 if (org_node == root_node && clone_node != org_node)
1513 {
1514 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1515 if (BE (ret < 0, 0))
1516 return REG_ESPACE;
1517 break;
1518 }
1519 /* In case of the node has another constraint, add it. */
1520 constraint |= dfa->nodes[org_node].constraint;
1521 clone_dest = duplicate_node (dfa, org_dest, constraint);
1522 if (BE (clone_dest == -1, 0))
1523 return REG_ESPACE;
1524 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525 if (BE (ret < 0, 0))
1526 return REG_ESPACE;
1527 }
1528 else /* dfa->edests[org_node].nelem == 2 */
1529 {
1530 /* In case of the node can epsilon-transit, and it has two
1531 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1532 org_dest = dfa->edests[org_node].elems[0];
1533 re_node_set_empty (dfa->edests + clone_node);
1534 /* Search for a duplicated node which satisfies the constraint. */
1535 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536 if (clone_dest == -1)
1537 {
1538 /* There is no such duplicated node, create a new one. */
1539 reg_errcode_t err;
1540 clone_dest = duplicate_node (dfa, org_dest, constraint);
1541 if (BE (clone_dest == -1, 0))
1542 return REG_ESPACE;
1543 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544 if (BE (ret < 0, 0))
1545 return REG_ESPACE;
1546 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547 root_node, constraint);
1548 if (BE (err != REG_NOERROR, 0))
1549 return err;
1550 }
1551 else
1552 {
1553 /* There is a duplicated node which satisfies the constraint,
1554 use it to avoid infinite loop. */
1555 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556 if (BE (ret < 0, 0))
1557 return REG_ESPACE;
1558 }
1559
1560 org_dest = dfa->edests[org_node].elems[1];
1561 clone_dest = duplicate_node (dfa, org_dest, constraint);
1562 if (BE (clone_dest == -1, 0))
1563 return REG_ESPACE;
1564 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565 if (BE (ret < 0, 0))
1566 return REG_ESPACE;
1567 }
1568 org_node = org_dest;
1569 clone_node = clone_dest;
1570 }
1571 return REG_NOERROR;
1572 }
1573
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575 satisfies the constraint CONSTRAINT. */
1576
1577 static int
1578 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1579 unsigned int constraint)
1580 {
1581 int idx;
1582 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583 {
1584 if (org_node == dfa->org_indices[idx]
1585 && constraint == dfa->nodes[idx].constraint)
1586 return idx; /* Found. */
1587 }
1588 return -1; /* Not found. */
1589 }
1590
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592 Return the index of the new node, or -1 if insufficient storage is
1593 available. */
1594
1595 static int
1596 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1597 {
1598 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1599 if (BE (dup_idx != -1, 1))
1600 {
1601 dfa->nodes[dup_idx].constraint = constraint;
1602 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1603 dfa->nodes[dup_idx].duplicated = 1;
1604
1605 /* Store the index of the original node. */
1606 dfa->org_indices[dup_idx] = org_idx;
1607 }
1608 return dup_idx;
1609 }
1610
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t *dfa)
1613 {
1614 int src, idx, ret;
1615 for (idx = 0; idx < dfa->nodes_len; ++idx)
1616 re_node_set_init_empty (dfa->inveclosures + idx);
1617
1618 for (src = 0; src < dfa->nodes_len; ++src)
1619 {
1620 int *elems = dfa->eclosures[src].elems;
1621 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1622 {
1623 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1624 if (BE (ret == -1, 0))
1625 return REG_ESPACE;
1626 }
1627 }
1628
1629 return REG_NOERROR;
1630 }
1631
1632 /* Calculate "eclosure" for all the node in DFA. */
1633
1634 static reg_errcode_t
1635 calc_eclosure (re_dfa_t *dfa)
1636 {
1637 int node_idx, incomplete;
1638 #ifdef DEBUG
1639 assert (dfa->nodes_len > 0);
1640 #endif
1641 incomplete = 0;
1642 /* For each nodes, calculate epsilon closure. */
1643 for (node_idx = 0; ; ++node_idx)
1644 {
1645 reg_errcode_t err;
1646 re_node_set eclosure_elem;
1647 if (node_idx == dfa->nodes_len)
1648 {
1649 if (!incomplete)
1650 break;
1651 incomplete = 0;
1652 node_idx = 0;
1653 }
1654
1655 #ifdef DEBUG
1656 assert (dfa->eclosures[node_idx].nelem != -1);
1657 #endif
1658
1659 /* If we have already calculated, skip it. */
1660 if (dfa->eclosures[node_idx].nelem != 0)
1661 continue;
1662 /* Calculate epsilon closure of `node_idx'. */
1663 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1664 if (BE (err != REG_NOERROR, 0))
1665 return err;
1666
1667 if (dfa->eclosures[node_idx].nelem == 0)
1668 {
1669 incomplete = 1;
1670 re_node_set_free (&eclosure_elem);
1671 }
1672 }
1673 return REG_NOERROR;
1674 }
1675
1676 /* Calculate epsilon closure of NODE. */
1677
1678 static reg_errcode_t
1679 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1680 {
1681 reg_errcode_t err;
1682 int i;
1683 re_node_set eclosure;
1684 int ret;
1685 int incomplete = 0;
1686 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1687 if (BE (err != REG_NOERROR, 0))
1688 return err;
1689
1690 /* This indicates that we are calculating this node now.
1691 We reference this value to avoid infinite loop. */
1692 dfa->eclosures[node].nelem = -1;
1693
1694 /* If the current node has constraints, duplicate all nodes
1695 since they must inherit the constraints. */
1696 if (dfa->nodes[node].constraint
1697 && dfa->edests[node].nelem
1698 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1699 {
1700 err = duplicate_node_closure (dfa, node, node, node,
1701 dfa->nodes[node].constraint);
1702 if (BE (err != REG_NOERROR, 0))
1703 return err;
1704 }
1705
1706 /* Expand each epsilon destination nodes. */
1707 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1708 for (i = 0; i < dfa->edests[node].nelem; ++i)
1709 {
1710 re_node_set eclosure_elem;
1711 int edest = dfa->edests[node].elems[i];
1712 /* If calculating the epsilon closure of `edest' is in progress,
1713 return intermediate result. */
1714 if (dfa->eclosures[edest].nelem == -1)
1715 {
1716 incomplete = 1;
1717 continue;
1718 }
1719 /* If we haven't calculated the epsilon closure of `edest' yet,
1720 calculate now. Otherwise use calculated epsilon closure. */
1721 if (dfa->eclosures[edest].nelem == 0)
1722 {
1723 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1724 if (BE (err != REG_NOERROR, 0))
1725 return err;
1726 }
1727 else
1728 eclosure_elem = dfa->eclosures[edest];
1729 /* Merge the epsilon closure of `edest'. */
1730 err = re_node_set_merge (&eclosure, &eclosure_elem);
1731 if (BE (err != REG_NOERROR, 0))
1732 return err;
1733 /* If the epsilon closure of `edest' is incomplete,
1734 the epsilon closure of this node is also incomplete. */
1735 if (dfa->eclosures[edest].nelem == 0)
1736 {
1737 incomplete = 1;
1738 re_node_set_free (&eclosure_elem);
1739 }
1740 }
1741
1742 /* An epsilon closure includes itself. */
1743 ret = re_node_set_insert (&eclosure, node);
1744 if (BE (ret < 0, 0))
1745 return REG_ESPACE;
1746 if (incomplete && !root)
1747 dfa->eclosures[node].nelem = 0;
1748 else
1749 dfa->eclosures[node] = eclosure;
1750 *new_set = eclosure;
1751 return REG_NOERROR;
1752 }
1753 \f
1754 /* Functions for token which are used in the parser. */
1755
1756 /* Fetch a token from INPUT.
1757 We must not use this function inside bracket expressions. */
1758
1759 static void
1760 internal_function
1761 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1762 {
1763 re_string_skip_bytes (input, peek_token (result, input, syntax));
1764 }
1765
1766 /* Peek a token from INPUT, and return the length of the token.
1767 We must not use this function inside bracket expressions. */
1768
1769 static int
1770 internal_function
1771 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1772 {
1773 unsigned char c;
1774
1775 if (re_string_eoi (input))
1776 {
1777 token->type = END_OF_RE;
1778 return 0;
1779 }
1780
1781 c = re_string_peek_byte (input, 0);
1782 token->opr.c = c;
1783
1784 token->word_char = 0;
1785 #ifdef RE_ENABLE_I18N
1786 token->mb_partial = 0;
1787 if (input->mb_cur_max > 1 &&
1788 !re_string_first_byte (input, re_string_cur_idx (input)))
1789 {
1790 token->type = CHARACTER;
1791 token->mb_partial = 1;
1792 return 1;
1793 }
1794 #endif
1795 if (c == '\\')
1796 {
1797 unsigned char c2;
1798 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1799 {
1800 token->type = BACK_SLASH;
1801 return 1;
1802 }
1803
1804 c2 = re_string_peek_byte_case (input, 1);
1805 token->opr.c = c2;
1806 token->type = CHARACTER;
1807 #ifdef RE_ENABLE_I18N
1808 if (input->mb_cur_max > 1)
1809 {
1810 wint_t wc = re_string_wchar_at (input,
1811 re_string_cur_idx (input) + 1);
1812 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1813 }
1814 else
1815 #endif
1816 token->word_char = IS_WORD_CHAR (c2) != 0;
1817
1818 switch (c2)
1819 {
1820 case '|':
1821 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1822 token->type = OP_ALT;
1823 break;
1824 case '1': case '2': case '3': case '4': case '5':
1825 case '6': case '7': case '8': case '9':
1826 if (!(syntax & RE_NO_BK_REFS))
1827 {
1828 token->type = OP_BACK_REF;
1829 token->opr.idx = c2 - '1';
1830 }
1831 break;
1832 case '<':
1833 if (!(syntax & RE_NO_GNU_OPS))
1834 {
1835 token->type = ANCHOR;
1836 token->opr.ctx_type = WORD_FIRST;
1837 }
1838 break;
1839 case '>':
1840 if (!(syntax & RE_NO_GNU_OPS))
1841 {
1842 token->type = ANCHOR;
1843 token->opr.ctx_type = WORD_LAST;
1844 }
1845 break;
1846 case 'b':
1847 if (!(syntax & RE_NO_GNU_OPS))
1848 {
1849 token->type = ANCHOR;
1850 token->opr.ctx_type = WORD_DELIM;
1851 }
1852 break;
1853 case 'B':
1854 if (!(syntax & RE_NO_GNU_OPS))
1855 {
1856 token->type = ANCHOR;
1857 token->opr.ctx_type = NOT_WORD_DELIM;
1858 }
1859 break;
1860 case 'w':
1861 if (!(syntax & RE_NO_GNU_OPS))
1862 token->type = OP_WORD;
1863 break;
1864 case 'W':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_NOTWORD;
1867 break;
1868 case 's':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_SPACE;
1871 break;
1872 case 'S':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_NOTSPACE;
1875 break;
1876 case '`':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 {
1879 token->type = ANCHOR;
1880 token->opr.ctx_type = BUF_FIRST;
1881 }
1882 break;
1883 case '\'':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 {
1886 token->type = ANCHOR;
1887 token->opr.ctx_type = BUF_LAST;
1888 }
1889 break;
1890 case '(':
1891 if (!(syntax & RE_NO_BK_PARENS))
1892 token->type = OP_OPEN_SUBEXP;
1893 break;
1894 case ')':
1895 if (!(syntax & RE_NO_BK_PARENS))
1896 token->type = OP_CLOSE_SUBEXP;
1897 break;
1898 case '+':
1899 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1900 token->type = OP_DUP_PLUS;
1901 break;
1902 case '?':
1903 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904 token->type = OP_DUP_QUESTION;
1905 break;
1906 case '{':
1907 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1908 token->type = OP_OPEN_DUP_NUM;
1909 break;
1910 case '}':
1911 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912 token->type = OP_CLOSE_DUP_NUM;
1913 break;
1914 default:
1915 break;
1916 }
1917 return 2;
1918 }
1919
1920 token->type = CHARACTER;
1921 #ifdef RE_ENABLE_I18N
1922 if (input->mb_cur_max > 1)
1923 {
1924 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1925 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1926 }
1927 else
1928 #endif
1929 token->word_char = IS_WORD_CHAR (token->opr.c);
1930
1931 switch (c)
1932 {
1933 case '\n':
1934 if (syntax & RE_NEWLINE_ALT)
1935 token->type = OP_ALT;
1936 break;
1937 case '|':
1938 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1939 token->type = OP_ALT;
1940 break;
1941 case '*':
1942 token->type = OP_DUP_ASTERISK;
1943 break;
1944 case '+':
1945 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1946 token->type = OP_DUP_PLUS;
1947 break;
1948 case '?':
1949 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950 token->type = OP_DUP_QUESTION;
1951 break;
1952 case '{':
1953 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1954 token->type = OP_OPEN_DUP_NUM;
1955 break;
1956 case '}':
1957 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958 token->type = OP_CLOSE_DUP_NUM;
1959 break;
1960 case '(':
1961 if (syntax & RE_NO_BK_PARENS)
1962 token->type = OP_OPEN_SUBEXP;
1963 break;
1964 case ')':
1965 if (syntax & RE_NO_BK_PARENS)
1966 token->type = OP_CLOSE_SUBEXP;
1967 break;
1968 case '[':
1969 token->type = OP_OPEN_BRACKET;
1970 break;
1971 case '.':
1972 token->type = OP_PERIOD;
1973 break;
1974 case '^':
1975 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1976 re_string_cur_idx (input) != 0)
1977 {
1978 char prev = re_string_peek_byte (input, -1);
1979 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1980 break;
1981 }
1982 token->type = ANCHOR;
1983 token->opr.ctx_type = LINE_FIRST;
1984 break;
1985 case '$':
1986 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1987 re_string_cur_idx (input) + 1 != re_string_length (input))
1988 {
1989 re_token_t next;
1990 re_string_skip_bytes (input, 1);
1991 peek_token (&next, input, syntax);
1992 re_string_skip_bytes (input, -1);
1993 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1994 break;
1995 }
1996 token->type = ANCHOR;
1997 token->opr.ctx_type = LINE_LAST;
1998 break;
1999 default:
2000 break;
2001 }
2002 return 1;
2003 }
2004
2005 /* Peek a token from INPUT, and return the length of the token.
2006 We must not use this function out of bracket expressions. */
2007
2008 static int
2009 internal_function
2010 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2011 {
2012 unsigned char c;
2013 if (re_string_eoi (input))
2014 {
2015 token->type = END_OF_RE;
2016 return 0;
2017 }
2018 c = re_string_peek_byte (input, 0);
2019 token->opr.c = c;
2020
2021 #ifdef RE_ENABLE_I18N
2022 if (input->mb_cur_max > 1 &&
2023 !re_string_first_byte (input, re_string_cur_idx (input)))
2024 {
2025 token->type = CHARACTER;
2026 return 1;
2027 }
2028 #endif /* RE_ENABLE_I18N */
2029
2030 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2031 && re_string_cur_idx (input) + 1 < re_string_length (input))
2032 {
2033 /* In this case, '\' escape a character. */
2034 unsigned char c2;
2035 re_string_skip_bytes (input, 1);
2036 c2 = re_string_peek_byte (input, 0);
2037 token->opr.c = c2;
2038 token->type = CHARACTER;
2039 return 1;
2040 }
2041 if (c == '[') /* '[' is a special char in a bracket exps. */
2042 {
2043 unsigned char c2;
2044 int token_len;
2045 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2046 c2 = re_string_peek_byte (input, 1);
2047 else
2048 c2 = 0;
2049 token->opr.c = c2;
2050 token_len = 2;
2051 switch (c2)
2052 {
2053 case '.':
2054 token->type = OP_OPEN_COLL_ELEM;
2055 break;
2056 case '=':
2057 token->type = OP_OPEN_EQUIV_CLASS;
2058 break;
2059 case ':':
2060 if (syntax & RE_CHAR_CLASSES)
2061 {
2062 token->type = OP_OPEN_CHAR_CLASS;
2063 break;
2064 }
2065 /* else fall through. */
2066 default:
2067 token->type = CHARACTER;
2068 token->opr.c = c;
2069 token_len = 1;
2070 break;
2071 }
2072 return token_len;
2073 }
2074 switch (c)
2075 {
2076 case '-':
2077 token->type = OP_CHARSET_RANGE;
2078 break;
2079 case ']':
2080 token->type = OP_CLOSE_BRACKET;
2081 break;
2082 case '^':
2083 token->type = OP_NON_MATCH_LIST;
2084 break;
2085 default:
2086 token->type = CHARACTER;
2087 }
2088 return 1;
2089 }
2090 \f
2091 /* Functions for parser. */
2092
2093 /* Entry point of the parser.
2094 Parse the regular expression REGEXP and return the structure tree.
2095 If an error is occured, ERR is set by error code, and return NULL.
2096 This function build the following tree, from regular expression <reg_exp>:
2097 CAT
2098 / \
2099 / \
2100 <reg_exp> EOR
2101
2102 CAT means concatenation.
2103 EOR means end of regular expression. */
2104
2105 static bin_tree_t *
2106 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2107 reg_errcode_t *err)
2108 {
2109 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2110 bin_tree_t *tree, *eor, *root;
2111 re_token_t current_token;
2112 dfa->syntax = syntax;
2113 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2114 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2115 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2116 return NULL;
2117 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2118 if (tree != NULL)
2119 root = create_tree (dfa, tree, eor, CONCAT);
2120 else
2121 root = eor;
2122 if (BE (eor == NULL || root == NULL, 0))
2123 {
2124 *err = REG_ESPACE;
2125 return NULL;
2126 }
2127 return root;
2128 }
2129
2130 /* This function build the following tree, from regular expression
2131 <branch1>|<branch2>:
2132 ALT
2133 / \
2134 / \
2135 <branch1> <branch2>
2136
2137 ALT means alternative, which represents the operator `|'. */
2138
2139 static bin_tree_t *
2140 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2142 {
2143 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2144 bin_tree_t *tree, *branch = NULL;
2145 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2146 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147 return NULL;
2148
2149 while (token->type == OP_ALT)
2150 {
2151 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2152 if (token->type != OP_ALT && token->type != END_OF_RE
2153 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2154 {
2155 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2156 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2157 return NULL;
2158 }
2159 else
2160 branch = NULL;
2161 tree = create_tree (dfa, tree, branch, OP_ALT);
2162 if (BE (tree == NULL, 0))
2163 {
2164 *err = REG_ESPACE;
2165 return NULL;
2166 }
2167 }
2168 return tree;
2169 }
2170
2171 /* This function build the following tree, from regular expression
2172 <exp1><exp2>:
2173 CAT
2174 / \
2175 / \
2176 <exp1> <exp2>
2177
2178 CAT means concatenation. */
2179
2180 static bin_tree_t *
2181 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2182 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2183 {
2184 bin_tree_t *tree, *exp;
2185 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2186 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2187 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2188 return NULL;
2189
2190 while (token->type != OP_ALT && token->type != END_OF_RE
2191 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2192 {
2193 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2194 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2195 {
2196 if (tree != NULL)
2197 postorder (tree, free_tree, NULL);
2198 return NULL;
2199 }
2200 if (tree != NULL && exp != NULL)
2201 {
2202 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2203 if (newtree == NULL)
2204 {
2205 postorder (exp, free_tree, NULL);
2206 postorder (tree, free_tree, NULL);
2207 *err = REG_ESPACE;
2208 return NULL;
2209 }
2210 tree = newtree;
2211 }
2212 else if (tree == NULL)
2213 tree = exp;
2214 /* Otherwise exp == NULL, we don't need to create new tree. */
2215 }
2216 return tree;
2217 }
2218
2219 /* This function build the following tree, from regular expression a*:
2220 *
2221 |
2222 a
2223 */
2224
2225 static bin_tree_t *
2226 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2228 {
2229 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2230 bin_tree_t *tree;
2231 switch (token->type)
2232 {
2233 case CHARACTER:
2234 tree = create_token_tree (dfa, NULL, NULL, token);
2235 if (BE (tree == NULL, 0))
2236 {
2237 *err = REG_ESPACE;
2238 return NULL;
2239 }
2240 #ifdef RE_ENABLE_I18N
2241 if (dfa->mb_cur_max > 1)
2242 {
2243 while (!re_string_eoi (regexp)
2244 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2245 {
2246 bin_tree_t *mbc_remain;
2247 fetch_token (token, regexp, syntax);
2248 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2249 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2250 if (BE (mbc_remain == NULL || tree == NULL, 0))
2251 {
2252 *err = REG_ESPACE;
2253 return NULL;
2254 }
2255 }
2256 }
2257 #endif
2258 break;
2259 case OP_OPEN_SUBEXP:
2260 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2261 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 return NULL;
2263 break;
2264 case OP_OPEN_BRACKET:
2265 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2266 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 return NULL;
2268 break;
2269 case OP_BACK_REF:
2270 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2271 {
2272 *err = REG_ESUBREG;
2273 return NULL;
2274 }
2275 dfa->used_bkref_map |= 1 << token->opr.idx;
2276 tree = create_token_tree (dfa, NULL, NULL, token);
2277 if (BE (tree == NULL, 0))
2278 {
2279 *err = REG_ESPACE;
2280 return NULL;
2281 }
2282 ++dfa->nbackref;
2283 dfa->has_mb_node = 1;
2284 break;
2285 case OP_OPEN_DUP_NUM:
2286 if (syntax & RE_CONTEXT_INVALID_DUP)
2287 {
2288 *err = REG_BADRPT;
2289 return NULL;
2290 }
2291 /* FALLTHROUGH */
2292 case OP_DUP_ASTERISK:
2293 case OP_DUP_PLUS:
2294 case OP_DUP_QUESTION:
2295 if (syntax & RE_CONTEXT_INVALID_OPS)
2296 {
2297 *err = REG_BADRPT;
2298 return NULL;
2299 }
2300 else if (syntax & RE_CONTEXT_INDEP_OPS)
2301 {
2302 fetch_token (token, regexp, syntax);
2303 return parse_expression (regexp, preg, token, syntax, nest, err);
2304 }
2305 /* else fall through */
2306 case OP_CLOSE_SUBEXP:
2307 if ((token->type == OP_CLOSE_SUBEXP) &&
2308 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2309 {
2310 *err = REG_ERPAREN;
2311 return NULL;
2312 }
2313 /* else fall through */
2314 case OP_CLOSE_DUP_NUM:
2315 /* We treat it as a normal character. */
2316
2317 /* Then we can these characters as normal characters. */
2318 token->type = CHARACTER;
2319 /* mb_partial and word_char bits should be initialized already
2320 by peek_token. */
2321 tree = create_token_tree (dfa, NULL, NULL, token);
2322 if (BE (tree == NULL, 0))
2323 {
2324 *err = REG_ESPACE;
2325 return NULL;
2326 }
2327 break;
2328 case ANCHOR:
2329 if ((token->opr.ctx_type
2330 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2331 && dfa->word_ops_used == 0)
2332 init_word_char (dfa);
2333 if (token->opr.ctx_type == WORD_DELIM
2334 || token->opr.ctx_type == NOT_WORD_DELIM)
2335 {
2336 bin_tree_t *tree_first, *tree_last;
2337 if (token->opr.ctx_type == WORD_DELIM)
2338 {
2339 token->opr.ctx_type = WORD_FIRST;
2340 tree_first = create_token_tree (dfa, NULL, NULL, token);
2341 token->opr.ctx_type = WORD_LAST;
2342 }
2343 else
2344 {
2345 token->opr.ctx_type = INSIDE_WORD;
2346 tree_first = create_token_tree (dfa, NULL, NULL, token);
2347 token->opr.ctx_type = INSIDE_NOTWORD;
2348 }
2349 tree_last = create_token_tree (dfa, NULL, NULL, token);
2350 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2351 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2352 {
2353 *err = REG_ESPACE;
2354 return NULL;
2355 }
2356 }
2357 else
2358 {
2359 tree = create_token_tree (dfa, NULL, NULL, token);
2360 if (BE (tree == NULL, 0))
2361 {
2362 *err = REG_ESPACE;
2363 return NULL;
2364 }
2365 }
2366 /* We must return here, since ANCHORs can't be followed
2367 by repetition operators.
2368 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2369 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2370 fetch_token (token, regexp, syntax);
2371 return tree;
2372 case OP_PERIOD:
2373 tree = create_token_tree (dfa, NULL, NULL, token);
2374 if (BE (tree == NULL, 0))
2375 {
2376 *err = REG_ESPACE;
2377 return NULL;
2378 }
2379 if (dfa->mb_cur_max > 1)
2380 dfa->has_mb_node = 1;
2381 break;
2382 case OP_WORD:
2383 case OP_NOTWORD:
2384 tree = build_charclass_op (dfa, regexp->trans,
2385 (const unsigned char *) "alnum",
2386 (const unsigned char *) "_",
2387 token->type == OP_NOTWORD, err);
2388 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 return NULL;
2390 break;
2391 case OP_SPACE:
2392 case OP_NOTSPACE:
2393 tree = build_charclass_op (dfa, regexp->trans,
2394 (const unsigned char *) "space",
2395 (const unsigned char *) "",
2396 token->type == OP_NOTSPACE, err);
2397 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2398 return NULL;
2399 break;
2400 case OP_ALT:
2401 case END_OF_RE:
2402 return NULL;
2403 case BACK_SLASH:
2404 *err = REG_EESCAPE;
2405 return NULL;
2406 default:
2407 /* Must not happen? */
2408 #ifdef DEBUG
2409 assert (0);
2410 #endif
2411 return NULL;
2412 }
2413 fetch_token (token, regexp, syntax);
2414
2415 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2416 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2417 {
2418 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2419 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2420 {
2421 if (tree != NULL)
2422 postorder (tree, free_tree, NULL);
2423 return NULL;
2424 }
2425 tree = dup_tree;
2426 /* In BRE consecutive duplications are not allowed. */
2427 if ((syntax & RE_CONTEXT_INVALID_DUP)
2428 && (token->type == OP_DUP_ASTERISK
2429 || token->type == OP_OPEN_DUP_NUM))
2430 {
2431 if (tree != NULL)
2432 postorder (tree, free_tree, NULL);
2433 *err = REG_BADRPT;
2434 return NULL;
2435 }
2436 }
2437
2438 return tree;
2439 }
2440
2441 /* This function build the following tree, from regular expression
2442 (<reg_exp>):
2443 SUBEXP
2444 |
2445 <reg_exp>
2446 */
2447
2448 static bin_tree_t *
2449 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2450 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2451 {
2452 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2453 bin_tree_t *tree;
2454 size_t cur_nsub;
2455 cur_nsub = preg->re_nsub++;
2456
2457 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2458
2459 /* The subexpression may be a null string. */
2460 if (token->type == OP_CLOSE_SUBEXP)
2461 tree = NULL;
2462 else
2463 {
2464 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2465 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2466 {
2467 if (tree != NULL)
2468 postorder (tree, free_tree, NULL);
2469 *err = REG_EPAREN;
2470 }
2471 if (BE (*err != REG_NOERROR, 0))
2472 return NULL;
2473 }
2474
2475 if (cur_nsub <= '9' - '1')
2476 dfa->completed_bkref_map |= 1 << cur_nsub;
2477
2478 tree = create_tree (dfa, tree, NULL, SUBEXP);
2479 if (BE (tree == NULL, 0))
2480 {
2481 *err = REG_ESPACE;
2482 return NULL;
2483 }
2484 tree->token.opr.idx = cur_nsub;
2485 return tree;
2486 }
2487
2488 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2489
2490 static bin_tree_t *
2491 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2492 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2493 {
2494 bin_tree_t *tree = NULL, *old_tree = NULL;
2495 int i, start, end, start_idx = re_string_cur_idx (regexp);
2496 re_token_t start_token = *token;
2497
2498 if (token->type == OP_OPEN_DUP_NUM)
2499 {
2500 end = 0;
2501 start = fetch_number (regexp, token, syntax);
2502 if (start == -1)
2503 {
2504 if (token->type == CHARACTER && token->opr.c == ',')
2505 start = 0; /* We treat "{,m}" as "{0,m}". */
2506 else
2507 {
2508 *err = REG_BADBR; /* <re>{} is invalid. */
2509 return NULL;
2510 }
2511 }
2512 if (BE (start != -2, 1))
2513 {
2514 /* We treat "{n}" as "{n,n}". */
2515 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2516 : ((token->type == CHARACTER && token->opr.c == ',')
2517 ? fetch_number (regexp, token, syntax) : -2));
2518 }
2519 if (BE (start == -2 || end == -2, 0))
2520 {
2521 /* Invalid sequence. */
2522 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2523 {
2524 if (token->type == END_OF_RE)
2525 *err = REG_EBRACE;
2526 else
2527 *err = REG_BADBR;
2528
2529 return NULL;
2530 }
2531
2532 /* If the syntax bit is set, rollback. */
2533 re_string_set_index (regexp, start_idx);
2534 *token = start_token;
2535 token->type = CHARACTER;
2536 /* mb_partial and word_char bits should be already initialized by
2537 peek_token. */
2538 return elem;
2539 }
2540
2541 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2542 {
2543 /* First number greater than second. */
2544 *err = REG_BADBR;
2545 return NULL;
2546 }
2547 }
2548 else
2549 {
2550 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552 }
2553
2554 fetch_token (token, regexp, syntax);
2555
2556 if (BE (elem == NULL, 0))
2557 return NULL;
2558 if (BE (start == 0 && end == 0, 0))
2559 {
2560 postorder (elem, free_tree, NULL);
2561 return NULL;
2562 }
2563
2564 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2565 if (BE (start > 0, 0))
2566 {
2567 tree = elem;
2568 for (i = 2; i <= start; ++i)
2569 {
2570 elem = duplicate_tree (elem, dfa);
2571 tree = create_tree (dfa, tree, elem, CONCAT);
2572 if (BE (elem == NULL || tree == NULL, 0))
2573 goto parse_dup_op_espace;
2574 }
2575
2576 if (start == end)
2577 return tree;
2578
2579 /* Duplicate ELEM before it is marked optional. */
2580 elem = duplicate_tree (elem, dfa);
2581 old_tree = tree;
2582 }
2583 else
2584 old_tree = NULL;
2585
2586 if (elem->token.type == SUBEXP)
2587 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2588
2589 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2590 if (BE (tree == NULL, 0))
2591 goto parse_dup_op_espace;
2592
2593 /* This loop is actually executed only when end != -1,
2594 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2595 already created the start+1-th copy. */
2596 for (i = start + 2; i <= end; ++i)
2597 {
2598 elem = duplicate_tree (elem, dfa);
2599 tree = create_tree (dfa, tree, elem, CONCAT);
2600 if (BE (elem == NULL || tree == NULL, 0))
2601 goto parse_dup_op_espace;
2602
2603 tree = create_tree (dfa, tree, NULL, OP_ALT);
2604 if (BE (tree == NULL, 0))
2605 goto parse_dup_op_espace;
2606 }
2607
2608 if (old_tree)
2609 tree = create_tree (dfa, old_tree, tree, CONCAT);
2610
2611 return tree;
2612
2613 parse_dup_op_espace:
2614 *err = REG_ESPACE;
2615 return NULL;
2616 }
2617
2618 /* Size of the names for collating symbol/equivalence_class/character_class.
2619 I'm not sure, but maybe enough. */
2620 #define BRACKET_NAME_BUF_SIZE 32
2621
2622 #ifndef _LIBC
2623 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2624 Build the range expression which starts from START_ELEM, and ends
2625 at END_ELEM. The result are written to MBCSET and SBCSET.
2626 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2627 mbcset->range_ends, is a pointer argument sinse we may
2628 update it. */
2629
2630 static reg_errcode_t
2631 internal_function
2632 # ifdef RE_ENABLE_I18N
2633 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2634 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2635 # else /* not RE_ENABLE_I18N */
2636 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2637 bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2639 {
2640 unsigned int start_ch, end_ch;
2641 /* Equivalence Classes and Character Classes can't be a range start/end. */
2642 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2644 0))
2645 return REG_ERANGE;
2646
2647 /* We can handle no multi character collating elements without libc
2648 support. */
2649 if (BE ((start_elem->type == COLL_SYM
2650 && strlen ((char *) start_elem->opr.name) > 1)
2651 || (end_elem->type == COLL_SYM
2652 && strlen ((char *) end_elem->opr.name) > 1), 0))
2653 return REG_ECOLLATE;
2654
2655 # ifdef RE_ENABLE_I18N
2656 {
2657 wchar_t wc;
2658 wint_t start_wc;
2659 wint_t end_wc;
2660 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2661
2662 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664 : 0));
2665 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667 : 0));
2668 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669 ? __btowc (start_ch) : start_elem->opr.wch);
2670 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671 ? __btowc (end_ch) : end_elem->opr.wch);
2672 if (start_wc == WEOF || end_wc == WEOF)
2673 return REG_ECOLLATE;
2674 cmp_buf[0] = start_wc;
2675 cmp_buf[4] = end_wc;
2676 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2677 return REG_ERANGE;
2678
2679 /* Got valid collation sequence values, add them as a new entry.
2680 However, for !_LIBC we have no collation elements: if the
2681 character set is single byte, the single byte character set
2682 that we build below suffices. parse_bracket_exp passes
2683 no MBCSET if dfa->mb_cur_max == 1. */
2684 if (mbcset)
2685 {
2686 /* Check the space of the arrays. */
2687 if (BE (*range_alloc == mbcset->nranges, 0))
2688 {
2689 /* There is not enough space, need realloc. */
2690 wchar_t *new_array_start, *new_array_end;
2691 int new_nranges;
2692
2693 /* +1 in case of mbcset->nranges is 0. */
2694 new_nranges = 2 * mbcset->nranges + 1;
2695 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2696 are NULL if *range_alloc == 0. */
2697 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2698 new_nranges);
2699 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2700 new_nranges);
2701
2702 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2703 return REG_ESPACE;
2704
2705 mbcset->range_starts = new_array_start;
2706 mbcset->range_ends = new_array_end;
2707 *range_alloc = new_nranges;
2708 }
2709
2710 mbcset->range_starts[mbcset->nranges] = start_wc;
2711 mbcset->range_ends[mbcset->nranges++] = end_wc;
2712 }
2713
2714 /* Build the table for single byte characters. */
2715 for (wc = 0; wc < SBC_MAX; ++wc)
2716 {
2717 cmp_buf[2] = wc;
2718 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2719 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2720 bitset_set (sbcset, wc);
2721 }
2722 }
2723 # else /* not RE_ENABLE_I18N */
2724 {
2725 unsigned int ch;
2726 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2727 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2728 : 0));
2729 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2730 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2731 : 0));
2732 if (start_ch > end_ch)
2733 return REG_ERANGE;
2734 /* Build the table for single byte characters. */
2735 for (ch = 0; ch < SBC_MAX; ++ch)
2736 if (start_ch <= ch && ch <= end_ch)
2737 bitset_set (sbcset, ch);
2738 }
2739 # endif /* not RE_ENABLE_I18N */
2740 return REG_NOERROR;
2741 }
2742 #endif /* not _LIBC */
2743
2744 #ifndef _LIBC
2745 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2746 Build the collating element which is represented by NAME.
2747 The result are written to MBCSET and SBCSET.
2748 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2749 pointer argument since we may update it. */
2750
2751 static reg_errcode_t
2752 internal_function
2753 # ifdef RE_ENABLE_I18N
2754 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2755 int *coll_sym_alloc, const unsigned char *name)
2756 # else /* not RE_ENABLE_I18N */
2757 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2758 # endif /* not RE_ENABLE_I18N */
2759 {
2760 size_t name_len = strlen ((const char *) name);
2761 if (BE (name_len != 1, 0))
2762 return REG_ECOLLATE;
2763 else
2764 {
2765 bitset_set (sbcset, name[0]);
2766 return REG_NOERROR;
2767 }
2768 }
2769 #endif /* not _LIBC */
2770
2771 /* This function parse bracket expression like "[abc]", "[a-c]",
2772 "[[.a-a.]]" etc. */
2773
2774 static bin_tree_t *
2775 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2776 reg_syntax_t syntax, reg_errcode_t *err)
2777 {
2778 #ifdef _LIBC
2779 const unsigned char *collseqmb;
2780 const char *collseqwc;
2781 uint32_t nrules;
2782 int32_t table_size;
2783 const int32_t *symb_table;
2784 const unsigned char *extra;
2785
2786 /* Local function for parse_bracket_exp used in _LIBC environement.
2787 Seek the collating symbol entry correspondings to NAME.
2788 Return the index of the symbol in the SYMB_TABLE,
2789 or -1 if not found. */
2790
2791 auto inline int32_t
2792 __attribute ((always_inline))
2793 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2794 {
2795 int32_t elem;
2796
2797 for (elem = 0; elem < table_size; elem++)
2798 if (symb_table[2 * elem] != 0)
2799 {
2800 int32_t idx = symb_table[2 * elem + 1];
2801 /* Skip the name of collating element name. */
2802 idx += 1 + extra[idx];
2803 if (/* Compare the length of the name. */
2804 name_len == extra[idx]
2805 /* Compare the name. */
2806 && memcmp (name, &extra[idx + 1], name_len) == 0)
2807 /* Yep, this is the entry. */
2808 return elem;
2809 }
2810 return -1;
2811 }
2812
2813 /* Local function for parse_bracket_exp used in _LIBC environment.
2814 Look up the collation sequence value of BR_ELEM.
2815 Return the value if succeeded, UINT_MAX otherwise. */
2816
2817 auto inline unsigned int
2818 __attribute ((always_inline))
2819 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2820 {
2821 if (br_elem->type == SB_CHAR)
2822 {
2823 /*
2824 if (MB_CUR_MAX == 1)
2825 */
2826 if (nrules == 0)
2827 return collseqmb[br_elem->opr.ch];
2828 else
2829 {
2830 wint_t wc = __btowc (br_elem->opr.ch);
2831 return __collseq_table_lookup (collseqwc, wc);
2832 }
2833 }
2834 else if (br_elem->type == MB_CHAR)
2835 {
2836 if (nrules != 0)
2837 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2838 }
2839 else if (br_elem->type == COLL_SYM)
2840 {
2841 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2842 if (nrules != 0)
2843 {
2844 int32_t elem, idx;
2845 elem = seek_collating_symbol_entry (br_elem->opr.name,
2846 sym_name_len);
2847 if (elem != -1)
2848 {
2849 /* We found the entry. */
2850 idx = symb_table[2 * elem + 1];
2851 /* Skip the name of collating element name. */
2852 idx += 1 + extra[idx];
2853 /* Skip the byte sequence of the collating element. */
2854 idx += 1 + extra[idx];
2855 /* Adjust for the alignment. */
2856 idx = (idx + 3) & ~3;
2857 /* Skip the multibyte collation sequence value. */
2858 idx += sizeof (unsigned int);
2859 /* Skip the wide char sequence of the collating element. */
2860 idx += sizeof (unsigned int) *
2861 (1 + *(unsigned int *) (extra + idx));
2862 /* Return the collation sequence value. */
2863 return *(unsigned int *) (extra + idx);
2864 }
2865 else if (sym_name_len == 1)
2866 {
2867 /* No valid character. Match it as a single byte
2868 character. */
2869 return collseqmb[br_elem->opr.name[0]];
2870 }
2871 }
2872 else if (sym_name_len == 1)
2873 return collseqmb[br_elem->opr.name[0]];
2874 }
2875 return UINT_MAX;
2876 }
2877
2878 /* Local function for parse_bracket_exp used in _LIBC environement.
2879 Build the range expression which starts from START_ELEM, and ends
2880 at END_ELEM. The result are written to MBCSET and SBCSET.
2881 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2882 mbcset->range_ends, is a pointer argument sinse we may
2883 update it. */
2884
2885 auto inline reg_errcode_t
2886 __attribute ((always_inline))
2887 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2888 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2889 {
2890 unsigned int ch;
2891 uint32_t start_collseq;
2892 uint32_t end_collseq;
2893
2894 /* Equivalence Classes and Character Classes can't be a range
2895 start/end. */
2896 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2897 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2898 0))
2899 return REG_ERANGE;
2900
2901 start_collseq = lookup_collation_sequence_value (start_elem);
2902 end_collseq = lookup_collation_sequence_value (end_elem);
2903 /* Check start/end collation sequence values. */
2904 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2905 return REG_ECOLLATE;
2906 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2907 return REG_ERANGE;
2908
2909 /* Got valid collation sequence values, add them as a new entry.
2910 However, if we have no collation elements, and the character set
2911 is single byte, the single byte character set that we
2912 build below suffices. */
2913 if (nrules > 0 || dfa->mb_cur_max > 1)
2914 {
2915 /* Check the space of the arrays. */
2916 if (BE (*range_alloc == mbcset->nranges, 0))
2917 {
2918 /* There is not enough space, need realloc. */
2919 uint32_t *new_array_start;
2920 uint32_t *new_array_end;
2921 int new_nranges;
2922
2923 /* +1 in case of mbcset->nranges is 0. */
2924 new_nranges = 2 * mbcset->nranges + 1;
2925 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2926 new_nranges);
2927 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2928 new_nranges);
2929
2930 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2931 return REG_ESPACE;
2932
2933 mbcset->range_starts = new_array_start;
2934 mbcset->range_ends = new_array_end;
2935 *range_alloc = new_nranges;
2936 }
2937
2938 mbcset->range_starts[mbcset->nranges] = start_collseq;
2939 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2940 }
2941
2942 /* Build the table for single byte characters. */
2943 for (ch = 0; ch < SBC_MAX; ch++)
2944 {
2945 uint32_t ch_collseq;
2946 /*
2947 if (MB_CUR_MAX == 1)
2948 */
2949 if (nrules == 0)
2950 ch_collseq = collseqmb[ch];
2951 else
2952 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2953 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2954 bitset_set (sbcset, ch);
2955 }
2956 return REG_NOERROR;
2957 }
2958
2959 /* Local function for parse_bracket_exp used in _LIBC environement.
2960 Build the collating element which is represented by NAME.
2961 The result are written to MBCSET and SBCSET.
2962 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2963 pointer argument sinse we may update it. */
2964
2965 auto inline reg_errcode_t
2966 __attribute ((always_inline))
2967 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2968 int *coll_sym_alloc, const unsigned char *name)
2969 {
2970 int32_t elem, idx;
2971 size_t name_len = strlen ((const char *) name);
2972 if (nrules != 0)
2973 {
2974 elem = seek_collating_symbol_entry (name, name_len);
2975 if (elem != -1)
2976 {
2977 /* We found the entry. */
2978 idx = symb_table[2 * elem + 1];
2979 /* Skip the name of collating element name. */
2980 idx += 1 + extra[idx];
2981 }
2982 else if (name_len == 1)
2983 {
2984 /* No valid character, treat it as a normal
2985 character. */
2986 bitset_set (sbcset, name[0]);
2987 return REG_NOERROR;
2988 }
2989 else
2990 return REG_ECOLLATE;
2991
2992 /* Got valid collation sequence, add it as a new entry. */
2993 /* Check the space of the arrays. */
2994 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2995 {
2996 /* Not enough, realloc it. */
2997 /* +1 in case of mbcset->ncoll_syms is 0. */
2998 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2999 /* Use realloc since mbcset->coll_syms is NULL
3000 if *alloc == 0. */
3001 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3002 new_coll_sym_alloc);
3003 if (BE (new_coll_syms == NULL, 0))
3004 return REG_ESPACE;
3005 mbcset->coll_syms = new_coll_syms;
3006 *coll_sym_alloc = new_coll_sym_alloc;
3007 }
3008 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3009 return REG_NOERROR;
3010 }
3011 else
3012 {
3013 if (BE (name_len != 1, 0))
3014 return REG_ECOLLATE;
3015 else
3016 {
3017 bitset_set (sbcset, name[0]);
3018 return REG_NOERROR;
3019 }
3020 }
3021 }
3022 #endif
3023
3024 re_token_t br_token;
3025 re_bitset_ptr_t sbcset;
3026 #ifdef RE_ENABLE_I18N
3027 re_charset_t *mbcset;
3028 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3029 int equiv_class_alloc = 0, char_class_alloc = 0;
3030 #endif /* not RE_ENABLE_I18N */
3031 int non_match = 0;
3032 bin_tree_t *work_tree;
3033 int token_len;
3034 int first_round = 1;
3035 #ifdef _LIBC
3036 collseqmb = (const unsigned char *)
3037 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3038 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3039 if (nrules)
3040 {
3041 /*
3042 if (MB_CUR_MAX > 1)
3043 */
3044 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3045 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3046 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3047 _NL_COLLATE_SYMB_TABLEMB);
3048 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3049 _NL_COLLATE_SYMB_EXTRAMB);
3050 }
3051 #endif
3052 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3053 #ifdef RE_ENABLE_I18N
3054 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3055 #endif /* RE_ENABLE_I18N */
3056 #ifdef RE_ENABLE_I18N
3057 if (BE (sbcset == NULL || mbcset == NULL, 0))
3058 #else
3059 if (BE (sbcset == NULL, 0))
3060 #endif /* RE_ENABLE_I18N */
3061 {
3062 re_free (sbcset);
3063 #ifdef RE_ENABLE_I18N
3064 re_free (mbcset);
3065 #endif
3066 *err = REG_ESPACE;
3067 return NULL;
3068 }
3069
3070 token_len = peek_token_bracket (token, regexp, syntax);
3071 if (BE (token->type == END_OF_RE, 0))
3072 {
3073 *err = REG_BADPAT;
3074 goto parse_bracket_exp_free_return;
3075 }
3076 if (token->type == OP_NON_MATCH_LIST)
3077 {
3078 #ifdef RE_ENABLE_I18N
3079 mbcset->non_match = 1;
3080 #endif /* not RE_ENABLE_I18N */
3081 non_match = 1;
3082 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3083 bitset_set (sbcset, '\n');
3084 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3085 token_len = peek_token_bracket (token, regexp, syntax);
3086 if (BE (token->type == END_OF_RE, 0))
3087 {
3088 *err = REG_BADPAT;
3089 goto parse_bracket_exp_free_return;
3090 }
3091 }
3092
3093 /* We treat the first ']' as a normal character. */
3094 if (token->type == OP_CLOSE_BRACKET)
3095 token->type = CHARACTER;
3096
3097 while (1)
3098 {
3099 bracket_elem_t start_elem, end_elem;
3100 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3101 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3102 reg_errcode_t ret;
3103 int token_len2 = 0, is_range_exp = 0;
3104 re_token_t token2;
3105
3106 start_elem.opr.name = start_name_buf;
3107 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3108 syntax, first_round);
3109 if (BE (ret != REG_NOERROR, 0))
3110 {
3111 *err = ret;
3112 goto parse_bracket_exp_free_return;
3113 }
3114 first_round = 0;
3115
3116 /* Get information about the next token. We need it in any case. */
3117 token_len = peek_token_bracket (token, regexp, syntax);
3118
3119 /* Do not check for ranges if we know they are not allowed. */
3120 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3121 {
3122 if (BE (token->type == END_OF_RE, 0))
3123 {
3124 *err = REG_EBRACK;
3125 goto parse_bracket_exp_free_return;
3126 }
3127 if (token->type == OP_CHARSET_RANGE)
3128 {
3129 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3130 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3131 if (BE (token2.type == END_OF_RE, 0))
3132 {
3133 *err = REG_EBRACK;
3134 goto parse_bracket_exp_free_return;
3135 }
3136 if (token2.type == OP_CLOSE_BRACKET)
3137 {
3138 /* We treat the last '-' as a normal character. */
3139 re_string_skip_bytes (regexp, -token_len);
3140 token->type = CHARACTER;
3141 }
3142 else
3143 is_range_exp = 1;
3144 }
3145 }
3146
3147 if (is_range_exp == 1)
3148 {
3149 end_elem.opr.name = end_name_buf;
3150 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3151 dfa, syntax, 1);
3152 if (BE (ret != REG_NOERROR, 0))
3153 {
3154 *err = ret;
3155 goto parse_bracket_exp_free_return;
3156 }
3157
3158 token_len = peek_token_bracket (token, regexp, syntax);
3159
3160 #ifdef _LIBC
3161 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3162 &start_elem, &end_elem);
3163 #else
3164 # ifdef RE_ENABLE_I18N
3165 *err = build_range_exp (sbcset,
3166 dfa->mb_cur_max > 1 ? mbcset : NULL,
3167 &range_alloc, &start_elem, &end_elem);
3168 # else
3169 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3170 # endif
3171 #endif /* RE_ENABLE_I18N */
3172 if (BE (*err != REG_NOERROR, 0))
3173 goto parse_bracket_exp_free_return;
3174 }
3175 else
3176 {
3177 switch (start_elem.type)
3178 {
3179 case SB_CHAR:
3180 bitset_set (sbcset, start_elem.opr.ch);
3181 break;
3182 #ifdef RE_ENABLE_I18N
3183 case MB_CHAR:
3184 /* Check whether the array has enough space. */
3185 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3186 {
3187 wchar_t *new_mbchars;
3188 /* Not enough, realloc it. */
3189 /* +1 in case of mbcset->nmbchars is 0. */
3190 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3191 /* Use realloc since array is NULL if *alloc == 0. */
3192 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3193 mbchar_alloc);
3194 if (BE (new_mbchars == NULL, 0))
3195 goto parse_bracket_exp_espace;
3196 mbcset->mbchars = new_mbchars;
3197 }
3198 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3199 break;
3200 #endif /* RE_ENABLE_I18N */
3201 case EQUIV_CLASS:
3202 *err = build_equiv_class (sbcset,
3203 #ifdef RE_ENABLE_I18N
3204 mbcset, &equiv_class_alloc,
3205 #endif /* RE_ENABLE_I18N */
3206 start_elem.opr.name);
3207 if (BE (*err != REG_NOERROR, 0))
3208 goto parse_bracket_exp_free_return;
3209 break;
3210 case COLL_SYM:
3211 *err = build_collating_symbol (sbcset,
3212 #ifdef RE_ENABLE_I18N
3213 mbcset, &coll_sym_alloc,
3214 #endif /* RE_ENABLE_I18N */
3215 start_elem.opr.name);
3216 if (BE (*err != REG_NOERROR, 0))
3217 goto parse_bracket_exp_free_return;
3218 break;
3219 case CHAR_CLASS:
3220 *err = build_charclass (regexp->trans, sbcset,
3221 #ifdef RE_ENABLE_I18N
3222 mbcset, &char_class_alloc,
3223 #endif /* RE_ENABLE_I18N */
3224 start_elem.opr.name, syntax);
3225 if (BE (*err != REG_NOERROR, 0))
3226 goto parse_bracket_exp_free_return;
3227 break;
3228 default:
3229 assert (0);
3230 break;
3231 }
3232 }
3233 if (BE (token->type == END_OF_RE, 0))
3234 {
3235 *err = REG_EBRACK;
3236 goto parse_bracket_exp_free_return;
3237 }
3238 if (token->type == OP_CLOSE_BRACKET)
3239 break;
3240 }
3241
3242 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3243
3244 /* If it is non-matching list. */
3245 if (non_match)
3246 bitset_not (sbcset);
3247
3248 #ifdef RE_ENABLE_I18N
3249 /* Ensure only single byte characters are set. */
3250 if (dfa->mb_cur_max > 1)
3251 bitset_mask (sbcset, dfa->sb_char);
3252
3253 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3254 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3255 || mbcset->non_match)))
3256 {
3257 bin_tree_t *mbc_tree;
3258 int sbc_idx;
3259 /* Build a tree for complex bracket. */
3260 dfa->has_mb_node = 1;
3261 br_token.type = COMPLEX_BRACKET;
3262 br_token.opr.mbcset = mbcset;
3263 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3264 if (BE (mbc_tree == NULL, 0))
3265 goto parse_bracket_exp_espace;
3266 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3267 if (sbcset[sbc_idx])
3268 break;
3269 /* If there are no bits set in sbcset, there is no point
3270 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3271 if (sbc_idx < BITSET_WORDS)
3272 {
3273 /* Build a tree for simple bracket. */
3274 br_token.type = SIMPLE_BRACKET;
3275 br_token.opr.sbcset = sbcset;
3276 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277 if (BE (work_tree == NULL, 0))
3278 goto parse_bracket_exp_espace;
3279
3280 /* Then join them by ALT node. */
3281 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3282 if (BE (work_tree == NULL, 0))
3283 goto parse_bracket_exp_espace;
3284 }
3285 else
3286 {
3287 re_free (sbcset);
3288 work_tree = mbc_tree;
3289 }
3290 }
3291 else
3292 #endif /* not RE_ENABLE_I18N */
3293 {
3294 #ifdef RE_ENABLE_I18N
3295 free_charset (mbcset);
3296 #endif
3297 /* Build a tree for simple bracket. */
3298 br_token.type = SIMPLE_BRACKET;
3299 br_token.opr.sbcset = sbcset;
3300 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301 if (BE (work_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3303 }
3304 return work_tree;
3305
3306 parse_bracket_exp_espace:
3307 *err = REG_ESPACE;
3308 parse_bracket_exp_free_return:
3309 re_free (sbcset);
3310 #ifdef RE_ENABLE_I18N
3311 free_charset (mbcset);
3312 #endif /* RE_ENABLE_I18N */
3313 return NULL;
3314 }
3315
3316 /* Parse an element in the bracket expression. */
3317
3318 static reg_errcode_t
3319 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3320 re_token_t *token, int token_len, re_dfa_t *dfa,
3321 reg_syntax_t syntax, int accept_hyphen)
3322 {
3323 #ifdef RE_ENABLE_I18N
3324 int cur_char_size;
3325 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3326 if (cur_char_size > 1)
3327 {
3328 elem->type = MB_CHAR;
3329 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3330 re_string_skip_bytes (regexp, cur_char_size);
3331 return REG_NOERROR;
3332 }
3333 #endif /* RE_ENABLE_I18N */
3334 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3335 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3336 || token->type == OP_OPEN_EQUIV_CLASS)
3337 return parse_bracket_symbol (elem, regexp, token);
3338 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3339 {
3340 /* A '-' must only appear as anything but a range indicator before
3341 the closing bracket. Everything else is an error. */
3342 re_token_t token2;
3343 (void) peek_token_bracket (&token2, regexp, syntax);
3344 if (token2.type != OP_CLOSE_BRACKET)
3345 /* The actual error value is not standardized since this whole
3346 case is undefined. But ERANGE makes good sense. */
3347 return REG_ERANGE;
3348 }
3349 elem->type = SB_CHAR;
3350 elem->opr.ch = token->opr.c;
3351 return REG_NOERROR;
3352 }
3353
3354 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3355 such as [:<character_class>:], [.<collating_element>.], and
3356 [=<equivalent_class>=]. */
3357
3358 static reg_errcode_t
3359 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3360 re_token_t *token)
3361 {
3362 unsigned char ch, delim = token->opr.c;
3363 int i = 0;
3364 if (re_string_eoi(regexp))
3365 return REG_EBRACK;
3366 for (;; ++i)
3367 {
3368 if (i >= BRACKET_NAME_BUF_SIZE)
3369 return REG_EBRACK;
3370 if (token->type == OP_OPEN_CHAR_CLASS)
3371 ch = re_string_fetch_byte_case (regexp);
3372 else
3373 ch = re_string_fetch_byte (regexp);
3374 if (re_string_eoi(regexp))
3375 return REG_EBRACK;
3376 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3377 break;
3378 elem->opr.name[i] = ch;
3379 }
3380 re_string_skip_bytes (regexp, 1);
3381 elem->opr.name[i] = '\0';
3382 switch (token->type)
3383 {
3384 case OP_OPEN_COLL_ELEM:
3385 elem->type = COLL_SYM;
3386 break;
3387 case OP_OPEN_EQUIV_CLASS:
3388 elem->type = EQUIV_CLASS;
3389 break;
3390 case OP_OPEN_CHAR_CLASS:
3391 elem->type = CHAR_CLASS;
3392 break;
3393 default:
3394 break;
3395 }
3396 return REG_NOERROR;
3397 }
3398
3399 /* Helper function for parse_bracket_exp.
3400 Build the equivalence class which is represented by NAME.
3401 The result are written to MBCSET and SBCSET.
3402 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3403 is a pointer argument sinse we may update it. */
3404
3405 static reg_errcode_t
3406 #ifdef RE_ENABLE_I18N
3407 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3408 int *equiv_class_alloc, const unsigned char *name)
3409 #else /* not RE_ENABLE_I18N */
3410 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3411 #endif /* not RE_ENABLE_I18N */
3412 {
3413 #ifdef _LIBC
3414 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3415 if (nrules != 0)
3416 {
3417 const int32_t *table, *indirect;
3418 const unsigned char *weights, *extra, *cp;
3419 unsigned char char_buf[2];
3420 int32_t idx1, idx2;
3421 unsigned int ch;
3422 size_t len;
3423 /* This #include defines a local function! */
3424 # include <locale/weight.h>
3425 /* Calculate the index for equivalence class. */
3426 cp = name;
3427 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3428 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3429 _NL_COLLATE_WEIGHTMB);
3430 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3431 _NL_COLLATE_EXTRAMB);
3432 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3433 _NL_COLLATE_INDIRECTMB);
3434 idx1 = findidx (&cp, -1);
3435 if (BE (idx1 == 0 || *cp != '\0', 0))
3436 /* This isn't a valid character. */
3437 return REG_ECOLLATE;
3438
3439 /* Build single byte matcing table for this equivalence class. */
3440 len = weights[idx1 & 0xffffff];
3441 for (ch = 0; ch < SBC_MAX; ++ch)
3442 {
3443 char_buf[0] = ch;
3444 cp = char_buf;
3445 idx2 = findidx (&cp, 1);
3446 /*
3447 idx2 = table[ch];
3448 */
3449 if (idx2 == 0)
3450 /* This isn't a valid character. */
3451 continue;
3452 /* Compare only if the length matches and the collation rule
3453 index is the same. */
3454 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3455 {
3456 int cnt = 0;
3457
3458 while (cnt <= len &&
3459 weights[(idx1 & 0xffffff) + 1 + cnt]
3460 == weights[(idx2 & 0xffffff) + 1 + cnt])
3461 ++cnt;
3462
3463 if (cnt > len)
3464 bitset_set (sbcset, ch);
3465 }
3466 }
3467 /* Check whether the array has enough space. */
3468 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3469 {
3470 /* Not enough, realloc it. */
3471 /* +1 in case of mbcset->nequiv_classes is 0. */
3472 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3473 /* Use realloc since the array is NULL if *alloc == 0. */
3474 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3475 int32_t,
3476 new_equiv_class_alloc);
3477 if (BE (new_equiv_classes == NULL, 0))
3478 return REG_ESPACE;
3479 mbcset->equiv_classes = new_equiv_classes;
3480 *equiv_class_alloc = new_equiv_class_alloc;
3481 }
3482 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3483 }
3484 else
3485 #endif /* _LIBC */
3486 {
3487 if (BE (strlen ((const char *) name) != 1, 0))
3488 return REG_ECOLLATE;
3489 bitset_set (sbcset, *name);
3490 }
3491 return REG_NOERROR;
3492 }
3493
3494 /* Helper function for parse_bracket_exp.
3495 Build the character class which is represented by NAME.
3496 The result are written to MBCSET and SBCSET.
3497 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3498 is a pointer argument sinse we may update it. */
3499
3500 static reg_errcode_t
3501 #ifdef RE_ENABLE_I18N
3502 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3503 re_charset_t *mbcset, int *char_class_alloc,
3504 const unsigned char *class_name, reg_syntax_t syntax)
3505 #else /* not RE_ENABLE_I18N */
3506 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3507 const unsigned char *class_name, reg_syntax_t syntax)
3508 #endif /* not RE_ENABLE_I18N */
3509 {
3510 int i;
3511 const char *name = (const char *) class_name;
3512
3513 /* In case of REG_ICASE "upper" and "lower" match the both of
3514 upper and lower cases. */
3515 if ((syntax & RE_ICASE)
3516 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3517 name = "alpha";
3518
3519 #ifdef RE_ENABLE_I18N
3520 /* Check the space of the arrays. */
3521 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3522 {
3523 /* Not enough, realloc it. */
3524 /* +1 in case of mbcset->nchar_classes is 0. */
3525 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3526 /* Use realloc since array is NULL if *alloc == 0. */
3527 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3528 new_char_class_alloc);
3529 if (BE (new_char_classes == NULL, 0))
3530 return REG_ESPACE;
3531 mbcset->char_classes = new_char_classes;
3532 *char_class_alloc = new_char_class_alloc;
3533 }
3534 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3535 #endif /* RE_ENABLE_I18N */
3536
3537 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3538 do { \
3539 if (BE (trans != NULL, 0)) \
3540 { \
3541 for (i = 0; i < SBC_MAX; ++i) \
3542 if (ctype_func (i)) \
3543 bitset_set (sbcset, trans[i]); \
3544 } \
3545 else \
3546 { \
3547 for (i = 0; i < SBC_MAX; ++i) \
3548 if (ctype_func (i)) \
3549 bitset_set (sbcset, i); \
3550 } \
3551 } while (0)
3552
3553 if (strcmp (name, "alnum") == 0)
3554 BUILD_CHARCLASS_LOOP (isalnum);
3555 else if (strcmp (name, "cntrl") == 0)
3556 BUILD_CHARCLASS_LOOP (iscntrl);
3557 else if (strcmp (name, "lower") == 0)
3558 BUILD_CHARCLASS_LOOP (islower);
3559 else if (strcmp (name, "space") == 0)
3560 BUILD_CHARCLASS_LOOP (isspace);
3561 else if (strcmp (name, "alpha") == 0)
3562 BUILD_CHARCLASS_LOOP (isalpha);
3563 else if (strcmp (name, "digit") == 0)
3564 BUILD_CHARCLASS_LOOP (isdigit);
3565 else if (strcmp (name, "print") == 0)
3566 BUILD_CHARCLASS_LOOP (isprint);
3567 else if (strcmp (name, "upper") == 0)
3568 BUILD_CHARCLASS_LOOP (isupper);
3569 else if (strcmp (name, "blank") == 0)
3570 BUILD_CHARCLASS_LOOP (isblank);
3571 else if (strcmp (name, "graph") == 0)
3572 BUILD_CHARCLASS_LOOP (isgraph);
3573 else if (strcmp (name, "punct") == 0)
3574 BUILD_CHARCLASS_LOOP (ispunct);
3575 else if (strcmp (name, "xdigit") == 0)
3576 BUILD_CHARCLASS_LOOP (isxdigit);
3577 else
3578 return REG_ECTYPE;
3579
3580 return REG_NOERROR;
3581 }
3582
3583 static bin_tree_t *
3584 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3585 const unsigned char *class_name,
3586 const unsigned char *extra, int non_match,
3587 reg_errcode_t *err)
3588 {
3589 re_bitset_ptr_t sbcset;
3590 #ifdef RE_ENABLE_I18N
3591 re_charset_t *mbcset;
3592 int alloc = 0;
3593 #endif /* not RE_ENABLE_I18N */
3594 reg_errcode_t ret;
3595 re_token_t br_token;
3596 bin_tree_t *tree;
3597
3598 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3599 #ifdef RE_ENABLE_I18N
3600 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3601 #endif /* RE_ENABLE_I18N */
3602
3603 #ifdef RE_ENABLE_I18N
3604 if (BE (sbcset == NULL || mbcset == NULL, 0))
3605 #else /* not RE_ENABLE_I18N */
3606 if (BE (sbcset == NULL, 0))
3607 #endif /* not RE_ENABLE_I18N */
3608 {
3609 *err = REG_ESPACE;
3610 return NULL;
3611 }
3612
3613 if (non_match)
3614 {
3615 #ifdef RE_ENABLE_I18N
3616 mbcset->non_match = 1;
3617 #endif /* not RE_ENABLE_I18N */
3618 }
3619
3620 /* We don't care the syntax in this case. */
3621 ret = build_charclass (trans, sbcset,
3622 #ifdef RE_ENABLE_I18N
3623 mbcset, &alloc,
3624 #endif /* RE_ENABLE_I18N */
3625 class_name, 0);
3626
3627 if (BE (ret != REG_NOERROR, 0))
3628 {
3629 re_free (sbcset);
3630 #ifdef RE_ENABLE_I18N
3631 free_charset (mbcset);
3632 #endif /* RE_ENABLE_I18N */
3633 *err = ret;
3634 return NULL;
3635 }
3636 /* \w match '_' also. */
3637 for (; *extra; extra++)
3638 bitset_set (sbcset, *extra);
3639
3640 /* If it is non-matching list. */
3641 if (non_match)
3642 bitset_not (sbcset);
3643
3644 #ifdef RE_ENABLE_I18N
3645 /* Ensure only single byte characters are set. */
3646 if (dfa->mb_cur_max > 1)
3647 bitset_mask (sbcset, dfa->sb_char);
3648 #endif
3649
3650 /* Build a tree for simple bracket. */
3651 br_token.type = SIMPLE_BRACKET;
3652 br_token.opr.sbcset = sbcset;
3653 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3654 if (BE (tree == NULL, 0))
3655 goto build_word_op_espace;
3656
3657 #ifdef RE_ENABLE_I18N
3658 if (dfa->mb_cur_max > 1)
3659 {
3660 bin_tree_t *mbc_tree;
3661 /* Build a tree for complex bracket. */
3662 br_token.type = COMPLEX_BRACKET;
3663 br_token.opr.mbcset = mbcset;
3664 dfa->has_mb_node = 1;
3665 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3666 if (BE (mbc_tree == NULL, 0))
3667 goto build_word_op_espace;
3668 /* Then join them by ALT node. */
3669 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3670 if (BE (mbc_tree != NULL, 1))
3671 return tree;
3672 }
3673 else
3674 {
3675 free_charset (mbcset);
3676 return tree;
3677 }
3678 #else /* not RE_ENABLE_I18N */
3679 return tree;
3680 #endif /* not RE_ENABLE_I18N */
3681
3682 build_word_op_espace:
3683 re_free (sbcset);
3684 #ifdef RE_ENABLE_I18N
3685 free_charset (mbcset);
3686 #endif /* RE_ENABLE_I18N */
3687 *err = REG_ESPACE;
3688 return NULL;
3689 }
3690
3691 /* This is intended for the expressions like "a{1,3}".
3692 Fetch a number from `input', and return the number.
3693 Return -1, if the number field is empty like "{,1}".
3694 Return -2, If an error is occured. */
3695
3696 static int
3697 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3698 {
3699 int num = -1;
3700 unsigned char c;
3701 while (1)
3702 {
3703 fetch_token (token, input, syntax);
3704 c = token->opr.c;
3705 if (BE (token->type == END_OF_RE, 0))
3706 return -2;
3707 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3708 break;
3709 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3710 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3711 num = (num > RE_DUP_MAX) ? -2 : num;
3712 }
3713 return num;
3714 }
3715 \f
3716 #ifdef RE_ENABLE_I18N
3717 static void
3718 free_charset (re_charset_t *cset)
3719 {
3720 re_free (cset->mbchars);
3721 # ifdef _LIBC
3722 re_free (cset->coll_syms);
3723 re_free (cset->equiv_classes);
3724 re_free (cset->range_starts);
3725 re_free (cset->range_ends);
3726 # endif
3727 re_free (cset->char_classes);
3728 re_free (cset);
3729 }
3730 #endif /* RE_ENABLE_I18N */
3731 \f
3732 /* Functions for binary tree operation. */
3733
3734 /* Create a tree node. */
3735
3736 static bin_tree_t *
3737 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738 re_token_type_t type)
3739 {
3740 re_token_t t;
3741 t.type = type;
3742 return create_token_tree (dfa, left, right, &t);
3743 }
3744
3745 static bin_tree_t *
3746 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3747 const re_token_t *token)
3748 {
3749 bin_tree_t *tree;
3750 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3751 {
3752 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3753
3754 if (storage == NULL)
3755 return NULL;
3756 storage->next = dfa->str_tree_storage;
3757 dfa->str_tree_storage = storage;
3758 dfa->str_tree_storage_idx = 0;
3759 }
3760 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3761
3762 tree->parent = NULL;
3763 tree->left = left;
3764 tree->right = right;
3765 tree->token = *token;
3766 tree->token.duplicated = 0;
3767 tree->token.opt_subexp = 0;
3768 tree->first = NULL;
3769 tree->next = NULL;
3770 tree->node_idx = -1;
3771
3772 if (left != NULL)
3773 left->parent = tree;
3774 if (right != NULL)
3775 right->parent = tree;
3776 return tree;
3777 }
3778
3779 /* Mark the tree SRC as an optional subexpression.
3780 To be called from preorder or postorder. */
3781
3782 static reg_errcode_t
3783 mark_opt_subexp (void *extra, bin_tree_t *node)
3784 {
3785 int idx = (int) (long) extra;
3786 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3787 node->token.opt_subexp = 1;
3788
3789 return REG_NOERROR;
3790 }
3791
3792 /* Free the allocated memory inside NODE. */
3793
3794 static void
3795 free_token (re_token_t *node)
3796 {
3797 #ifdef RE_ENABLE_I18N
3798 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3799 free_charset (node->opr.mbcset);
3800 else
3801 #endif /* RE_ENABLE_I18N */
3802 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3803 re_free (node->opr.sbcset);
3804 }
3805
3806 /* Worker function for tree walking. Free the allocated memory inside NODE
3807 and its children. */
3808
3809 static reg_errcode_t
3810 free_tree (void *extra, bin_tree_t *node)
3811 {
3812 free_token (&node->token);
3813 return REG_NOERROR;
3814 }
3815
3816
3817 /* Duplicate the node SRC, and return new node. This is a preorder
3818 visit similar to the one implemented by the generic visitor, but
3819 we need more infrastructure to maintain two parallel trees --- so,
3820 it's easier to duplicate. */
3821
3822 static bin_tree_t *
3823 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3824 {
3825 const bin_tree_t *node;
3826 bin_tree_t *dup_root;
3827 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3828
3829 for (node = root; ; )
3830 {
3831 /* Create a new tree and link it back to the current parent. */
3832 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3833 if (*p_new == NULL)
3834 return NULL;
3835 (*p_new)->parent = dup_node;
3836 (*p_new)->token.duplicated = 1;
3837 dup_node = *p_new;
3838
3839 /* Go to the left node, or up and to the right. */
3840 if (node->left)
3841 {
3842 node = node->left;
3843 p_new = &dup_node->left;
3844 }
3845 else
3846 {
3847 const bin_tree_t *prev = NULL;
3848 while (node->right == prev || node->right == NULL)
3849 {
3850 prev = node;
3851 node = node->parent;
3852 dup_node = dup_node->parent;
3853 if (!node)
3854 return dup_root;
3855 }
3856 node = node->right;
3857 p_new = &dup_node->right;
3858 }
3859 }
3860 }