]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
6771dbb29980b636765b6a92d00b878fcbfab382
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 \f
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129 const char __re_error_msgid[] attribute_hidden =
130 {
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203 \f
204 /* Entry points for GNU code. */
205
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
209
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
212
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218 {
219 reg_errcode_t ret;
220
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225
226 /* Match anchors at newline. */
227 bufp->newline_anchor = 1;
228
229 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230
231 if (!ret)
232 return NULL;
233 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 }
235 #ifdef _LIBC
236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
238
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options;
245
246
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
250
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
253
254 reg_syntax_t
255 re_set_syntax (syntax)
256 reg_syntax_t syntax;
257 {
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262 }
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266
267 int
268 re_compile_fastmap (bufp)
269 struct re_pattern_buffer *bufp;
270 {
271 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
272 char *fastmap = bufp->fastmap;
273
274 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
275 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
276 if (dfa->init_state != dfa->init_state_word)
277 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
278 if (dfa->init_state != dfa->init_state_nl)
279 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
280 if (dfa->init_state != dfa->init_state_begbuf)
281 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
282 bufp->fastmap_accurate = 1;
283 return 0;
284 }
285 #ifdef _LIBC
286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
287 #endif
288
289 static inline void
290 __attribute ((always_inline))
291 re_set_fastmap (char *fastmap, int icase, int ch)
292 {
293 fastmap[ch] = 1;
294 if (icase)
295 fastmap[tolower (ch)] = 1;
296 }
297
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
300
301 static void
302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
303 char *fastmap)
304 {
305 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306 int node_cnt;
307 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
308 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 {
310 int node = init_state->nodes.elems[node_cnt];
311 re_token_type_t type = dfa->nodes[node].type;
312
313 if (type == CHARACTER)
314 {
315 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 {
319 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
320 wchar_t wc;
321 mbstate_t state;
322
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, 0, buf[0]);
335 }
336 #endif
337 }
338 else if (type == SIMPLE_BRACKET)
339 {
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 {
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
348 }
349 }
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
352 {
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 int i;
355
356 # ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
365 {
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
371 }
372 # endif /* _LIBC */
373
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 || cset->nequiv_classes
382 # endif /* _LIBC */
383 ))
384 {
385 unsigned char c = 0;
386 do
387 {
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
392 }
393 while (++c != 0);
394 }
395
396 else
397 {
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
400 {
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 {
408 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 }
412 }
413 }
414 }
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
421 {
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
426 }
427 }
428 }
429 \f
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
432
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
435
436 `buffer' to the compiled pattern;
437 `used' to the length of the compiled pattern;
438 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 `fastmap' to an allocated space for the fastmap;
443 `fastmap_accurate' to zero;
444 `re_nsub' to the number of subexpressions in PATTERN.
445
446 PATTERN is the address of the pattern string.
447
448 CFLAGS is a series of bits which affect compilation.
449
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
452
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
455
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
458
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
462
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
465
466 int
467 regcomp (preg, pattern, cflags)
468 regex_t *__restrict preg;
469 const char *__restrict pattern;
470 int cflags;
471 {
472 reg_errcode_t ret;
473 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
474 : RE_SYNTAX_POSIX_BASIC);
475
476 preg->buffer = NULL;
477 preg->allocated = 0;
478 preg->used = 0;
479
480 /* Try to allocate space for the fastmap. */
481 preg->fastmap = re_malloc (char, SBC_MAX);
482 if (BE (preg->fastmap == NULL, 0))
483 return REG_ESPACE;
484
485 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
486
487 /* If REG_NEWLINE is set, newlines are treated differently. */
488 if (cflags & REG_NEWLINE)
489 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
490 syntax &= ~RE_DOT_NEWLINE;
491 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
492 /* It also changes the matching behavior. */
493 preg->newline_anchor = 1;
494 }
495 else
496 preg->newline_anchor = 0;
497 preg->no_sub = !!(cflags & REG_NOSUB);
498 preg->translate = NULL;
499
500 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
501
502 /* POSIX doesn't distinguish between an unmatched open-group and an
503 unmatched close-group: both are REG_EPAREN. */
504 if (ret == REG_ERPAREN)
505 ret = REG_EPAREN;
506
507 /* We have already checked preg->fastmap != NULL. */
508 if (BE (ret == REG_NOERROR, 1))
509 /* Compute the fastmap now, since regexec cannot modify the pattern
510 buffer. This function never fails in this implementation. */
511 (void) re_compile_fastmap (preg);
512 else
513 {
514 /* Some error occurred while compiling the expression. */
515 re_free (preg->fastmap);
516 preg->fastmap = NULL;
517 }
518
519 return (int) ret;
520 }
521 #ifdef _LIBC
522 weak_alias (__regcomp, regcomp)
523 #endif
524
525 /* Returns a message corresponding to an error code, ERRCODE, returned
526 from either regcomp or regexec. We don't use PREG here. */
527
528 size_t
529 regerror (errcode, preg, errbuf, errbuf_size)
530 int errcode;
531 const regex_t *__restrict preg;
532 char *__restrict errbuf;
533 size_t errbuf_size;
534 {
535 const char *msg;
536 size_t msg_size;
537
538 if (BE (errcode < 0
539 || errcode >= (int) (sizeof (__re_error_msgid_idx)
540 / sizeof (__re_error_msgid_idx[0])), 0))
541 /* Only error codes returned by the rest of the code should be passed
542 to this routine. If we are given anything else, or if other regex
543 code generates an invalid error code, then the program has a bug.
544 Dump core so we can fix it. */
545 abort ();
546
547 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
548
549 msg_size = strlen (msg) + 1; /* Includes the null. */
550
551 if (BE (errbuf_size != 0, 1))
552 {
553 if (BE (msg_size > errbuf_size, 0))
554 {
555 #if defined HAVE_MEMPCPY || defined _LIBC
556 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
557 #else
558 memcpy (errbuf, msg, errbuf_size - 1);
559 errbuf[errbuf_size - 1] = 0;
560 #endif
561 }
562 else
563 memcpy (errbuf, msg, msg_size);
564 }
565
566 return msg_size;
567 }
568 #ifdef _LIBC
569 weak_alias (__regerror, regerror)
570 #endif
571
572
573 #ifdef RE_ENABLE_I18N
574 /* This static array is used for the map to single-byte characters when
575 UTF-8 is used. Otherwise we would allocate memory just to initialize
576 it the same all the time. UTF-8 is the preferred encoding so this is
577 a worthwhile optimization. */
578 static const bitset_t utf8_sb_map =
579 {
580 /* Set the first 128 bits. */
581 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
582 };
583 #endif
584
585
586 static void
587 free_dfa_content (re_dfa_t *dfa)
588 {
589 int i, j;
590
591 if (dfa->nodes)
592 for (i = 0; i < dfa->nodes_len; ++i)
593 free_token (dfa->nodes + i);
594 re_free (dfa->nexts);
595 for (i = 0; i < dfa->nodes_len; ++i)
596 {
597 if (dfa->eclosures != NULL)
598 re_node_set_free (dfa->eclosures + i);
599 if (dfa->inveclosures != NULL)
600 re_node_set_free (dfa->inveclosures + i);
601 if (dfa->edests != NULL)
602 re_node_set_free (dfa->edests + i);
603 }
604 re_free (dfa->edests);
605 re_free (dfa->eclosures);
606 re_free (dfa->inveclosures);
607 re_free (dfa->nodes);
608
609 if (dfa->state_table)
610 for (i = 0; i <= dfa->state_hash_mask; ++i)
611 {
612 struct re_state_table_entry *entry = dfa->state_table + i;
613 for (j = 0; j < entry->num; ++j)
614 {
615 re_dfastate_t *state = entry->array[j];
616 free_state (state);
617 }
618 re_free (entry->array);
619 }
620 re_free (dfa->state_table);
621 #ifdef RE_ENABLE_I18N
622 if (dfa->sb_char != utf8_sb_map)
623 re_free (dfa->sb_char);
624 #endif
625 re_free (dfa->subexp_map);
626 #ifdef DEBUG
627 re_free (dfa->re_str);
628 #endif
629
630 re_free (dfa);
631 }
632
633
634 /* Free dynamically allocated space used by PREG. */
635
636 void
637 regfree (preg)
638 regex_t *preg;
639 {
640 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
641 if (BE (dfa != NULL, 1))
642 free_dfa_content (dfa);
643 preg->buffer = NULL;
644 preg->allocated = 0;
645
646 re_free (preg->fastmap);
647 preg->fastmap = NULL;
648
649 re_free (preg->translate);
650 preg->translate = NULL;
651 }
652 #ifdef _LIBC
653 weak_alias (__regfree, regfree)
654 #endif
655 \f
656 /* Entry points compatible with 4.2 BSD regex library. We don't define
657 them unless specifically requested. */
658
659 #if defined _REGEX_RE_COMP || defined _LIBC
660
661 /* BSD has one and only one pattern buffer. */
662 static struct re_pattern_buffer re_comp_buf;
663
664 char *
665 # ifdef _LIBC
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667 these names if they don't use our functions, and still use
668 regcomp/regexec above without link errors. */
669 weak_function
670 # endif
671 re_comp (s)
672 const char *s;
673 {
674 reg_errcode_t ret;
675 char *fastmap;
676
677 if (!s)
678 {
679 if (!re_comp_buf.buffer)
680 return gettext ("No previous regular expression");
681 return 0;
682 }
683
684 if (re_comp_buf.buffer)
685 {
686 fastmap = re_comp_buf.fastmap;
687 re_comp_buf.fastmap = NULL;
688 __regfree (&re_comp_buf);
689 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690 re_comp_buf.fastmap = fastmap;
691 }
692
693 if (re_comp_buf.fastmap == NULL)
694 {
695 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
696 if (re_comp_buf.fastmap == NULL)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx[(int) REG_ESPACE]);
699 }
700
701 /* Since `re_exec' always passes NULL for the `regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
703
704 /* Match anchors at newlines. */
705 re_comp_buf.newline_anchor = 1;
706
707 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708
709 if (!ret)
710 return NULL;
711
712 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714 }
715
716 #ifdef _LIBC
717 libc_freeres_fn (free_mem)
718 {
719 __regfree (&re_comp_buf);
720 }
721 #endif
722
723 #endif /* _REGEX_RE_COMP */
724 \f
725 /* Internal entry point.
726 Compile the regular expression PATTERN, whose length is LENGTH.
727 SYNTAX indicate regular expression's syntax. */
728
729 static reg_errcode_t
730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
731 reg_syntax_t syntax)
732 {
733 reg_errcode_t err = REG_NOERROR;
734 re_dfa_t *dfa;
735 re_string_t regexp;
736
737 /* Initialize the pattern buffer. */
738 preg->fastmap_accurate = 0;
739 preg->syntax = syntax;
740 preg->not_bol = preg->not_eol = 0;
741 preg->used = 0;
742 preg->re_nsub = 0;
743 preg->can_be_null = 0;
744 preg->regs_allocated = REGS_UNALLOCATED;
745
746 /* Initialize the dfa. */
747 dfa = (re_dfa_t *) preg->buffer;
748 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749 {
750 /* If zero allocated, but buffer is non-null, try to realloc
751 enough space. This loses if buffer's address is bogus, but
752 that is the user's responsibility. If ->buffer is NULL this
753 is a simple allocation. */
754 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
755 if (dfa == NULL)
756 return REG_ESPACE;
757 preg->allocated = sizeof (re_dfa_t);
758 preg->buffer = (unsigned char *) dfa;
759 }
760 preg->used = sizeof (re_dfa_t);
761
762 err = init_dfa (dfa, length);
763 if (BE (err != REG_NOERROR, 0))
764 {
765 free_dfa_content (dfa);
766 preg->buffer = NULL;
767 preg->allocated = 0;
768 return err;
769 }
770 #ifdef DEBUG
771 /* Note: length+1 will not overflow since it is checked in init_dfa. */
772 dfa->re_str = re_malloc (char, length + 1);
773 strncpy (dfa->re_str, pattern, length + 1);
774 #endif
775
776 __libc_lock_init (dfa->lock);
777
778 err = re_string_construct (&regexp, pattern, length, preg->translate,
779 syntax & RE_ICASE, dfa);
780 if (BE (err != REG_NOERROR, 0))
781 {
782 re_compile_internal_free_return:
783 free_workarea_compile (preg);
784 re_string_destruct (&regexp);
785 free_dfa_content (dfa);
786 preg->buffer = NULL;
787 preg->allocated = 0;
788 return err;
789 }
790
791 /* Parse the regular expression, and build a structure tree. */
792 preg->re_nsub = 0;
793 dfa->str_tree = parse (&regexp, preg, syntax, &err);
794 if (BE (dfa->str_tree == NULL, 0))
795 goto re_compile_internal_free_return;
796
797 /* Analyze the tree and create the nfa. */
798 err = analyze (preg);
799 if (BE (err != REG_NOERROR, 0))
800 goto re_compile_internal_free_return;
801
802 #ifdef RE_ENABLE_I18N
803 /* If possible, do searching in single byte encoding to speed things up. */
804 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
805 optimize_utf8 (dfa);
806 #endif
807
808 /* Then create the initial state of the dfa. */
809 err = create_initial_state (dfa);
810
811 /* Release work areas. */
812 free_workarea_compile (preg);
813 re_string_destruct (&regexp);
814
815 if (BE (err != REG_NOERROR, 0))
816 {
817 free_dfa_content (dfa);
818 preg->buffer = NULL;
819 preg->allocated = 0;
820 }
821
822 return err;
823 }
824
825 /* Initialize DFA. We use the length of the regular expression PAT_LEN
826 as the initial length of some arrays. */
827
828 static reg_errcode_t
829 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 {
831 unsigned int table_size;
832 #ifndef _LIBC
833 char *codeset_name;
834 #endif
835
836 memset (dfa, '\0', sizeof (re_dfa_t));
837
838 /* Force allocation of str_tree_storage the first time. */
839 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
840
841 /* Avoid overflows. */
842 if (pat_len == SIZE_MAX)
843 return REG_ESPACE;
844
845 dfa->nodes_alloc = pat_len + 1;
846 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
847
848 /* table_size = 2 ^ ceil(log pat_len) */
849 for (table_size = 1; ; table_size <<= 1)
850 if (table_size > pat_len)
851 break;
852
853 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
854 dfa->state_hash_mask = table_size - 1;
855
856 dfa->mb_cur_max = MB_CUR_MAX;
857 #ifdef _LIBC
858 if (dfa->mb_cur_max == 6
859 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
860 dfa->is_utf8 = 1;
861 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
862 != 0);
863 #else
864 # ifdef HAVE_LANGINFO_CODESET
865 codeset_name = nl_langinfo (CODESET);
866 # else
867 codeset_name = getenv ("LC_ALL");
868 if (codeset_name == NULL || codeset_name[0] == '\0')
869 codeset_name = getenv ("LC_CTYPE");
870 if (codeset_name == NULL || codeset_name[0] == '\0')
871 codeset_name = getenv ("LANG");
872 if (codeset_name == NULL)
873 codeset_name = "";
874 else if (strchr (codeset_name, '.') != NULL)
875 codeset_name = strchr (codeset_name, '.') + 1;
876 # endif
877
878 if (strcasecmp (codeset_name, "UTF-8") == 0
879 || strcasecmp (codeset_name, "UTF8") == 0)
880 dfa->is_utf8 = 1;
881
882 /* We check exhaustively in the loop below if this charset is a
883 superset of ASCII. */
884 dfa->map_notascii = 0;
885 #endif
886
887 #ifdef RE_ENABLE_I18N
888 if (dfa->mb_cur_max > 1)
889 {
890 if (dfa->is_utf8)
891 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
892 else
893 {
894 int i, j, ch;
895
896 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
897 if (BE (dfa->sb_char == NULL, 0))
898 return REG_ESPACE;
899
900 /* Set the bits corresponding to single byte chars. */
901 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
902 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
903 {
904 wint_t wch = __btowc (ch);
905 if (wch != WEOF)
906 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
907 # ifndef _LIBC
908 if (isascii (ch) && wch != ch)
909 dfa->map_notascii = 1;
910 # endif
911 }
912 }
913 }
914 #endif
915
916 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
917 return REG_ESPACE;
918 return REG_NOERROR;
919 }
920
921 /* Initialize WORD_CHAR table, which indicate which character is
922 "word". In this case "word" means that it is the word construction
923 character used by some operators like "\<", "\>", etc. */
924
925 static void
926 internal_function
927 init_word_char (re_dfa_t *dfa)
928 {
929 dfa->word_ops_used = 1;
930 int i = 0;
931 int ch = 0;
932 if (BE (dfa->map_notascii == 0, 1))
933 {
934 if (sizeof (dfa->word_char[0]) == 8)
935 {
936 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
937 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
938 i = 2;
939 }
940 else if (sizeof (dfa->word_char[0]) == 4)
941 {
942 dfa->word_char[0] = UINT32_C (0x00000000);
943 dfa->word_char[1] = UINT32_C (0x03ff0000);
944 dfa->word_char[2] = UINT32_C (0x87fffffe);
945 dfa->word_char[3] = UINT32_C (0x07fffffe);
946 i = 4;
947 }
948 else
949 abort ();
950 ch = 128;
951
952 if (BE (dfa->is_utf8, 1))
953 {
954 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
955 return;
956 }
957 }
958
959 for (; i < BITSET_WORDS; ++i)
960 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
961 if (isalnum (ch) || ch == '_')
962 dfa->word_char[i] |= (bitset_word_t) 1 << j;
963 }
964
965 /* Free the work area which are only used while compiling. */
966
967 static void
968 free_workarea_compile (regex_t *preg)
969 {
970 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
971 bin_tree_storage_t *storage, *next;
972 for (storage = dfa->str_tree_storage; storage; storage = next)
973 {
974 next = storage->next;
975 re_free (storage);
976 }
977 dfa->str_tree_storage = NULL;
978 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
979 dfa->str_tree = NULL;
980 re_free (dfa->org_indices);
981 dfa->org_indices = NULL;
982 }
983
984 /* Create initial states for all contexts. */
985
986 static reg_errcode_t
987 create_initial_state (re_dfa_t *dfa)
988 {
989 int first, i;
990 reg_errcode_t err;
991 re_node_set init_nodes;
992
993 /* Initial states have the epsilon closure of the node which is
994 the first node of the regular expression. */
995 first = dfa->str_tree->first->node_idx;
996 dfa->init_node = first;
997 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
998 if (BE (err != REG_NOERROR, 0))
999 return err;
1000
1001 /* The back-references which are in initial states can epsilon transit,
1002 since in this case all of the subexpressions can be null.
1003 Then we add epsilon closures of the nodes which are the next nodes of
1004 the back-references. */
1005 if (dfa->nbackref > 0)
1006 for (i = 0; i < init_nodes.nelem; ++i)
1007 {
1008 int node_idx = init_nodes.elems[i];
1009 re_token_type_t type = dfa->nodes[node_idx].type;
1010
1011 int clexp_idx;
1012 if (type != OP_BACK_REF)
1013 continue;
1014 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1015 {
1016 re_token_t *clexp_node;
1017 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1018 if (clexp_node->type == OP_CLOSE_SUBEXP
1019 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1020 break;
1021 }
1022 if (clexp_idx == init_nodes.nelem)
1023 continue;
1024
1025 if (type == OP_BACK_REF)
1026 {
1027 int dest_idx = dfa->edests[node_idx].elems[0];
1028 if (!re_node_set_contains (&init_nodes, dest_idx))
1029 {
1030 reg_errcode_t err = re_node_set_merge (&init_nodes,
1031 dfa->eclosures
1032 + dest_idx);
1033 if (err != REG_NOERROR)
1034 return err;
1035 i = 0;
1036 }
1037 }
1038 }
1039
1040 /* It must be the first time to invoke acquire_state. */
1041 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1042 /* We don't check ERR here, since the initial state must not be NULL. */
1043 if (BE (dfa->init_state == NULL, 0))
1044 return err;
1045 if (dfa->init_state->has_constraint)
1046 {
1047 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1048 CONTEXT_WORD);
1049 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1050 CONTEXT_NEWLINE);
1051 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1052 &init_nodes,
1053 CONTEXT_NEWLINE
1054 | CONTEXT_BEGBUF);
1055 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1056 || dfa->init_state_begbuf == NULL, 0))
1057 return err;
1058 }
1059 else
1060 dfa->init_state_word = dfa->init_state_nl
1061 = dfa->init_state_begbuf = dfa->init_state;
1062
1063 re_node_set_free (&init_nodes);
1064 return REG_NOERROR;
1065 }
1066 \f
1067 #ifdef RE_ENABLE_I18N
1068 /* If it is possible to do searching in single byte encoding instead of UTF-8
1069 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1070 DFA nodes where needed. */
1071
1072 static void
1073 optimize_utf8 (re_dfa_t *dfa)
1074 {
1075 int node, i, mb_chars = 0, has_period = 0;
1076
1077 for (node = 0; node < dfa->nodes_len; ++node)
1078 switch (dfa->nodes[node].type)
1079 {
1080 case CHARACTER:
1081 if (dfa->nodes[node].opr.c >= 0x80)
1082 mb_chars = 1;
1083 break;
1084 case ANCHOR:
1085 switch (dfa->nodes[node].opr.ctx_type)
1086 {
1087 case LINE_FIRST:
1088 case LINE_LAST:
1089 case BUF_FIRST:
1090 case BUF_LAST:
1091 break;
1092 default:
1093 /* Word anchors etc. cannot be handled. It's okay to test
1094 opr.ctx_type since constraints (for all DFA nodes) are
1095 created by ORing one or more opr.ctx_type values. */
1096 return;
1097 }
1098 break;
1099 case OP_PERIOD:
1100 has_period = 1;
1101 break;
1102 case OP_BACK_REF:
1103 case OP_ALT:
1104 case END_OF_RE:
1105 case OP_DUP_ASTERISK:
1106 case OP_OPEN_SUBEXP:
1107 case OP_CLOSE_SUBEXP:
1108 break;
1109 case COMPLEX_BRACKET:
1110 return;
1111 case SIMPLE_BRACKET:
1112 /* Just double check. The non-ASCII range starts at 0x80. */
1113 assert (0x80 % BITSET_WORD_BITS == 0);
1114 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1115 if (dfa->nodes[node].opr.sbcset[i])
1116 return;
1117 break;
1118 default:
1119 abort ();
1120 }
1121
1122 if (mb_chars || has_period)
1123 for (node = 0; node < dfa->nodes_len; ++node)
1124 {
1125 if (dfa->nodes[node].type == CHARACTER
1126 && dfa->nodes[node].opr.c >= 0x80)
1127 dfa->nodes[node].mb_partial = 0;
1128 else if (dfa->nodes[node].type == OP_PERIOD)
1129 dfa->nodes[node].type = OP_UTF8_PERIOD;
1130 }
1131
1132 /* The search can be in single byte locale. */
1133 dfa->mb_cur_max = 1;
1134 dfa->is_utf8 = 0;
1135 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1136 }
1137 #endif
1138 \f
1139 /* Analyze the structure tree, and calculate "first", "next", "edest",
1140 "eclosure", and "inveclosure". */
1141
1142 static reg_errcode_t
1143 analyze (regex_t *preg)
1144 {
1145 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1146 reg_errcode_t ret;
1147
1148 /* Allocate arrays. */
1149 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1150 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1151 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1152 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1153 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1154 || dfa->eclosures == NULL, 0))
1155 return REG_ESPACE;
1156
1157 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1158 if (dfa->subexp_map != NULL)
1159 {
1160 int i;
1161 for (i = 0; i < preg->re_nsub; i++)
1162 dfa->subexp_map[i] = i;
1163 preorder (dfa->str_tree, optimize_subexps, dfa);
1164 for (i = 0; i < preg->re_nsub; i++)
1165 if (dfa->subexp_map[i] != i)
1166 break;
1167 if (i == preg->re_nsub)
1168 {
1169 free (dfa->subexp_map);
1170 dfa->subexp_map = NULL;
1171 }
1172 }
1173
1174 ret = postorder (dfa->str_tree, lower_subexps, preg);
1175 if (BE (ret != REG_NOERROR, 0))
1176 return ret;
1177 ret = postorder (dfa->str_tree, calc_first, dfa);
1178 if (BE (ret != REG_NOERROR, 0))
1179 return ret;
1180 preorder (dfa->str_tree, calc_next, dfa);
1181 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1182 if (BE (ret != REG_NOERROR, 0))
1183 return ret;
1184 ret = calc_eclosure (dfa);
1185 if (BE (ret != REG_NOERROR, 0))
1186 return ret;
1187
1188 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1189 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1190 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1191 || dfa->nbackref)
1192 {
1193 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1194 if (BE (dfa->inveclosures == NULL, 0))
1195 return REG_ESPACE;
1196 ret = calc_inveclosure (dfa);
1197 }
1198
1199 return ret;
1200 }
1201
1202 /* Our parse trees are very unbalanced, so we cannot use a stack to
1203 implement parse tree visits. Instead, we use parent pointers and
1204 some hairy code in these two functions. */
1205 static reg_errcode_t
1206 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1207 void *extra)
1208 {
1209 bin_tree_t *node, *prev;
1210
1211 for (node = root; ; )
1212 {
1213 /* Descend down the tree, preferably to the left (or to the right
1214 if that's the only child). */
1215 while (node->left || node->right)
1216 if (node->left)
1217 node = node->left;
1218 else
1219 node = node->right;
1220
1221 do
1222 {
1223 reg_errcode_t err = fn (extra, node);
1224 if (BE (err != REG_NOERROR, 0))
1225 return err;
1226 if (node->parent == NULL)
1227 return REG_NOERROR;
1228 prev = node;
1229 node = node->parent;
1230 }
1231 /* Go up while we have a node that is reached from the right. */
1232 while (node->right == prev || node->right == NULL);
1233 node = node->right;
1234 }
1235 }
1236
1237 static reg_errcode_t
1238 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1239 void *extra)
1240 {
1241 bin_tree_t *node;
1242
1243 for (node = root; ; )
1244 {
1245 reg_errcode_t err = fn (extra, node);
1246 if (BE (err != REG_NOERROR, 0))
1247 return err;
1248
1249 /* Go to the left node, or up and to the right. */
1250 if (node->left)
1251 node = node->left;
1252 else
1253 {
1254 bin_tree_t *prev = NULL;
1255 while (node->right == prev || node->right == NULL)
1256 {
1257 prev = node;
1258 node = node->parent;
1259 if (!node)
1260 return REG_NOERROR;
1261 }
1262 node = node->right;
1263 }
1264 }
1265 }
1266
1267 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1268 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1269 backreferences as well. Requires a preorder visit. */
1270 static reg_errcode_t
1271 optimize_subexps (void *extra, bin_tree_t *node)
1272 {
1273 re_dfa_t *dfa = (re_dfa_t *) extra;
1274
1275 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1276 {
1277 int idx = node->token.opr.idx;
1278 node->token.opr.idx = dfa->subexp_map[idx];
1279 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1280 }
1281
1282 else if (node->token.type == SUBEXP
1283 && node->left && node->left->token.type == SUBEXP)
1284 {
1285 int other_idx = node->left->token.opr.idx;
1286
1287 node->left = node->left->left;
1288 if (node->left)
1289 node->left->parent = node;
1290
1291 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1292 if (other_idx < BITSET_WORD_BITS)
1293 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1294 }
1295
1296 return REG_NOERROR;
1297 }
1298
1299 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1300 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1301 static reg_errcode_t
1302 lower_subexps (void *extra, bin_tree_t *node)
1303 {
1304 regex_t *preg = (regex_t *) extra;
1305 reg_errcode_t err = REG_NOERROR;
1306
1307 if (node->left && node->left->token.type == SUBEXP)
1308 {
1309 node->left = lower_subexp (&err, preg, node->left);
1310 if (node->left)
1311 node->left->parent = node;
1312 }
1313 if (node->right && node->right->token.type == SUBEXP)
1314 {
1315 node->right = lower_subexp (&err, preg, node->right);
1316 if (node->right)
1317 node->right->parent = node;
1318 }
1319
1320 return err;
1321 }
1322
1323 static bin_tree_t *
1324 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1325 {
1326 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1327 bin_tree_t *body = node->left;
1328 bin_tree_t *op, *cls, *tree1, *tree;
1329
1330 if (preg->no_sub
1331 /* We do not optimize empty subexpressions, because otherwise we may
1332 have bad CONCAT nodes with NULL children. This is obviously not
1333 very common, so we do not lose much. An example that triggers
1334 this case is the sed "script" /\(\)/x. */
1335 && node->left != NULL
1336 && (node->token.opr.idx >= BITSET_WORD_BITS
1337 || !(dfa->used_bkref_map
1338 & ((bitset_word_t) 1 << node->token.opr.idx))))
1339 return node->left;
1340
1341 /* Convert the SUBEXP node to the concatenation of an
1342 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1343 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1344 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1345 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1346 tree = create_tree (dfa, op, tree1, CONCAT);
1347 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1348 {
1349 *err = REG_ESPACE;
1350 return NULL;
1351 }
1352
1353 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1354 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1355 return tree;
1356 }
1357
1358 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1359 nodes. Requires a postorder visit. */
1360 static reg_errcode_t
1361 calc_first (void *extra, bin_tree_t *node)
1362 {
1363 re_dfa_t *dfa = (re_dfa_t *) extra;
1364 if (node->token.type == CONCAT)
1365 {
1366 node->first = node->left->first;
1367 node->node_idx = node->left->node_idx;
1368 }
1369 else
1370 {
1371 node->first = node;
1372 node->node_idx = re_dfa_add_node (dfa, node->token);
1373 if (BE (node->node_idx == -1, 0))
1374 return REG_ESPACE;
1375 if (node->token.type == ANCHOR)
1376 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1377 }
1378 return REG_NOERROR;
1379 }
1380
1381 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1382 static reg_errcode_t
1383 calc_next (void *extra, bin_tree_t *node)
1384 {
1385 switch (node->token.type)
1386 {
1387 case OP_DUP_ASTERISK:
1388 node->left->next = node;
1389 break;
1390 case CONCAT:
1391 node->left->next = node->right->first;
1392 node->right->next = node->next;
1393 break;
1394 default:
1395 if (node->left)
1396 node->left->next = node->next;
1397 if (node->right)
1398 node->right->next = node->next;
1399 break;
1400 }
1401 return REG_NOERROR;
1402 }
1403
1404 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1405 static reg_errcode_t
1406 link_nfa_nodes (void *extra, bin_tree_t *node)
1407 {
1408 re_dfa_t *dfa = (re_dfa_t *) extra;
1409 int idx = node->node_idx;
1410 reg_errcode_t err = REG_NOERROR;
1411
1412 switch (node->token.type)
1413 {
1414 case CONCAT:
1415 break;
1416
1417 case END_OF_RE:
1418 assert (node->next == NULL);
1419 break;
1420
1421 case OP_DUP_ASTERISK:
1422 case OP_ALT:
1423 {
1424 int left, right;
1425 dfa->has_plural_match = 1;
1426 if (node->left != NULL)
1427 left = node->left->first->node_idx;
1428 else
1429 left = node->next->node_idx;
1430 if (node->right != NULL)
1431 right = node->right->first->node_idx;
1432 else
1433 right = node->next->node_idx;
1434 assert (left > -1);
1435 assert (right > -1);
1436 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1437 }
1438 break;
1439
1440 case ANCHOR:
1441 case OP_OPEN_SUBEXP:
1442 case OP_CLOSE_SUBEXP:
1443 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1444 break;
1445
1446 case OP_BACK_REF:
1447 dfa->nexts[idx] = node->next->node_idx;
1448 if (node->token.type == OP_BACK_REF)
1449 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1450 break;
1451
1452 default:
1453 assert (!IS_EPSILON_NODE (node->token.type));
1454 dfa->nexts[idx] = node->next->node_idx;
1455 break;
1456 }
1457
1458 return err;
1459 }
1460
1461 /* Duplicate the epsilon closure of the node ROOT_NODE.
1462 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1463 to their own constraint. */
1464
1465 static reg_errcode_t
1466 internal_function
1467 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1468 int root_node, unsigned int init_constraint)
1469 {
1470 int org_node, clone_node, ret;
1471 unsigned int constraint = init_constraint;
1472 for (org_node = top_org_node, clone_node = top_clone_node;;)
1473 {
1474 int org_dest, clone_dest;
1475 if (dfa->nodes[org_node].type == OP_BACK_REF)
1476 {
1477 /* If the back reference epsilon-transit, its destination must
1478 also have the constraint. Then duplicate the epsilon closure
1479 of the destination of the back reference, and store it in
1480 edests of the back reference. */
1481 org_dest = dfa->nexts[org_node];
1482 re_node_set_empty (dfa->edests + clone_node);
1483 clone_dest = duplicate_node (dfa, org_dest, constraint);
1484 if (BE (clone_dest == -1, 0))
1485 return REG_ESPACE;
1486 dfa->nexts[clone_node] = dfa->nexts[org_node];
1487 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488 if (BE (ret < 0, 0))
1489 return REG_ESPACE;
1490 }
1491 else if (dfa->edests[org_node].nelem == 0)
1492 {
1493 /* In case of the node can't epsilon-transit, don't duplicate the
1494 destination and store the original destination as the
1495 destination of the node. */
1496 dfa->nexts[clone_node] = dfa->nexts[org_node];
1497 break;
1498 }
1499 else if (dfa->edests[org_node].nelem == 1)
1500 {
1501 /* In case of the node can epsilon-transit, and it has only one
1502 destination. */
1503 org_dest = dfa->edests[org_node].elems[0];
1504 re_node_set_empty (dfa->edests + clone_node);
1505 /* If the node is root_node itself, it means the epsilon clsoure
1506 has a loop. Then tie it to the destination of the root_node. */
1507 if (org_node == root_node && clone_node != org_node)
1508 {
1509 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1510 if (BE (ret < 0, 0))
1511 return REG_ESPACE;
1512 break;
1513 }
1514 /* In case of the node has another constraint, add it. */
1515 constraint |= dfa->nodes[org_node].constraint;
1516 clone_dest = duplicate_node (dfa, org_dest, constraint);
1517 if (BE (clone_dest == -1, 0))
1518 return REG_ESPACE;
1519 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520 if (BE (ret < 0, 0))
1521 return REG_ESPACE;
1522 }
1523 else /* dfa->edests[org_node].nelem == 2 */
1524 {
1525 /* In case of the node can epsilon-transit, and it has two
1526 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1527 org_dest = dfa->edests[org_node].elems[0];
1528 re_node_set_empty (dfa->edests + clone_node);
1529 /* Search for a duplicated node which satisfies the constraint. */
1530 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1531 if (clone_dest == -1)
1532 {
1533 /* There is no such duplicated node, create a new one. */
1534 reg_errcode_t err;
1535 clone_dest = duplicate_node (dfa, org_dest, constraint);
1536 if (BE (clone_dest == -1, 0))
1537 return REG_ESPACE;
1538 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1539 if (BE (ret < 0, 0))
1540 return REG_ESPACE;
1541 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1542 root_node, constraint);
1543 if (BE (err != REG_NOERROR, 0))
1544 return err;
1545 }
1546 else
1547 {
1548 /* There is a duplicated node which satisfies the constraint,
1549 use it to avoid infinite loop. */
1550 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1551 if (BE (ret < 0, 0))
1552 return REG_ESPACE;
1553 }
1554
1555 org_dest = dfa->edests[org_node].elems[1];
1556 clone_dest = duplicate_node (dfa, org_dest, constraint);
1557 if (BE (clone_dest == -1, 0))
1558 return REG_ESPACE;
1559 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 if (BE (ret < 0, 0))
1561 return REG_ESPACE;
1562 }
1563 org_node = org_dest;
1564 clone_node = clone_dest;
1565 }
1566 return REG_NOERROR;
1567 }
1568
1569 /* Search for a node which is duplicated from the node ORG_NODE, and
1570 satisfies the constraint CONSTRAINT. */
1571
1572 static int
1573 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1574 unsigned int constraint)
1575 {
1576 int idx;
1577 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1578 {
1579 if (org_node == dfa->org_indices[idx]
1580 && constraint == dfa->nodes[idx].constraint)
1581 return idx; /* Found. */
1582 }
1583 return -1; /* Not found. */
1584 }
1585
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587 Return the index of the new node, or -1 if insufficient storage is
1588 available. */
1589
1590 static int
1591 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1592 {
1593 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1594 if (BE (dup_idx != -1, 1))
1595 {
1596 dfa->nodes[dup_idx].constraint = constraint;
1597 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1598 dfa->nodes[dup_idx].duplicated = 1;
1599
1600 /* Store the index of the original node. */
1601 dfa->org_indices[dup_idx] = org_idx;
1602 }
1603 return dup_idx;
1604 }
1605
1606 static reg_errcode_t
1607 calc_inveclosure (re_dfa_t *dfa)
1608 {
1609 int src, idx, ret;
1610 for (idx = 0; idx < dfa->nodes_len; ++idx)
1611 re_node_set_init_empty (dfa->inveclosures + idx);
1612
1613 for (src = 0; src < dfa->nodes_len; ++src)
1614 {
1615 int *elems = dfa->eclosures[src].elems;
1616 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1617 {
1618 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1619 if (BE (ret == -1, 0))
1620 return REG_ESPACE;
1621 }
1622 }
1623
1624 return REG_NOERROR;
1625 }
1626
1627 /* Calculate "eclosure" for all the node in DFA. */
1628
1629 static reg_errcode_t
1630 calc_eclosure (re_dfa_t *dfa)
1631 {
1632 int node_idx, incomplete;
1633 #ifdef DEBUG
1634 assert (dfa->nodes_len > 0);
1635 #endif
1636 incomplete = 0;
1637 /* For each nodes, calculate epsilon closure. */
1638 for (node_idx = 0; ; ++node_idx)
1639 {
1640 reg_errcode_t err;
1641 re_node_set eclosure_elem;
1642 if (node_idx == dfa->nodes_len)
1643 {
1644 if (!incomplete)
1645 break;
1646 incomplete = 0;
1647 node_idx = 0;
1648 }
1649
1650 #ifdef DEBUG
1651 assert (dfa->eclosures[node_idx].nelem != -1);
1652 #endif
1653
1654 /* If we have already calculated, skip it. */
1655 if (dfa->eclosures[node_idx].nelem != 0)
1656 continue;
1657 /* Calculate epsilon closure of `node_idx'. */
1658 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1659 if (BE (err != REG_NOERROR, 0))
1660 return err;
1661
1662 if (dfa->eclosures[node_idx].nelem == 0)
1663 {
1664 incomplete = 1;
1665 re_node_set_free (&eclosure_elem);
1666 }
1667 }
1668 return REG_NOERROR;
1669 }
1670
1671 /* Calculate epsilon closure of NODE. */
1672
1673 static reg_errcode_t
1674 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1675 {
1676 reg_errcode_t err;
1677 int i;
1678 re_node_set eclosure;
1679 int ret;
1680 int incomplete = 0;
1681 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1682 if (BE (err != REG_NOERROR, 0))
1683 return err;
1684
1685 /* This indicates that we are calculating this node now.
1686 We reference this value to avoid infinite loop. */
1687 dfa->eclosures[node].nelem = -1;
1688
1689 /* If the current node has constraints, duplicate all nodes
1690 since they must inherit the constraints. */
1691 if (dfa->nodes[node].constraint
1692 && dfa->edests[node].nelem
1693 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1694 {
1695 err = duplicate_node_closure (dfa, node, node, node,
1696 dfa->nodes[node].constraint);
1697 if (BE (err != REG_NOERROR, 0))
1698 return err;
1699 }
1700
1701 /* Expand each epsilon destination nodes. */
1702 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1703 for (i = 0; i < dfa->edests[node].nelem; ++i)
1704 {
1705 re_node_set eclosure_elem;
1706 int edest = dfa->edests[node].elems[i];
1707 /* If calculating the epsilon closure of `edest' is in progress,
1708 return intermediate result. */
1709 if (dfa->eclosures[edest].nelem == -1)
1710 {
1711 incomplete = 1;
1712 continue;
1713 }
1714 /* If we haven't calculated the epsilon closure of `edest' yet,
1715 calculate now. Otherwise use calculated epsilon closure. */
1716 if (dfa->eclosures[edest].nelem == 0)
1717 {
1718 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1719 if (BE (err != REG_NOERROR, 0))
1720 return err;
1721 }
1722 else
1723 eclosure_elem = dfa->eclosures[edest];
1724 /* Merge the epsilon closure of `edest'. */
1725 err = re_node_set_merge (&eclosure, &eclosure_elem);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1728 /* If the epsilon closure of `edest' is incomplete,
1729 the epsilon closure of this node is also incomplete. */
1730 if (dfa->eclosures[edest].nelem == 0)
1731 {
1732 incomplete = 1;
1733 re_node_set_free (&eclosure_elem);
1734 }
1735 }
1736
1737 /* An epsilon closure includes itself. */
1738 ret = re_node_set_insert (&eclosure, node);
1739 if (BE (ret < 0, 0))
1740 return REG_ESPACE;
1741 if (incomplete && !root)
1742 dfa->eclosures[node].nelem = 0;
1743 else
1744 dfa->eclosures[node] = eclosure;
1745 *new_set = eclosure;
1746 return REG_NOERROR;
1747 }
1748 \f
1749 /* Functions for token which are used in the parser. */
1750
1751 /* Fetch a token from INPUT.
1752 We must not use this function inside bracket expressions. */
1753
1754 static void
1755 internal_function
1756 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1757 {
1758 re_string_skip_bytes (input, peek_token (result, input, syntax));
1759 }
1760
1761 /* Peek a token from INPUT, and return the length of the token.
1762 We must not use this function inside bracket expressions. */
1763
1764 static int
1765 internal_function
1766 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1767 {
1768 unsigned char c;
1769
1770 if (re_string_eoi (input))
1771 {
1772 token->type = END_OF_RE;
1773 return 0;
1774 }
1775
1776 c = re_string_peek_byte (input, 0);
1777 token->opr.c = c;
1778
1779 token->word_char = 0;
1780 #ifdef RE_ENABLE_I18N
1781 token->mb_partial = 0;
1782 if (input->mb_cur_max > 1 &&
1783 !re_string_first_byte (input, re_string_cur_idx (input)))
1784 {
1785 token->type = CHARACTER;
1786 token->mb_partial = 1;
1787 return 1;
1788 }
1789 #endif
1790 if (c == '\\')
1791 {
1792 unsigned char c2;
1793 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1794 {
1795 token->type = BACK_SLASH;
1796 return 1;
1797 }
1798
1799 c2 = re_string_peek_byte_case (input, 1);
1800 token->opr.c = c2;
1801 token->type = CHARACTER;
1802 #ifdef RE_ENABLE_I18N
1803 if (input->mb_cur_max > 1)
1804 {
1805 wint_t wc = re_string_wchar_at (input,
1806 re_string_cur_idx (input) + 1);
1807 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1808 }
1809 else
1810 #endif
1811 token->word_char = IS_WORD_CHAR (c2) != 0;
1812
1813 switch (c2)
1814 {
1815 case '|':
1816 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1817 token->type = OP_ALT;
1818 break;
1819 case '1': case '2': case '3': case '4': case '5':
1820 case '6': case '7': case '8': case '9':
1821 if (!(syntax & RE_NO_BK_REFS))
1822 {
1823 token->type = OP_BACK_REF;
1824 token->opr.idx = c2 - '1';
1825 }
1826 break;
1827 case '<':
1828 if (!(syntax & RE_NO_GNU_OPS))
1829 {
1830 token->type = ANCHOR;
1831 token->opr.ctx_type = WORD_FIRST;
1832 }
1833 break;
1834 case '>':
1835 if (!(syntax & RE_NO_GNU_OPS))
1836 {
1837 token->type = ANCHOR;
1838 token->opr.ctx_type = WORD_LAST;
1839 }
1840 break;
1841 case 'b':
1842 if (!(syntax & RE_NO_GNU_OPS))
1843 {
1844 token->type = ANCHOR;
1845 token->opr.ctx_type = WORD_DELIM;
1846 }
1847 break;
1848 case 'B':
1849 if (!(syntax & RE_NO_GNU_OPS))
1850 {
1851 token->type = ANCHOR;
1852 token->opr.ctx_type = NOT_WORD_DELIM;
1853 }
1854 break;
1855 case 'w':
1856 if (!(syntax & RE_NO_GNU_OPS))
1857 token->type = OP_WORD;
1858 break;
1859 case 'W':
1860 if (!(syntax & RE_NO_GNU_OPS))
1861 token->type = OP_NOTWORD;
1862 break;
1863 case 's':
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = OP_SPACE;
1866 break;
1867 case 'S':
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_NOTSPACE;
1870 break;
1871 case '`':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 {
1874 token->type = ANCHOR;
1875 token->opr.ctx_type = BUF_FIRST;
1876 }
1877 break;
1878 case '\'':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 {
1881 token->type = ANCHOR;
1882 token->opr.ctx_type = BUF_LAST;
1883 }
1884 break;
1885 case '(':
1886 if (!(syntax & RE_NO_BK_PARENS))
1887 token->type = OP_OPEN_SUBEXP;
1888 break;
1889 case ')':
1890 if (!(syntax & RE_NO_BK_PARENS))
1891 token->type = OP_CLOSE_SUBEXP;
1892 break;
1893 case '+':
1894 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1895 token->type = OP_DUP_PLUS;
1896 break;
1897 case '?':
1898 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1899 token->type = OP_DUP_QUESTION;
1900 break;
1901 case '{':
1902 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1903 token->type = OP_OPEN_DUP_NUM;
1904 break;
1905 case '}':
1906 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1907 token->type = OP_CLOSE_DUP_NUM;
1908 break;
1909 default:
1910 break;
1911 }
1912 return 2;
1913 }
1914
1915 token->type = CHARACTER;
1916 #ifdef RE_ENABLE_I18N
1917 if (input->mb_cur_max > 1)
1918 {
1919 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1920 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1921 }
1922 else
1923 #endif
1924 token->word_char = IS_WORD_CHAR (token->opr.c);
1925
1926 switch (c)
1927 {
1928 case '\n':
1929 if (syntax & RE_NEWLINE_ALT)
1930 token->type = OP_ALT;
1931 break;
1932 case '|':
1933 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1934 token->type = OP_ALT;
1935 break;
1936 case '*':
1937 token->type = OP_DUP_ASTERISK;
1938 break;
1939 case '+':
1940 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1941 token->type = OP_DUP_PLUS;
1942 break;
1943 case '?':
1944 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1945 token->type = OP_DUP_QUESTION;
1946 break;
1947 case '{':
1948 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1949 token->type = OP_OPEN_DUP_NUM;
1950 break;
1951 case '}':
1952 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1953 token->type = OP_CLOSE_DUP_NUM;
1954 break;
1955 case '(':
1956 if (syntax & RE_NO_BK_PARENS)
1957 token->type = OP_OPEN_SUBEXP;
1958 break;
1959 case ')':
1960 if (syntax & RE_NO_BK_PARENS)
1961 token->type = OP_CLOSE_SUBEXP;
1962 break;
1963 case '[':
1964 token->type = OP_OPEN_BRACKET;
1965 break;
1966 case '.':
1967 token->type = OP_PERIOD;
1968 break;
1969 case '^':
1970 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1971 re_string_cur_idx (input) != 0)
1972 {
1973 char prev = re_string_peek_byte (input, -1);
1974 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1975 break;
1976 }
1977 token->type = ANCHOR;
1978 token->opr.ctx_type = LINE_FIRST;
1979 break;
1980 case '$':
1981 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1982 re_string_cur_idx (input) + 1 != re_string_length (input))
1983 {
1984 re_token_t next;
1985 re_string_skip_bytes (input, 1);
1986 peek_token (&next, input, syntax);
1987 re_string_skip_bytes (input, -1);
1988 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1989 break;
1990 }
1991 token->type = ANCHOR;
1992 token->opr.ctx_type = LINE_LAST;
1993 break;
1994 default:
1995 break;
1996 }
1997 return 1;
1998 }
1999
2000 /* Peek a token from INPUT, and return the length of the token.
2001 We must not use this function out of bracket expressions. */
2002
2003 static int
2004 internal_function
2005 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2006 {
2007 unsigned char c;
2008 if (re_string_eoi (input))
2009 {
2010 token->type = END_OF_RE;
2011 return 0;
2012 }
2013 c = re_string_peek_byte (input, 0);
2014 token->opr.c = c;
2015
2016 #ifdef RE_ENABLE_I18N
2017 if (input->mb_cur_max > 1 &&
2018 !re_string_first_byte (input, re_string_cur_idx (input)))
2019 {
2020 token->type = CHARACTER;
2021 return 1;
2022 }
2023 #endif /* RE_ENABLE_I18N */
2024
2025 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2026 && re_string_cur_idx (input) + 1 < re_string_length (input))
2027 {
2028 /* In this case, '\' escape a character. */
2029 unsigned char c2;
2030 re_string_skip_bytes (input, 1);
2031 c2 = re_string_peek_byte (input, 0);
2032 token->opr.c = c2;
2033 token->type = CHARACTER;
2034 return 1;
2035 }
2036 if (c == '[') /* '[' is a special char in a bracket exps. */
2037 {
2038 unsigned char c2;
2039 int token_len;
2040 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2041 c2 = re_string_peek_byte (input, 1);
2042 else
2043 c2 = 0;
2044 token->opr.c = c2;
2045 token_len = 2;
2046 switch (c2)
2047 {
2048 case '.':
2049 token->type = OP_OPEN_COLL_ELEM;
2050 break;
2051 case '=':
2052 token->type = OP_OPEN_EQUIV_CLASS;
2053 break;
2054 case ':':
2055 if (syntax & RE_CHAR_CLASSES)
2056 {
2057 token->type = OP_OPEN_CHAR_CLASS;
2058 break;
2059 }
2060 /* else fall through. */
2061 default:
2062 token->type = CHARACTER;
2063 token->opr.c = c;
2064 token_len = 1;
2065 break;
2066 }
2067 return token_len;
2068 }
2069 switch (c)
2070 {
2071 case '-':
2072 token->type = OP_CHARSET_RANGE;
2073 break;
2074 case ']':
2075 token->type = OP_CLOSE_BRACKET;
2076 break;
2077 case '^':
2078 token->type = OP_NON_MATCH_LIST;
2079 break;
2080 default:
2081 token->type = CHARACTER;
2082 }
2083 return 1;
2084 }
2085 \f
2086 /* Functions for parser. */
2087
2088 /* Entry point of the parser.
2089 Parse the regular expression REGEXP and return the structure tree.
2090 If an error is occured, ERR is set by error code, and return NULL.
2091 This function build the following tree, from regular expression <reg_exp>:
2092 CAT
2093 / \
2094 / \
2095 <reg_exp> EOR
2096
2097 CAT means concatenation.
2098 EOR means end of regular expression. */
2099
2100 static bin_tree_t *
2101 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2102 reg_errcode_t *err)
2103 {
2104 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2105 bin_tree_t *tree, *eor, *root;
2106 re_token_t current_token;
2107 dfa->syntax = syntax;
2108 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2109 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2110 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2111 return NULL;
2112 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2113 if (tree != NULL)
2114 root = create_tree (dfa, tree, eor, CONCAT);
2115 else
2116 root = eor;
2117 if (BE (eor == NULL || root == NULL, 0))
2118 {
2119 *err = REG_ESPACE;
2120 return NULL;
2121 }
2122 return root;
2123 }
2124
2125 /* This function build the following tree, from regular expression
2126 <branch1>|<branch2>:
2127 ALT
2128 / \
2129 / \
2130 <branch1> <branch2>
2131
2132 ALT means alternative, which represents the operator `|'. */
2133
2134 static bin_tree_t *
2135 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2136 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2137 {
2138 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2139 bin_tree_t *tree, *branch = NULL;
2140 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2141 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2142 return NULL;
2143
2144 while (token->type == OP_ALT)
2145 {
2146 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147 if (token->type != OP_ALT && token->type != END_OF_RE
2148 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2149 {
2150 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2151 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2152 return NULL;
2153 }
2154 else
2155 branch = NULL;
2156 tree = create_tree (dfa, tree, branch, OP_ALT);
2157 if (BE (tree == NULL, 0))
2158 {
2159 *err = REG_ESPACE;
2160 return NULL;
2161 }
2162 }
2163 return tree;
2164 }
2165
2166 /* This function build the following tree, from regular expression
2167 <exp1><exp2>:
2168 CAT
2169 / \
2170 / \
2171 <exp1> <exp2>
2172
2173 CAT means concatenation. */
2174
2175 static bin_tree_t *
2176 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2177 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2178 {
2179 bin_tree_t *tree, *exp;
2180 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2181 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2182 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183 return NULL;
2184
2185 while (token->type != OP_ALT && token->type != END_OF_RE
2186 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187 {
2188 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2189 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2190 {
2191 if (tree != NULL)
2192 postorder (tree, free_tree, NULL);
2193 return NULL;
2194 }
2195 if (tree != NULL && exp != NULL)
2196 {
2197 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2198 if (newtree == NULL)
2199 {
2200 postorder (exp, free_tree, NULL);
2201 postorder (tree, free_tree, NULL);
2202 *err = REG_ESPACE;
2203 return NULL;
2204 }
2205 tree = newtree;
2206 }
2207 else if (tree == NULL)
2208 tree = exp;
2209 /* Otherwise exp == NULL, we don't need to create new tree. */
2210 }
2211 return tree;
2212 }
2213
2214 /* This function build the following tree, from regular expression a*:
2215 *
2216 |
2217 a
2218 */
2219
2220 static bin_tree_t *
2221 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2223 {
2224 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2225 bin_tree_t *tree;
2226 switch (token->type)
2227 {
2228 case CHARACTER:
2229 tree = create_token_tree (dfa, NULL, NULL, token);
2230 if (BE (tree == NULL, 0))
2231 {
2232 *err = REG_ESPACE;
2233 return NULL;
2234 }
2235 #ifdef RE_ENABLE_I18N
2236 if (dfa->mb_cur_max > 1)
2237 {
2238 while (!re_string_eoi (regexp)
2239 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2240 {
2241 bin_tree_t *mbc_remain;
2242 fetch_token (token, regexp, syntax);
2243 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2244 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2245 if (BE (mbc_remain == NULL || tree == NULL, 0))
2246 {
2247 *err = REG_ESPACE;
2248 return NULL;
2249 }
2250 }
2251 }
2252 #endif
2253 break;
2254 case OP_OPEN_SUBEXP:
2255 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2256 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257 return NULL;
2258 break;
2259 case OP_OPEN_BRACKET:
2260 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2261 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 return NULL;
2263 break;
2264 case OP_BACK_REF:
2265 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2266 {
2267 *err = REG_ESUBREG;
2268 return NULL;
2269 }
2270 dfa->used_bkref_map |= 1 << token->opr.idx;
2271 tree = create_token_tree (dfa, NULL, NULL, token);
2272 if (BE (tree == NULL, 0))
2273 {
2274 *err = REG_ESPACE;
2275 return NULL;
2276 }
2277 ++dfa->nbackref;
2278 dfa->has_mb_node = 1;
2279 break;
2280 case OP_OPEN_DUP_NUM:
2281 if (syntax & RE_CONTEXT_INVALID_DUP)
2282 {
2283 *err = REG_BADRPT;
2284 return NULL;
2285 }
2286 /* FALLTHROUGH */
2287 case OP_DUP_ASTERISK:
2288 case OP_DUP_PLUS:
2289 case OP_DUP_QUESTION:
2290 if (syntax & RE_CONTEXT_INVALID_OPS)
2291 {
2292 *err = REG_BADRPT;
2293 return NULL;
2294 }
2295 else if (syntax & RE_CONTEXT_INDEP_OPS)
2296 {
2297 fetch_token (token, regexp, syntax);
2298 return parse_expression (regexp, preg, token, syntax, nest, err);
2299 }
2300 /* else fall through */
2301 case OP_CLOSE_SUBEXP:
2302 if ((token->type == OP_CLOSE_SUBEXP) &&
2303 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2304 {
2305 *err = REG_ERPAREN;
2306 return NULL;
2307 }
2308 /* else fall through */
2309 case OP_CLOSE_DUP_NUM:
2310 /* We treat it as a normal character. */
2311
2312 /* Then we can these characters as normal characters. */
2313 token->type = CHARACTER;
2314 /* mb_partial and word_char bits should be initialized already
2315 by peek_token. */
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (BE (tree == NULL, 0))
2318 {
2319 *err = REG_ESPACE;
2320 return NULL;
2321 }
2322 break;
2323 case ANCHOR:
2324 if ((token->opr.ctx_type
2325 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2326 && dfa->word_ops_used == 0)
2327 init_word_char (dfa);
2328 if (token->opr.ctx_type == WORD_DELIM
2329 || token->opr.ctx_type == NOT_WORD_DELIM)
2330 {
2331 bin_tree_t *tree_first, *tree_last;
2332 if (token->opr.ctx_type == WORD_DELIM)
2333 {
2334 token->opr.ctx_type = WORD_FIRST;
2335 tree_first = create_token_tree (dfa, NULL, NULL, token);
2336 token->opr.ctx_type = WORD_LAST;
2337 }
2338 else
2339 {
2340 token->opr.ctx_type = INSIDE_WORD;
2341 tree_first = create_token_tree (dfa, NULL, NULL, token);
2342 token->opr.ctx_type = INSIDE_NOTWORD;
2343 }
2344 tree_last = create_token_tree (dfa, NULL, NULL, token);
2345 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2346 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2347 {
2348 *err = REG_ESPACE;
2349 return NULL;
2350 }
2351 }
2352 else
2353 {
2354 tree = create_token_tree (dfa, NULL, NULL, token);
2355 if (BE (tree == NULL, 0))
2356 {
2357 *err = REG_ESPACE;
2358 return NULL;
2359 }
2360 }
2361 /* We must return here, since ANCHORs can't be followed
2362 by repetition operators.
2363 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2364 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2365 fetch_token (token, regexp, syntax);
2366 return tree;
2367 case OP_PERIOD:
2368 tree = create_token_tree (dfa, NULL, NULL, token);
2369 if (BE (tree == NULL, 0))
2370 {
2371 *err = REG_ESPACE;
2372 return NULL;
2373 }
2374 if (dfa->mb_cur_max > 1)
2375 dfa->has_mb_node = 1;
2376 break;
2377 case OP_WORD:
2378 case OP_NOTWORD:
2379 tree = build_charclass_op (dfa, regexp->trans,
2380 (const unsigned char *) "alnum",
2381 (const unsigned char *) "_",
2382 token->type == OP_NOTWORD, err);
2383 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2384 return NULL;
2385 break;
2386 case OP_SPACE:
2387 case OP_NOTSPACE:
2388 tree = build_charclass_op (dfa, regexp->trans,
2389 (const unsigned char *) "space",
2390 (const unsigned char *) "",
2391 token->type == OP_NOTSPACE, err);
2392 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2393 return NULL;
2394 break;
2395 case OP_ALT:
2396 case END_OF_RE:
2397 return NULL;
2398 case BACK_SLASH:
2399 *err = REG_EESCAPE;
2400 return NULL;
2401 default:
2402 /* Must not happen? */
2403 #ifdef DEBUG
2404 assert (0);
2405 #endif
2406 return NULL;
2407 }
2408 fetch_token (token, regexp, syntax);
2409
2410 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2411 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2412 {
2413 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2414 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2415 return NULL;
2416 /* In BRE consecutive duplications are not allowed. */
2417 if ((syntax & RE_CONTEXT_INVALID_DUP)
2418 && (token->type == OP_DUP_ASTERISK
2419 || token->type == OP_OPEN_DUP_NUM))
2420 {
2421 *err = REG_BADRPT;
2422 return NULL;
2423 }
2424 }
2425
2426 return tree;
2427 }
2428
2429 /* This function build the following tree, from regular expression
2430 (<reg_exp>):
2431 SUBEXP
2432 |
2433 <reg_exp>
2434 */
2435
2436 static bin_tree_t *
2437 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2438 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2439 {
2440 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2441 bin_tree_t *tree;
2442 size_t cur_nsub;
2443 cur_nsub = preg->re_nsub++;
2444
2445 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2446
2447 /* The subexpression may be a null string. */
2448 if (token->type == OP_CLOSE_SUBEXP)
2449 tree = NULL;
2450 else
2451 {
2452 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2453 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2454 {
2455 if (tree != NULL)
2456 postorder (tree, free_tree, NULL);
2457 *err = REG_EPAREN;
2458 }
2459 if (BE (*err != REG_NOERROR, 0))
2460 return NULL;
2461 }
2462
2463 if (cur_nsub <= '9' - '1')
2464 dfa->completed_bkref_map |= 1 << cur_nsub;
2465
2466 tree = create_tree (dfa, tree, NULL, SUBEXP);
2467 if (BE (tree == NULL, 0))
2468 {
2469 *err = REG_ESPACE;
2470 return NULL;
2471 }
2472 tree->token.opr.idx = cur_nsub;
2473 return tree;
2474 }
2475
2476 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2477
2478 static bin_tree_t *
2479 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2480 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2481 {
2482 bin_tree_t *tree = NULL, *old_tree = NULL;
2483 int i, start, end, start_idx = re_string_cur_idx (regexp);
2484 re_token_t start_token = *token;
2485
2486 if (token->type == OP_OPEN_DUP_NUM)
2487 {
2488 end = 0;
2489 start = fetch_number (regexp, token, syntax);
2490 if (start == -1)
2491 {
2492 if (token->type == CHARACTER && token->opr.c == ',')
2493 start = 0; /* We treat "{,m}" as "{0,m}". */
2494 else
2495 {
2496 *err = REG_BADBR; /* <re>{} is invalid. */
2497 return NULL;
2498 }
2499 }
2500 if (BE (start != -2, 1))
2501 {
2502 /* We treat "{n}" as "{n,n}". */
2503 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2504 : ((token->type == CHARACTER && token->opr.c == ',')
2505 ? fetch_number (regexp, token, syntax) : -2));
2506 }
2507 if (BE (start == -2 || end == -2, 0))
2508 {
2509 /* Invalid sequence. */
2510 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2511 {
2512 if (token->type == END_OF_RE)
2513 *err = REG_EBRACE;
2514 else
2515 *err = REG_BADBR;
2516
2517 return NULL;
2518 }
2519
2520 /* If the syntax bit is set, rollback. */
2521 re_string_set_index (regexp, start_idx);
2522 *token = start_token;
2523 token->type = CHARACTER;
2524 /* mb_partial and word_char bits should be already initialized by
2525 peek_token. */
2526 return elem;
2527 }
2528
2529 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2530 {
2531 /* First number greater than second. */
2532 *err = REG_BADBR;
2533 return NULL;
2534 }
2535 }
2536 else
2537 {
2538 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2539 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2540 }
2541
2542 fetch_token (token, regexp, syntax);
2543
2544 if (BE (elem == NULL, 0))
2545 return NULL;
2546 if (BE (start == 0 && end == 0, 0))
2547 {
2548 postorder (elem, free_tree, NULL);
2549 return NULL;
2550 }
2551
2552 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2553 if (BE (start > 0, 0))
2554 {
2555 tree = elem;
2556 for (i = 2; i <= start; ++i)
2557 {
2558 elem = duplicate_tree (elem, dfa);
2559 tree = create_tree (dfa, tree, elem, CONCAT);
2560 if (BE (elem == NULL || tree == NULL, 0))
2561 goto parse_dup_op_espace;
2562 }
2563
2564 if (start == end)
2565 return tree;
2566
2567 /* Duplicate ELEM before it is marked optional. */
2568 elem = duplicate_tree (elem, dfa);
2569 old_tree = tree;
2570 }
2571 else
2572 old_tree = NULL;
2573
2574 if (elem->token.type == SUBEXP)
2575 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2576
2577 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2578 if (BE (tree == NULL, 0))
2579 goto parse_dup_op_espace;
2580
2581 /* This loop is actually executed only when end != -1,
2582 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2583 already created the start+1-th copy. */
2584 for (i = start + 2; i <= end; ++i)
2585 {
2586 elem = duplicate_tree (elem, dfa);
2587 tree = create_tree (dfa, tree, elem, CONCAT);
2588 if (BE (elem == NULL || tree == NULL, 0))
2589 goto parse_dup_op_espace;
2590
2591 tree = create_tree (dfa, tree, NULL, OP_ALT);
2592 if (BE (tree == NULL, 0))
2593 goto parse_dup_op_espace;
2594 }
2595
2596 if (old_tree)
2597 tree = create_tree (dfa, old_tree, tree, CONCAT);
2598
2599 return tree;
2600
2601 parse_dup_op_espace:
2602 *err = REG_ESPACE;
2603 return NULL;
2604 }
2605
2606 /* Size of the names for collating symbol/equivalence_class/character_class.
2607 I'm not sure, but maybe enough. */
2608 #define BRACKET_NAME_BUF_SIZE 32
2609
2610 #ifndef _LIBC
2611 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2612 Build the range expression which starts from START_ELEM, and ends
2613 at END_ELEM. The result are written to MBCSET and SBCSET.
2614 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2615 mbcset->range_ends, is a pointer argument sinse we may
2616 update it. */
2617
2618 static reg_errcode_t
2619 internal_function
2620 # ifdef RE_ENABLE_I18N
2621 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2622 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2623 # else /* not RE_ENABLE_I18N */
2624 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2625 bracket_elem_t *end_elem)
2626 # endif /* not RE_ENABLE_I18N */
2627 {
2628 unsigned int start_ch, end_ch;
2629 /* Equivalence Classes and Character Classes can't be a range start/end. */
2630 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2631 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2632 0))
2633 return REG_ERANGE;
2634
2635 /* We can handle no multi character collating elements without libc
2636 support. */
2637 if (BE ((start_elem->type == COLL_SYM
2638 && strlen ((char *) start_elem->opr.name) > 1)
2639 || (end_elem->type == COLL_SYM
2640 && strlen ((char *) end_elem->opr.name) > 1), 0))
2641 return REG_ECOLLATE;
2642
2643 # ifdef RE_ENABLE_I18N
2644 {
2645 wchar_t wc;
2646 wint_t start_wc;
2647 wint_t end_wc;
2648 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2649
2650 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2651 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2652 : 0));
2653 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2654 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2655 : 0));
2656 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2657 ? __btowc (start_ch) : start_elem->opr.wch);
2658 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2659 ? __btowc (end_ch) : end_elem->opr.wch);
2660 if (start_wc == WEOF || end_wc == WEOF)
2661 return REG_ECOLLATE;
2662 cmp_buf[0] = start_wc;
2663 cmp_buf[4] = end_wc;
2664 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2665 return REG_ERANGE;
2666
2667 /* Got valid collation sequence values, add them as a new entry.
2668 However, for !_LIBC we have no collation elements: if the
2669 character set is single byte, the single byte character set
2670 that we build below suffices. parse_bracket_exp passes
2671 no MBCSET if dfa->mb_cur_max == 1. */
2672 if (mbcset)
2673 {
2674 /* Check the space of the arrays. */
2675 if (BE (*range_alloc == mbcset->nranges, 0))
2676 {
2677 /* There is not enough space, need realloc. */
2678 wchar_t *new_array_start, *new_array_end;
2679 int new_nranges;
2680
2681 /* +1 in case of mbcset->nranges is 0. */
2682 new_nranges = 2 * mbcset->nranges + 1;
2683 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2684 are NULL if *range_alloc == 0. */
2685 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2686 new_nranges);
2687 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2688 new_nranges);
2689
2690 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2691 return REG_ESPACE;
2692
2693 mbcset->range_starts = new_array_start;
2694 mbcset->range_ends = new_array_end;
2695 *range_alloc = new_nranges;
2696 }
2697
2698 mbcset->range_starts[mbcset->nranges] = start_wc;
2699 mbcset->range_ends[mbcset->nranges++] = end_wc;
2700 }
2701
2702 /* Build the table for single byte characters. */
2703 for (wc = 0; wc < SBC_MAX; ++wc)
2704 {
2705 cmp_buf[2] = wc;
2706 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2707 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2708 bitset_set (sbcset, wc);
2709 }
2710 }
2711 # else /* not RE_ENABLE_I18N */
2712 {
2713 unsigned int ch;
2714 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2715 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2716 : 0));
2717 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2718 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2719 : 0));
2720 if (start_ch > end_ch)
2721 return REG_ERANGE;
2722 /* Build the table for single byte characters. */
2723 for (ch = 0; ch < SBC_MAX; ++ch)
2724 if (start_ch <= ch && ch <= end_ch)
2725 bitset_set (sbcset, ch);
2726 }
2727 # endif /* not RE_ENABLE_I18N */
2728 return REG_NOERROR;
2729 }
2730 #endif /* not _LIBC */
2731
2732 #ifndef _LIBC
2733 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2734 Build the collating element which is represented by NAME.
2735 The result are written to MBCSET and SBCSET.
2736 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2737 pointer argument since we may update it. */
2738
2739 static reg_errcode_t
2740 internal_function
2741 # ifdef RE_ENABLE_I18N
2742 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2743 int *coll_sym_alloc, const unsigned char *name)
2744 # else /* not RE_ENABLE_I18N */
2745 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2746 # endif /* not RE_ENABLE_I18N */
2747 {
2748 size_t name_len = strlen ((const char *) name);
2749 if (BE (name_len != 1, 0))
2750 return REG_ECOLLATE;
2751 else
2752 {
2753 bitset_set (sbcset, name[0]);
2754 return REG_NOERROR;
2755 }
2756 }
2757 #endif /* not _LIBC */
2758
2759 /* This function parse bracket expression like "[abc]", "[a-c]",
2760 "[[.a-a.]]" etc. */
2761
2762 static bin_tree_t *
2763 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2764 reg_syntax_t syntax, reg_errcode_t *err)
2765 {
2766 #ifdef _LIBC
2767 const unsigned char *collseqmb;
2768 const char *collseqwc;
2769 uint32_t nrules;
2770 int32_t table_size;
2771 const int32_t *symb_table;
2772 const unsigned char *extra;
2773
2774 /* Local function for parse_bracket_exp used in _LIBC environement.
2775 Seek the collating symbol entry correspondings to NAME.
2776 Return the index of the symbol in the SYMB_TABLE. */
2777
2778 auto inline int32_t
2779 __attribute ((always_inline))
2780 seek_collating_symbol_entry (name, name_len)
2781 const unsigned char *name;
2782 size_t name_len;
2783 {
2784 int32_t hash = elem_hash ((const char *) name, name_len);
2785 int32_t elem = hash % table_size;
2786 if (symb_table[2 * elem] != 0)
2787 {
2788 int32_t second = hash % (table_size - 2) + 1;
2789
2790 do
2791 {
2792 /* First compare the hashing value. */
2793 if (symb_table[2 * elem] == hash
2794 /* Compare the length of the name. */
2795 && name_len == extra[symb_table[2 * elem + 1]]
2796 /* Compare the name. */
2797 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2798 name_len) == 0)
2799 {
2800 /* Yep, this is the entry. */
2801 break;
2802 }
2803
2804 /* Next entry. */
2805 elem += second;
2806 }
2807 while (symb_table[2 * elem] != 0);
2808 }
2809 return elem;
2810 }
2811
2812 /* Local function for parse_bracket_exp used in _LIBC environment.
2813 Look up the collation sequence value of BR_ELEM.
2814 Return the value if succeeded, UINT_MAX otherwise. */
2815
2816 auto inline unsigned int
2817 __attribute ((always_inline))
2818 lookup_collation_sequence_value (br_elem)
2819 bracket_elem_t *br_elem;
2820 {
2821 if (br_elem->type == SB_CHAR)
2822 {
2823 /*
2824 if (MB_CUR_MAX == 1)
2825 */
2826 if (nrules == 0)
2827 return collseqmb[br_elem->opr.ch];
2828 else
2829 {
2830 wint_t wc = __btowc (br_elem->opr.ch);
2831 return __collseq_table_lookup (collseqwc, wc);
2832 }
2833 }
2834 else if (br_elem->type == MB_CHAR)
2835 {
2836 if (nrules != 0)
2837 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2838 }
2839 else if (br_elem->type == COLL_SYM)
2840 {
2841 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2842 if (nrules != 0)
2843 {
2844 int32_t elem, idx;
2845 elem = seek_collating_symbol_entry (br_elem->opr.name,
2846 sym_name_len);
2847 if (symb_table[2 * elem] != 0)
2848 {
2849 /* We found the entry. */
2850 idx = symb_table[2 * elem + 1];
2851 /* Skip the name of collating element name. */
2852 idx += 1 + extra[idx];
2853 /* Skip the byte sequence of the collating element. */
2854 idx += 1 + extra[idx];
2855 /* Adjust for the alignment. */
2856 idx = (idx + 3) & ~3;
2857 /* Skip the multibyte collation sequence value. */
2858 idx += sizeof (unsigned int);
2859 /* Skip the wide char sequence of the collating element. */
2860 idx += sizeof (unsigned int) *
2861 (1 + *(unsigned int *) (extra + idx));
2862 /* Return the collation sequence value. */
2863 return *(unsigned int *) (extra + idx);
2864 }
2865 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2866 {
2867 /* No valid character. Match it as a single byte
2868 character. */
2869 return collseqmb[br_elem->opr.name[0]];
2870 }
2871 }
2872 else if (sym_name_len == 1)
2873 return collseqmb[br_elem->opr.name[0]];
2874 }
2875 return UINT_MAX;
2876 }
2877
2878 /* Local function for parse_bracket_exp used in _LIBC environement.
2879 Build the range expression which starts from START_ELEM, and ends
2880 at END_ELEM. The result are written to MBCSET and SBCSET.
2881 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2882 mbcset->range_ends, is a pointer argument sinse we may
2883 update it. */
2884
2885 auto inline reg_errcode_t
2886 __attribute ((always_inline))
2887 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2888 re_charset_t *mbcset;
2889 int *range_alloc;
2890 bitset_t sbcset;
2891 bracket_elem_t *start_elem, *end_elem;
2892 {
2893 unsigned int ch;
2894 uint32_t start_collseq;
2895 uint32_t end_collseq;
2896
2897 /* Equivalence Classes and Character Classes can't be a range
2898 start/end. */
2899 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2900 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2901 0))
2902 return REG_ERANGE;
2903
2904 start_collseq = lookup_collation_sequence_value (start_elem);
2905 end_collseq = lookup_collation_sequence_value (end_elem);
2906 /* Check start/end collation sequence values. */
2907 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2908 return REG_ECOLLATE;
2909 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2910 return REG_ERANGE;
2911
2912 /* Got valid collation sequence values, add them as a new entry.
2913 However, if we have no collation elements, and the character set
2914 is single byte, the single byte character set that we
2915 build below suffices. */
2916 if (nrules > 0 || dfa->mb_cur_max > 1)
2917 {
2918 /* Check the space of the arrays. */
2919 if (BE (*range_alloc == mbcset->nranges, 0))
2920 {
2921 /* There is not enough space, need realloc. */
2922 uint32_t *new_array_start;
2923 uint32_t *new_array_end;
2924 int new_nranges;
2925
2926 /* +1 in case of mbcset->nranges is 0. */
2927 new_nranges = 2 * mbcset->nranges + 1;
2928 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2929 new_nranges);
2930 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2931 new_nranges);
2932
2933 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2934 return REG_ESPACE;
2935
2936 mbcset->range_starts = new_array_start;
2937 mbcset->range_ends = new_array_end;
2938 *range_alloc = new_nranges;
2939 }
2940
2941 mbcset->range_starts[mbcset->nranges] = start_collseq;
2942 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2943 }
2944
2945 /* Build the table for single byte characters. */
2946 for (ch = 0; ch < SBC_MAX; ch++)
2947 {
2948 uint32_t ch_collseq;
2949 /*
2950 if (MB_CUR_MAX == 1)
2951 */
2952 if (nrules == 0)
2953 ch_collseq = collseqmb[ch];
2954 else
2955 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2956 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2957 bitset_set (sbcset, ch);
2958 }
2959 return REG_NOERROR;
2960 }
2961
2962 /* Local function for parse_bracket_exp used in _LIBC environement.
2963 Build the collating element which is represented by NAME.
2964 The result are written to MBCSET and SBCSET.
2965 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2966 pointer argument sinse we may update it. */
2967
2968 auto inline reg_errcode_t
2969 __attribute ((always_inline))
2970 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2971 re_charset_t *mbcset;
2972 int *coll_sym_alloc;
2973 bitset_t sbcset;
2974 const unsigned char *name;
2975 {
2976 int32_t elem, idx;
2977 size_t name_len = strlen ((const char *) name);
2978 if (nrules != 0)
2979 {
2980 elem = seek_collating_symbol_entry (name, name_len);
2981 if (symb_table[2 * elem] != 0)
2982 {
2983 /* We found the entry. */
2984 idx = symb_table[2 * elem + 1];
2985 /* Skip the name of collating element name. */
2986 idx += 1 + extra[idx];
2987 }
2988 else if (symb_table[2 * elem] == 0 && name_len == 1)
2989 {
2990 /* No valid character, treat it as a normal
2991 character. */
2992 bitset_set (sbcset, name[0]);
2993 return REG_NOERROR;
2994 }
2995 else
2996 return REG_ECOLLATE;
2997
2998 /* Got valid collation sequence, add it as a new entry. */
2999 /* Check the space of the arrays. */
3000 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3001 {
3002 /* Not enough, realloc it. */
3003 /* +1 in case of mbcset->ncoll_syms is 0. */
3004 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3005 /* Use realloc since mbcset->coll_syms is NULL
3006 if *alloc == 0. */
3007 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3008 new_coll_sym_alloc);
3009 if (BE (new_coll_syms == NULL, 0))
3010 return REG_ESPACE;
3011 mbcset->coll_syms = new_coll_syms;
3012 *coll_sym_alloc = new_coll_sym_alloc;
3013 }
3014 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3015 return REG_NOERROR;
3016 }
3017 else
3018 {
3019 if (BE (name_len != 1, 0))
3020 return REG_ECOLLATE;
3021 else
3022 {
3023 bitset_set (sbcset, name[0]);
3024 return REG_NOERROR;
3025 }
3026 }
3027 }
3028 #endif
3029
3030 re_token_t br_token;
3031 re_bitset_ptr_t sbcset;
3032 #ifdef RE_ENABLE_I18N
3033 re_charset_t *mbcset;
3034 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3035 int equiv_class_alloc = 0, char_class_alloc = 0;
3036 #endif /* not RE_ENABLE_I18N */
3037 int non_match = 0;
3038 bin_tree_t *work_tree;
3039 int token_len;
3040 int first_round = 1;
3041 #ifdef _LIBC
3042 collseqmb = (const unsigned char *)
3043 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3044 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3045 if (nrules)
3046 {
3047 /*
3048 if (MB_CUR_MAX > 1)
3049 */
3050 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3051 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3052 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3053 _NL_COLLATE_SYMB_TABLEMB);
3054 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055 _NL_COLLATE_SYMB_EXTRAMB);
3056 }
3057 #endif
3058 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059 #ifdef RE_ENABLE_I18N
3060 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3061 #endif /* RE_ENABLE_I18N */
3062 #ifdef RE_ENABLE_I18N
3063 if (BE (sbcset == NULL || mbcset == NULL, 0))
3064 #else
3065 if (BE (sbcset == NULL, 0))
3066 #endif /* RE_ENABLE_I18N */
3067 {
3068 re_free (sbcset);
3069 #ifdef RE_ENABLE_I18N
3070 re_free (mbcset);
3071 #endif
3072 *err = REG_ESPACE;
3073 return NULL;
3074 }
3075
3076 token_len = peek_token_bracket (token, regexp, syntax);
3077 if (BE (token->type == END_OF_RE, 0))
3078 {
3079 *err = REG_BADPAT;
3080 goto parse_bracket_exp_free_return;
3081 }
3082 if (token->type == OP_NON_MATCH_LIST)
3083 {
3084 #ifdef RE_ENABLE_I18N
3085 mbcset->non_match = 1;
3086 #endif /* not RE_ENABLE_I18N */
3087 non_match = 1;
3088 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3089 bitset_set (sbcset, '\n');
3090 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3091 token_len = peek_token_bracket (token, regexp, syntax);
3092 if (BE (token->type == END_OF_RE, 0))
3093 {
3094 *err = REG_BADPAT;
3095 goto parse_bracket_exp_free_return;
3096 }
3097 }
3098
3099 /* We treat the first ']' as a normal character. */
3100 if (token->type == OP_CLOSE_BRACKET)
3101 token->type = CHARACTER;
3102
3103 while (1)
3104 {
3105 bracket_elem_t start_elem, end_elem;
3106 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3107 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3108 reg_errcode_t ret;
3109 int token_len2 = 0, is_range_exp = 0;
3110 re_token_t token2;
3111
3112 start_elem.opr.name = start_name_buf;
3113 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3114 syntax, first_round);
3115 if (BE (ret != REG_NOERROR, 0))
3116 {
3117 *err = ret;
3118 goto parse_bracket_exp_free_return;
3119 }
3120 first_round = 0;
3121
3122 /* Get information about the next token. We need it in any case. */
3123 token_len = peek_token_bracket (token, regexp, syntax);
3124
3125 /* Do not check for ranges if we know they are not allowed. */
3126 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3127 {
3128 if (BE (token->type == END_OF_RE, 0))
3129 {
3130 *err = REG_EBRACK;
3131 goto parse_bracket_exp_free_return;
3132 }
3133 if (token->type == OP_CHARSET_RANGE)
3134 {
3135 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3136 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3137 if (BE (token2.type == END_OF_RE, 0))
3138 {
3139 *err = REG_EBRACK;
3140 goto parse_bracket_exp_free_return;
3141 }
3142 if (token2.type == OP_CLOSE_BRACKET)
3143 {
3144 /* We treat the last '-' as a normal character. */
3145 re_string_skip_bytes (regexp, -token_len);
3146 token->type = CHARACTER;
3147 }
3148 else
3149 is_range_exp = 1;
3150 }
3151 }
3152
3153 if (is_range_exp == 1)
3154 {
3155 end_elem.opr.name = end_name_buf;
3156 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3157 dfa, syntax, 1);
3158 if (BE (ret != REG_NOERROR, 0))
3159 {
3160 *err = ret;
3161 goto parse_bracket_exp_free_return;
3162 }
3163
3164 token_len = peek_token_bracket (token, regexp, syntax);
3165
3166 #ifdef _LIBC
3167 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3168 &start_elem, &end_elem);
3169 #else
3170 # ifdef RE_ENABLE_I18N
3171 *err = build_range_exp (sbcset,
3172 dfa->mb_cur_max > 1 ? mbcset : NULL,
3173 &range_alloc, &start_elem, &end_elem);
3174 # else
3175 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3176 # endif
3177 #endif /* RE_ENABLE_I18N */
3178 if (BE (*err != REG_NOERROR, 0))
3179 goto parse_bracket_exp_free_return;
3180 }
3181 else
3182 {
3183 switch (start_elem.type)
3184 {
3185 case SB_CHAR:
3186 bitset_set (sbcset, start_elem.opr.ch);
3187 break;
3188 #ifdef RE_ENABLE_I18N
3189 case MB_CHAR:
3190 /* Check whether the array has enough space. */
3191 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3192 {
3193 wchar_t *new_mbchars;
3194 /* Not enough, realloc it. */
3195 /* +1 in case of mbcset->nmbchars is 0. */
3196 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3197 /* Use realloc since array is NULL if *alloc == 0. */
3198 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3199 mbchar_alloc);
3200 if (BE (new_mbchars == NULL, 0))
3201 goto parse_bracket_exp_espace;
3202 mbcset->mbchars = new_mbchars;
3203 }
3204 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3205 break;
3206 #endif /* RE_ENABLE_I18N */
3207 case EQUIV_CLASS:
3208 *err = build_equiv_class (sbcset,
3209 #ifdef RE_ENABLE_I18N
3210 mbcset, &equiv_class_alloc,
3211 #endif /* RE_ENABLE_I18N */
3212 start_elem.opr.name);
3213 if (BE (*err != REG_NOERROR, 0))
3214 goto parse_bracket_exp_free_return;
3215 break;
3216 case COLL_SYM:
3217 *err = build_collating_symbol (sbcset,
3218 #ifdef RE_ENABLE_I18N
3219 mbcset, &coll_sym_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221 start_elem.opr.name);
3222 if (BE (*err != REG_NOERROR, 0))
3223 goto parse_bracket_exp_free_return;
3224 break;
3225 case CHAR_CLASS:
3226 *err = build_charclass (regexp->trans, sbcset,
3227 #ifdef RE_ENABLE_I18N
3228 mbcset, &char_class_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230 start_elem.opr.name, syntax);
3231 if (BE (*err != REG_NOERROR, 0))
3232 goto parse_bracket_exp_free_return;
3233 break;
3234 default:
3235 assert (0);
3236 break;
3237 }
3238 }
3239 if (BE (token->type == END_OF_RE, 0))
3240 {
3241 *err = REG_EBRACK;
3242 goto parse_bracket_exp_free_return;
3243 }
3244 if (token->type == OP_CLOSE_BRACKET)
3245 break;
3246 }
3247
3248 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3249
3250 /* If it is non-matching list. */
3251 if (non_match)
3252 bitset_not (sbcset);
3253
3254 #ifdef RE_ENABLE_I18N
3255 /* Ensure only single byte characters are set. */
3256 if (dfa->mb_cur_max > 1)
3257 bitset_mask (sbcset, dfa->sb_char);
3258
3259 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3260 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3261 || mbcset->non_match)))
3262 {
3263 bin_tree_t *mbc_tree;
3264 int sbc_idx;
3265 /* Build a tree for complex bracket. */
3266 dfa->has_mb_node = 1;
3267 br_token.type = COMPLEX_BRACKET;
3268 br_token.opr.mbcset = mbcset;
3269 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3270 if (BE (mbc_tree == NULL, 0))
3271 goto parse_bracket_exp_espace;
3272 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3273 if (sbcset[sbc_idx])
3274 break;
3275 /* If there are no bits set in sbcset, there is no point
3276 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3277 if (sbc_idx < BITSET_WORDS)
3278 {
3279 /* Build a tree for simple bracket. */
3280 br_token.type = SIMPLE_BRACKET;
3281 br_token.opr.sbcset = sbcset;
3282 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3283 if (BE (work_tree == NULL, 0))
3284 goto parse_bracket_exp_espace;
3285
3286 /* Then join them by ALT node. */
3287 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3288 if (BE (work_tree == NULL, 0))
3289 goto parse_bracket_exp_espace;
3290 }
3291 else
3292 {
3293 re_free (sbcset);
3294 work_tree = mbc_tree;
3295 }
3296 }
3297 else
3298 #endif /* not RE_ENABLE_I18N */
3299 {
3300 #ifdef RE_ENABLE_I18N
3301 free_charset (mbcset);
3302 #endif
3303 /* Build a tree for simple bracket. */
3304 br_token.type = SIMPLE_BRACKET;
3305 br_token.opr.sbcset = sbcset;
3306 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3307 if (BE (work_tree == NULL, 0))
3308 goto parse_bracket_exp_espace;
3309 }
3310 return work_tree;
3311
3312 parse_bracket_exp_espace:
3313 *err = REG_ESPACE;
3314 parse_bracket_exp_free_return:
3315 re_free (sbcset);
3316 #ifdef RE_ENABLE_I18N
3317 free_charset (mbcset);
3318 #endif /* RE_ENABLE_I18N */
3319 return NULL;
3320 }
3321
3322 /* Parse an element in the bracket expression. */
3323
3324 static reg_errcode_t
3325 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3326 re_token_t *token, int token_len, re_dfa_t *dfa,
3327 reg_syntax_t syntax, int accept_hyphen)
3328 {
3329 #ifdef RE_ENABLE_I18N
3330 int cur_char_size;
3331 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3332 if (cur_char_size > 1)
3333 {
3334 elem->type = MB_CHAR;
3335 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3336 re_string_skip_bytes (regexp, cur_char_size);
3337 return REG_NOERROR;
3338 }
3339 #endif /* RE_ENABLE_I18N */
3340 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3341 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3342 || token->type == OP_OPEN_EQUIV_CLASS)
3343 return parse_bracket_symbol (elem, regexp, token);
3344 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3345 {
3346 /* A '-' must only appear as anything but a range indicator before
3347 the closing bracket. Everything else is an error. */
3348 re_token_t token2;
3349 (void) peek_token_bracket (&token2, regexp, syntax);
3350 if (token2.type != OP_CLOSE_BRACKET)
3351 /* The actual error value is not standardized since this whole
3352 case is undefined. But ERANGE makes good sense. */
3353 return REG_ERANGE;
3354 }
3355 elem->type = SB_CHAR;
3356 elem->opr.ch = token->opr.c;
3357 return REG_NOERROR;
3358 }
3359
3360 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3361 such as [:<character_class>:], [.<collating_element>.], and
3362 [=<equivalent_class>=]. */
3363
3364 static reg_errcode_t
3365 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3366 re_token_t *token)
3367 {
3368 unsigned char ch, delim = token->opr.c;
3369 int i = 0;
3370 if (re_string_eoi(regexp))
3371 return REG_EBRACK;
3372 for (;; ++i)
3373 {
3374 if (i >= BRACKET_NAME_BUF_SIZE)
3375 return REG_EBRACK;
3376 if (token->type == OP_OPEN_CHAR_CLASS)
3377 ch = re_string_fetch_byte_case (regexp);
3378 else
3379 ch = re_string_fetch_byte (regexp);
3380 if (re_string_eoi(regexp))
3381 return REG_EBRACK;
3382 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3383 break;
3384 elem->opr.name[i] = ch;
3385 }
3386 re_string_skip_bytes (regexp, 1);
3387 elem->opr.name[i] = '\0';
3388 switch (token->type)
3389 {
3390 case OP_OPEN_COLL_ELEM:
3391 elem->type = COLL_SYM;
3392 break;
3393 case OP_OPEN_EQUIV_CLASS:
3394 elem->type = EQUIV_CLASS;
3395 break;
3396 case OP_OPEN_CHAR_CLASS:
3397 elem->type = CHAR_CLASS;
3398 break;
3399 default:
3400 break;
3401 }
3402 return REG_NOERROR;
3403 }
3404
3405 /* Helper function for parse_bracket_exp.
3406 Build the equivalence class which is represented by NAME.
3407 The result are written to MBCSET and SBCSET.
3408 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3409 is a pointer argument sinse we may update it. */
3410
3411 static reg_errcode_t
3412 #ifdef RE_ENABLE_I18N
3413 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3414 int *equiv_class_alloc, const unsigned char *name)
3415 #else /* not RE_ENABLE_I18N */
3416 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3417 #endif /* not RE_ENABLE_I18N */
3418 {
3419 #ifdef _LIBC
3420 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3421 if (nrules != 0)
3422 {
3423 const int32_t *table, *indirect;
3424 const unsigned char *weights, *extra, *cp;
3425 unsigned char char_buf[2];
3426 int32_t idx1, idx2;
3427 unsigned int ch;
3428 size_t len;
3429 /* This #include defines a local function! */
3430 # include <locale/weight.h>
3431 /* Calculate the index for equivalence class. */
3432 cp = name;
3433 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3434 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3435 _NL_COLLATE_WEIGHTMB);
3436 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3437 _NL_COLLATE_EXTRAMB);
3438 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3439 _NL_COLLATE_INDIRECTMB);
3440 idx1 = findidx (&cp, -1);
3441 if (BE (idx1 == 0 || *cp != '\0', 0))
3442 /* This isn't a valid character. */
3443 return REG_ECOLLATE;
3444
3445 /* Build single byte matcing table for this equivalence class. */
3446 len = weights[idx1 & 0xffffff];
3447 for (ch = 0; ch < SBC_MAX; ++ch)
3448 {
3449 char_buf[0] = ch;
3450 cp = char_buf;
3451 idx2 = findidx (&cp, 1);
3452 /*
3453 idx2 = table[ch];
3454 */
3455 if (idx2 == 0)
3456 /* This isn't a valid character. */
3457 continue;
3458 /* Compare only if the length matches and the collation rule
3459 index is the same. */
3460 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3461 {
3462 int cnt = 0;
3463
3464 while (cnt <= len &&
3465 weights[(idx1 & 0xffffff) + 1 + cnt]
3466 == weights[(idx2 & 0xffffff) + 1 + cnt])
3467 ++cnt;
3468
3469 if (cnt > len)
3470 bitset_set (sbcset, ch);
3471 }
3472 }
3473 /* Check whether the array has enough space. */
3474 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3475 {
3476 /* Not enough, realloc it. */
3477 /* +1 in case of mbcset->nequiv_classes is 0. */
3478 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3479 /* Use realloc since the array is NULL if *alloc == 0. */
3480 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3481 int32_t,
3482 new_equiv_class_alloc);
3483 if (BE (new_equiv_classes == NULL, 0))
3484 return REG_ESPACE;
3485 mbcset->equiv_classes = new_equiv_classes;
3486 *equiv_class_alloc = new_equiv_class_alloc;
3487 }
3488 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3489 }
3490 else
3491 #endif /* _LIBC */
3492 {
3493 if (BE (strlen ((const char *) name) != 1, 0))
3494 return REG_ECOLLATE;
3495 bitset_set (sbcset, *name);
3496 }
3497 return REG_NOERROR;
3498 }
3499
3500 /* Helper function for parse_bracket_exp.
3501 Build the character class which is represented by NAME.
3502 The result are written to MBCSET and SBCSET.
3503 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3504 is a pointer argument sinse we may update it. */
3505
3506 static reg_errcode_t
3507 #ifdef RE_ENABLE_I18N
3508 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3509 re_charset_t *mbcset, int *char_class_alloc,
3510 const unsigned char *class_name, reg_syntax_t syntax)
3511 #else /* not RE_ENABLE_I18N */
3512 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3513 const unsigned char *class_name, reg_syntax_t syntax)
3514 #endif /* not RE_ENABLE_I18N */
3515 {
3516 int i;
3517 const char *name = (const char *) class_name;
3518
3519 /* In case of REG_ICASE "upper" and "lower" match the both of
3520 upper and lower cases. */
3521 if ((syntax & RE_ICASE)
3522 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3523 name = "alpha";
3524
3525 #ifdef RE_ENABLE_I18N
3526 /* Check the space of the arrays. */
3527 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3528 {
3529 /* Not enough, realloc it. */
3530 /* +1 in case of mbcset->nchar_classes is 0. */
3531 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3532 /* Use realloc since array is NULL if *alloc == 0. */
3533 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3534 new_char_class_alloc);
3535 if (BE (new_char_classes == NULL, 0))
3536 return REG_ESPACE;
3537 mbcset->char_classes = new_char_classes;
3538 *char_class_alloc = new_char_class_alloc;
3539 }
3540 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3541 #endif /* RE_ENABLE_I18N */
3542
3543 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3544 do { \
3545 if (BE (trans != NULL, 0)) \
3546 { \
3547 for (i = 0; i < SBC_MAX; ++i) \
3548 if (ctype_func (i)) \
3549 bitset_set (sbcset, trans[i]); \
3550 } \
3551 else \
3552 { \
3553 for (i = 0; i < SBC_MAX; ++i) \
3554 if (ctype_func (i)) \
3555 bitset_set (sbcset, i); \
3556 } \
3557 } while (0)
3558
3559 if (strcmp (name, "alnum") == 0)
3560 BUILD_CHARCLASS_LOOP (isalnum);
3561 else if (strcmp (name, "cntrl") == 0)
3562 BUILD_CHARCLASS_LOOP (iscntrl);
3563 else if (strcmp (name, "lower") == 0)
3564 BUILD_CHARCLASS_LOOP (islower);
3565 else if (strcmp (name, "space") == 0)
3566 BUILD_CHARCLASS_LOOP (isspace);
3567 else if (strcmp (name, "alpha") == 0)
3568 BUILD_CHARCLASS_LOOP (isalpha);
3569 else if (strcmp (name, "digit") == 0)
3570 BUILD_CHARCLASS_LOOP (isdigit);
3571 else if (strcmp (name, "print") == 0)
3572 BUILD_CHARCLASS_LOOP (isprint);
3573 else if (strcmp (name, "upper") == 0)
3574 BUILD_CHARCLASS_LOOP (isupper);
3575 else if (strcmp (name, "blank") == 0)
3576 BUILD_CHARCLASS_LOOP (isblank);
3577 else if (strcmp (name, "graph") == 0)
3578 BUILD_CHARCLASS_LOOP (isgraph);
3579 else if (strcmp (name, "punct") == 0)
3580 BUILD_CHARCLASS_LOOP (ispunct);
3581 else if (strcmp (name, "xdigit") == 0)
3582 BUILD_CHARCLASS_LOOP (isxdigit);
3583 else
3584 return REG_ECTYPE;
3585
3586 return REG_NOERROR;
3587 }
3588
3589 static bin_tree_t *
3590 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3591 const unsigned char *class_name,
3592 const unsigned char *extra, int non_match,
3593 reg_errcode_t *err)
3594 {
3595 re_bitset_ptr_t sbcset;
3596 #ifdef RE_ENABLE_I18N
3597 re_charset_t *mbcset;
3598 int alloc = 0;
3599 #endif /* not RE_ENABLE_I18N */
3600 reg_errcode_t ret;
3601 re_token_t br_token;
3602 bin_tree_t *tree;
3603
3604 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3605 #ifdef RE_ENABLE_I18N
3606 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3607 #endif /* RE_ENABLE_I18N */
3608
3609 #ifdef RE_ENABLE_I18N
3610 if (BE (sbcset == NULL || mbcset == NULL, 0))
3611 #else /* not RE_ENABLE_I18N */
3612 if (BE (sbcset == NULL, 0))
3613 #endif /* not RE_ENABLE_I18N */
3614 {
3615 *err = REG_ESPACE;
3616 return NULL;
3617 }
3618
3619 if (non_match)
3620 {
3621 #ifdef RE_ENABLE_I18N
3622 mbcset->non_match = 1;
3623 #endif /* not RE_ENABLE_I18N */
3624 }
3625
3626 /* We don't care the syntax in this case. */
3627 ret = build_charclass (trans, sbcset,
3628 #ifdef RE_ENABLE_I18N
3629 mbcset, &alloc,
3630 #endif /* RE_ENABLE_I18N */
3631 class_name, 0);
3632
3633 if (BE (ret != REG_NOERROR, 0))
3634 {
3635 re_free (sbcset);
3636 #ifdef RE_ENABLE_I18N
3637 free_charset (mbcset);
3638 #endif /* RE_ENABLE_I18N */
3639 *err = ret;
3640 return NULL;
3641 }
3642 /* \w match '_' also. */
3643 for (; *extra; extra++)
3644 bitset_set (sbcset, *extra);
3645
3646 /* If it is non-matching list. */
3647 if (non_match)
3648 bitset_not (sbcset);
3649
3650 #ifdef RE_ENABLE_I18N
3651 /* Ensure only single byte characters are set. */
3652 if (dfa->mb_cur_max > 1)
3653 bitset_mask (sbcset, dfa->sb_char);
3654 #endif
3655
3656 /* Build a tree for simple bracket. */
3657 br_token.type = SIMPLE_BRACKET;
3658 br_token.opr.sbcset = sbcset;
3659 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3660 if (BE (tree == NULL, 0))
3661 goto build_word_op_espace;
3662
3663 #ifdef RE_ENABLE_I18N
3664 if (dfa->mb_cur_max > 1)
3665 {
3666 bin_tree_t *mbc_tree;
3667 /* Build a tree for complex bracket. */
3668 br_token.type = COMPLEX_BRACKET;
3669 br_token.opr.mbcset = mbcset;
3670 dfa->has_mb_node = 1;
3671 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3672 if (BE (mbc_tree == NULL, 0))
3673 goto build_word_op_espace;
3674 /* Then join them by ALT node. */
3675 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3676 if (BE (mbc_tree != NULL, 1))
3677 return tree;
3678 }
3679 else
3680 {
3681 free_charset (mbcset);
3682 return tree;
3683 }
3684 #else /* not RE_ENABLE_I18N */
3685 return tree;
3686 #endif /* not RE_ENABLE_I18N */
3687
3688 build_word_op_espace:
3689 re_free (sbcset);
3690 #ifdef RE_ENABLE_I18N
3691 free_charset (mbcset);
3692 #endif /* RE_ENABLE_I18N */
3693 *err = REG_ESPACE;
3694 return NULL;
3695 }
3696
3697 /* This is intended for the expressions like "a{1,3}".
3698 Fetch a number from `input', and return the number.
3699 Return -1, if the number field is empty like "{,1}".
3700 Return -2, If an error is occured. */
3701
3702 static int
3703 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3704 {
3705 int num = -1;
3706 unsigned char c;
3707 while (1)
3708 {
3709 fetch_token (token, input, syntax);
3710 c = token->opr.c;
3711 if (BE (token->type == END_OF_RE, 0))
3712 return -2;
3713 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3714 break;
3715 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3716 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3717 num = (num > RE_DUP_MAX) ? -2 : num;
3718 }
3719 return num;
3720 }
3721 \f
3722 #ifdef RE_ENABLE_I18N
3723 static void
3724 free_charset (re_charset_t *cset)
3725 {
3726 re_free (cset->mbchars);
3727 # ifdef _LIBC
3728 re_free (cset->coll_syms);
3729 re_free (cset->equiv_classes);
3730 re_free (cset->range_starts);
3731 re_free (cset->range_ends);
3732 # endif
3733 re_free (cset->char_classes);
3734 re_free (cset);
3735 }
3736 #endif /* RE_ENABLE_I18N */
3737 \f
3738 /* Functions for binary tree operation. */
3739
3740 /* Create a tree node. */
3741
3742 static bin_tree_t *
3743 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3744 re_token_type_t type)
3745 {
3746 re_token_t t;
3747 t.type = type;
3748 return create_token_tree (dfa, left, right, &t);
3749 }
3750
3751 static bin_tree_t *
3752 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3753 const re_token_t *token)
3754 {
3755 bin_tree_t *tree;
3756 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3757 {
3758 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3759
3760 if (storage == NULL)
3761 return NULL;
3762 storage->next = dfa->str_tree_storage;
3763 dfa->str_tree_storage = storage;
3764 dfa->str_tree_storage_idx = 0;
3765 }
3766 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3767
3768 tree->parent = NULL;
3769 tree->left = left;
3770 tree->right = right;
3771 tree->token = *token;
3772 tree->token.duplicated = 0;
3773 tree->token.opt_subexp = 0;
3774 tree->first = NULL;
3775 tree->next = NULL;
3776 tree->node_idx = -1;
3777
3778 if (left != NULL)
3779 left->parent = tree;
3780 if (right != NULL)
3781 right->parent = tree;
3782 return tree;
3783 }
3784
3785 /* Mark the tree SRC as an optional subexpression.
3786 To be called from preorder or postorder. */
3787
3788 static reg_errcode_t
3789 mark_opt_subexp (void *extra, bin_tree_t *node)
3790 {
3791 int idx = (int) (long) extra;
3792 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3793 node->token.opt_subexp = 1;
3794
3795 return REG_NOERROR;
3796 }
3797
3798 /* Free the allocated memory inside NODE. */
3799
3800 static void
3801 free_token (re_token_t *node)
3802 {
3803 #ifdef RE_ENABLE_I18N
3804 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3805 free_charset (node->opr.mbcset);
3806 else
3807 #endif /* RE_ENABLE_I18N */
3808 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3809 re_free (node->opr.sbcset);
3810 }
3811
3812 /* Worker function for tree walking. Free the allocated memory inside NODE
3813 and its children. */
3814
3815 static reg_errcode_t
3816 free_tree (void *extra, bin_tree_t *node)
3817 {
3818 free_token (&node->token);
3819 return REG_NOERROR;
3820 }
3821
3822
3823 /* Duplicate the node SRC, and return new node. This is a preorder
3824 visit similar to the one implemented by the generic visitor, but
3825 we need more infrastructure to maintain two parallel trees --- so,
3826 it's easier to duplicate. */
3827
3828 static bin_tree_t *
3829 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3830 {
3831 const bin_tree_t *node;
3832 bin_tree_t *dup_root;
3833 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3834
3835 for (node = root; ; )
3836 {
3837 /* Create a new tree and link it back to the current parent. */
3838 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3839 if (*p_new == NULL)
3840 return NULL;
3841 (*p_new)->parent = dup_node;
3842 (*p_new)->token.duplicated = 1;
3843 dup_node = *p_new;
3844
3845 /* Go to the left node, or up and to the right. */
3846 if (node->left)
3847 {
3848 node = node->left;
3849 p_new = &dup_node->left;
3850 }
3851 else
3852 {
3853 const bin_tree_t *prev = NULL;
3854 while (node->right == prev || node->right == NULL)
3855 {
3856 prev = node;
3857 node = node->parent;
3858 dup_node = dup_node->parent;
3859 if (!node)
3860 return dup_root;
3861 }
3862 node = node->right;
3863 p_new = &dup_node->right;
3864 }
3865 }
3866 }