]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
More regex memory leak fixes and tests.
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 \f
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129 const char __re_error_msgid[] attribute_hidden =
130 {
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203 \f
204 /* Entry points for GNU code. */
205
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
209
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
212
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218 {
219 reg_errcode_t ret;
220
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225
226 /* Match anchors at newline. */
227 bufp->newline_anchor = 1;
228
229 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230
231 if (!ret)
232 return NULL;
233 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 }
235 #ifdef _LIBC
236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
238
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options;
245
246
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
250
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
253
254 reg_syntax_t
255 re_set_syntax (syntax)
256 reg_syntax_t syntax;
257 {
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262 }
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266
267 int
268 re_compile_fastmap (bufp)
269 struct re_pattern_buffer *bufp;
270 {
271 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
272 char *fastmap = bufp->fastmap;
273
274 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
275 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
276 if (dfa->init_state != dfa->init_state_word)
277 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
278 if (dfa->init_state != dfa->init_state_nl)
279 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
280 if (dfa->init_state != dfa->init_state_begbuf)
281 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
282 bufp->fastmap_accurate = 1;
283 return 0;
284 }
285 #ifdef _LIBC
286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
287 #endif
288
289 static inline void
290 __attribute ((always_inline))
291 re_set_fastmap (char *fastmap, int icase, int ch)
292 {
293 fastmap[ch] = 1;
294 if (icase)
295 fastmap[tolower (ch)] = 1;
296 }
297
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
300
301 static void
302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
303 char *fastmap)
304 {
305 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306 int node_cnt;
307 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
308 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 {
310 int node = init_state->nodes.elems[node_cnt];
311 re_token_type_t type = dfa->nodes[node].type;
312
313 if (type == CHARACTER)
314 {
315 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 {
319 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
320 wchar_t wc;
321 mbstate_t state;
322
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, 0, buf[0]);
335 }
336 #endif
337 }
338 else if (type == SIMPLE_BRACKET)
339 {
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 {
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
348 }
349 }
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
352 {
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 int i;
355
356 # ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
365 {
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
371 }
372 # endif /* _LIBC */
373
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 || cset->nequiv_classes
382 # endif /* _LIBC */
383 ))
384 {
385 unsigned char c = 0;
386 do
387 {
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
392 }
393 while (++c != 0);
394 }
395
396 else
397 {
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
400 {
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 {
408 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 }
412 }
413 }
414 }
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
421 {
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
426 }
427 }
428 }
429 \f
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
432
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
435
436 `buffer' to the compiled pattern;
437 `used' to the length of the compiled pattern;
438 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 `fastmap' to an allocated space for the fastmap;
443 `fastmap_accurate' to zero;
444 `re_nsub' to the number of subexpressions in PATTERN.
445
446 PATTERN is the address of the pattern string.
447
448 CFLAGS is a series of bits which affect compilation.
449
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
452
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
455
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
458
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
462
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
465
466 int
467 regcomp (preg, pattern, cflags)
468 regex_t *__restrict preg;
469 const char *__restrict pattern;
470 int cflags;
471 {
472 reg_errcode_t ret;
473 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
474 : RE_SYNTAX_POSIX_BASIC);
475
476 preg->buffer = NULL;
477 preg->allocated = 0;
478 preg->used = 0;
479
480 /* Try to allocate space for the fastmap. */
481 preg->fastmap = re_malloc (char, SBC_MAX);
482 if (BE (preg->fastmap == NULL, 0))
483 return REG_ESPACE;
484
485 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
486
487 /* If REG_NEWLINE is set, newlines are treated differently. */
488 if (cflags & REG_NEWLINE)
489 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
490 syntax &= ~RE_DOT_NEWLINE;
491 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
492 /* It also changes the matching behavior. */
493 preg->newline_anchor = 1;
494 }
495 else
496 preg->newline_anchor = 0;
497 preg->no_sub = !!(cflags & REG_NOSUB);
498 preg->translate = NULL;
499
500 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
501
502 /* POSIX doesn't distinguish between an unmatched open-group and an
503 unmatched close-group: both are REG_EPAREN. */
504 if (ret == REG_ERPAREN)
505 ret = REG_EPAREN;
506
507 /* We have already checked preg->fastmap != NULL. */
508 if (BE (ret == REG_NOERROR, 1))
509 /* Compute the fastmap now, since regexec cannot modify the pattern
510 buffer. This function never fails in this implementation. */
511 (void) re_compile_fastmap (preg);
512 else
513 {
514 /* Some error occurred while compiling the expression. */
515 re_free (preg->fastmap);
516 preg->fastmap = NULL;
517 }
518
519 return (int) ret;
520 }
521 #ifdef _LIBC
522 weak_alias (__regcomp, regcomp)
523 #endif
524
525 /* Returns a message corresponding to an error code, ERRCODE, returned
526 from either regcomp or regexec. We don't use PREG here. */
527
528 size_t
529 regerror (errcode, preg, errbuf, errbuf_size)
530 int errcode;
531 const regex_t *__restrict preg;
532 char *__restrict errbuf;
533 size_t errbuf_size;
534 {
535 const char *msg;
536 size_t msg_size;
537
538 if (BE (errcode < 0
539 || errcode >= (int) (sizeof (__re_error_msgid_idx)
540 / sizeof (__re_error_msgid_idx[0])), 0))
541 /* Only error codes returned by the rest of the code should be passed
542 to this routine. If we are given anything else, or if other regex
543 code generates an invalid error code, then the program has a bug.
544 Dump core so we can fix it. */
545 abort ();
546
547 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
548
549 msg_size = strlen (msg) + 1; /* Includes the null. */
550
551 if (BE (errbuf_size != 0, 1))
552 {
553 if (BE (msg_size > errbuf_size, 0))
554 {
555 #if defined HAVE_MEMPCPY || defined _LIBC
556 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
557 #else
558 memcpy (errbuf, msg, errbuf_size - 1);
559 errbuf[errbuf_size - 1] = 0;
560 #endif
561 }
562 else
563 memcpy (errbuf, msg, msg_size);
564 }
565
566 return msg_size;
567 }
568 #ifdef _LIBC
569 weak_alias (__regerror, regerror)
570 #endif
571
572
573 #ifdef RE_ENABLE_I18N
574 /* This static array is used for the map to single-byte characters when
575 UTF-8 is used. Otherwise we would allocate memory just to initialize
576 it the same all the time. UTF-8 is the preferred encoding so this is
577 a worthwhile optimization. */
578 static const bitset_t utf8_sb_map =
579 {
580 /* Set the first 128 bits. */
581 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
582 };
583 #endif
584
585
586 static void
587 free_dfa_content (re_dfa_t *dfa)
588 {
589 int i, j;
590
591 if (dfa->nodes)
592 for (i = 0; i < dfa->nodes_len; ++i)
593 free_token (dfa->nodes + i);
594 re_free (dfa->nexts);
595 for (i = 0; i < dfa->nodes_len; ++i)
596 {
597 if (dfa->eclosures != NULL)
598 re_node_set_free (dfa->eclosures + i);
599 if (dfa->inveclosures != NULL)
600 re_node_set_free (dfa->inveclosures + i);
601 if (dfa->edests != NULL)
602 re_node_set_free (dfa->edests + i);
603 }
604 re_free (dfa->edests);
605 re_free (dfa->eclosures);
606 re_free (dfa->inveclosures);
607 re_free (dfa->nodes);
608
609 if (dfa->state_table)
610 for (i = 0; i <= dfa->state_hash_mask; ++i)
611 {
612 struct re_state_table_entry *entry = dfa->state_table + i;
613 for (j = 0; j < entry->num; ++j)
614 {
615 re_dfastate_t *state = entry->array[j];
616 free_state (state);
617 }
618 re_free (entry->array);
619 }
620 re_free (dfa->state_table);
621 #ifdef RE_ENABLE_I18N
622 if (dfa->sb_char != utf8_sb_map)
623 re_free (dfa->sb_char);
624 #endif
625 re_free (dfa->subexp_map);
626 #ifdef DEBUG
627 re_free (dfa->re_str);
628 #endif
629
630 re_free (dfa);
631 }
632
633
634 /* Free dynamically allocated space used by PREG. */
635
636 void
637 regfree (preg)
638 regex_t *preg;
639 {
640 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
641 if (BE (dfa != NULL, 1))
642 free_dfa_content (dfa);
643 preg->buffer = NULL;
644 preg->allocated = 0;
645
646 re_free (preg->fastmap);
647 preg->fastmap = NULL;
648
649 re_free (preg->translate);
650 preg->translate = NULL;
651 }
652 #ifdef _LIBC
653 weak_alias (__regfree, regfree)
654 #endif
655 \f
656 /* Entry points compatible with 4.2 BSD regex library. We don't define
657 them unless specifically requested. */
658
659 #if defined _REGEX_RE_COMP || defined _LIBC
660
661 /* BSD has one and only one pattern buffer. */
662 static struct re_pattern_buffer re_comp_buf;
663
664 char *
665 # ifdef _LIBC
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667 these names if they don't use our functions, and still use
668 regcomp/regexec above without link errors. */
669 weak_function
670 # endif
671 re_comp (s)
672 const char *s;
673 {
674 reg_errcode_t ret;
675 char *fastmap;
676
677 if (!s)
678 {
679 if (!re_comp_buf.buffer)
680 return gettext ("No previous regular expression");
681 return 0;
682 }
683
684 if (re_comp_buf.buffer)
685 {
686 fastmap = re_comp_buf.fastmap;
687 re_comp_buf.fastmap = NULL;
688 __regfree (&re_comp_buf);
689 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690 re_comp_buf.fastmap = fastmap;
691 }
692
693 if (re_comp_buf.fastmap == NULL)
694 {
695 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
696 if (re_comp_buf.fastmap == NULL)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx[(int) REG_ESPACE]);
699 }
700
701 /* Since `re_exec' always passes NULL for the `regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
703
704 /* Match anchors at newlines. */
705 re_comp_buf.newline_anchor = 1;
706
707 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708
709 if (!ret)
710 return NULL;
711
712 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714 }
715
716 #ifdef _LIBC
717 libc_freeres_fn (free_mem)
718 {
719 __regfree (&re_comp_buf);
720 }
721 #endif
722
723 #endif /* _REGEX_RE_COMP */
724 \f
725 /* Internal entry point.
726 Compile the regular expression PATTERN, whose length is LENGTH.
727 SYNTAX indicate regular expression's syntax. */
728
729 static reg_errcode_t
730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
731 reg_syntax_t syntax)
732 {
733 reg_errcode_t err = REG_NOERROR;
734 re_dfa_t *dfa;
735 re_string_t regexp;
736
737 /* Initialize the pattern buffer. */
738 preg->fastmap_accurate = 0;
739 preg->syntax = syntax;
740 preg->not_bol = preg->not_eol = 0;
741 preg->used = 0;
742 preg->re_nsub = 0;
743 preg->can_be_null = 0;
744 preg->regs_allocated = REGS_UNALLOCATED;
745
746 /* Initialize the dfa. */
747 dfa = (re_dfa_t *) preg->buffer;
748 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749 {
750 /* If zero allocated, but buffer is non-null, try to realloc
751 enough space. This loses if buffer's address is bogus, but
752 that is the user's responsibility. If ->buffer is NULL this
753 is a simple allocation. */
754 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
755 if (dfa == NULL)
756 return REG_ESPACE;
757 preg->allocated = sizeof (re_dfa_t);
758 preg->buffer = (unsigned char *) dfa;
759 }
760 preg->used = sizeof (re_dfa_t);
761
762 err = init_dfa (dfa, length);
763 if (BE (err != REG_NOERROR, 0))
764 {
765 free_dfa_content (dfa);
766 preg->buffer = NULL;
767 preg->allocated = 0;
768 return err;
769 }
770 #ifdef DEBUG
771 /* Note: length+1 will not overflow since it is checked in init_dfa. */
772 dfa->re_str = re_malloc (char, length + 1);
773 strncpy (dfa->re_str, pattern, length + 1);
774 #endif
775
776 __libc_lock_init (dfa->lock);
777
778 err = re_string_construct (&regexp, pattern, length, preg->translate,
779 syntax & RE_ICASE, dfa);
780 if (BE (err != REG_NOERROR, 0))
781 {
782 re_compile_internal_free_return:
783 free_workarea_compile (preg);
784 re_string_destruct (&regexp);
785 free_dfa_content (dfa);
786 preg->buffer = NULL;
787 preg->allocated = 0;
788 return err;
789 }
790
791 /* Parse the regular expression, and build a structure tree. */
792 preg->re_nsub = 0;
793 dfa->str_tree = parse (&regexp, preg, syntax, &err);
794 if (BE (dfa->str_tree == NULL, 0))
795 goto re_compile_internal_free_return;
796
797 /* Analyze the tree and create the nfa. */
798 err = analyze (preg);
799 if (BE (err != REG_NOERROR, 0))
800 goto re_compile_internal_free_return;
801
802 #ifdef RE_ENABLE_I18N
803 /* If possible, do searching in single byte encoding to speed things up. */
804 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
805 optimize_utf8 (dfa);
806 #endif
807
808 /* Then create the initial state of the dfa. */
809 err = create_initial_state (dfa);
810
811 /* Release work areas. */
812 free_workarea_compile (preg);
813 re_string_destruct (&regexp);
814
815 if (BE (err != REG_NOERROR, 0))
816 {
817 free_dfa_content (dfa);
818 preg->buffer = NULL;
819 preg->allocated = 0;
820 }
821
822 return err;
823 }
824
825 /* Initialize DFA. We use the length of the regular expression PAT_LEN
826 as the initial length of some arrays. */
827
828 static reg_errcode_t
829 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 {
831 unsigned int table_size;
832 #ifndef _LIBC
833 char *codeset_name;
834 #endif
835
836 memset (dfa, '\0', sizeof (re_dfa_t));
837
838 /* Force allocation of str_tree_storage the first time. */
839 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
840
841 /* Avoid overflows. */
842 if (pat_len == SIZE_MAX)
843 return REG_ESPACE;
844
845 dfa->nodes_alloc = pat_len + 1;
846 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
847
848 /* table_size = 2 ^ ceil(log pat_len) */
849 for (table_size = 1; ; table_size <<= 1)
850 if (table_size > pat_len)
851 break;
852
853 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
854 dfa->state_hash_mask = table_size - 1;
855
856 dfa->mb_cur_max = MB_CUR_MAX;
857 #ifdef _LIBC
858 if (dfa->mb_cur_max == 6
859 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
860 dfa->is_utf8 = 1;
861 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
862 != 0);
863 #else
864 # ifdef HAVE_LANGINFO_CODESET
865 codeset_name = nl_langinfo (CODESET);
866 # else
867 codeset_name = getenv ("LC_ALL");
868 if (codeset_name == NULL || codeset_name[0] == '\0')
869 codeset_name = getenv ("LC_CTYPE");
870 if (codeset_name == NULL || codeset_name[0] == '\0')
871 codeset_name = getenv ("LANG");
872 if (codeset_name == NULL)
873 codeset_name = "";
874 else if (strchr (codeset_name, '.') != NULL)
875 codeset_name = strchr (codeset_name, '.') + 1;
876 # endif
877
878 if (strcasecmp (codeset_name, "UTF-8") == 0
879 || strcasecmp (codeset_name, "UTF8") == 0)
880 dfa->is_utf8 = 1;
881
882 /* We check exhaustively in the loop below if this charset is a
883 superset of ASCII. */
884 dfa->map_notascii = 0;
885 #endif
886
887 #ifdef RE_ENABLE_I18N
888 if (dfa->mb_cur_max > 1)
889 {
890 if (dfa->is_utf8)
891 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
892 else
893 {
894 int i, j, ch;
895
896 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
897 if (BE (dfa->sb_char == NULL, 0))
898 return REG_ESPACE;
899
900 /* Set the bits corresponding to single byte chars. */
901 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
902 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
903 {
904 wint_t wch = __btowc (ch);
905 if (wch != WEOF)
906 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
907 # ifndef _LIBC
908 if (isascii (ch) && wch != ch)
909 dfa->map_notascii = 1;
910 # endif
911 }
912 }
913 }
914 #endif
915
916 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
917 return REG_ESPACE;
918 return REG_NOERROR;
919 }
920
921 /* Initialize WORD_CHAR table, which indicate which character is
922 "word". In this case "word" means that it is the word construction
923 character used by some operators like "\<", "\>", etc. */
924
925 static void
926 internal_function
927 init_word_char (re_dfa_t *dfa)
928 {
929 int i, j, ch;
930 dfa->word_ops_used = 1;
931 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
932 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
933 if (isalnum (ch) || ch == '_')
934 dfa->word_char[i] |= (bitset_word_t) 1 << j;
935 }
936
937 /* Free the work area which are only used while compiling. */
938
939 static void
940 free_workarea_compile (regex_t *preg)
941 {
942 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
943 bin_tree_storage_t *storage, *next;
944 for (storage = dfa->str_tree_storage; storage; storage = next)
945 {
946 next = storage->next;
947 re_free (storage);
948 }
949 dfa->str_tree_storage = NULL;
950 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
951 dfa->str_tree = NULL;
952 re_free (dfa->org_indices);
953 dfa->org_indices = NULL;
954 }
955
956 /* Create initial states for all contexts. */
957
958 static reg_errcode_t
959 create_initial_state (re_dfa_t *dfa)
960 {
961 int first, i;
962 reg_errcode_t err;
963 re_node_set init_nodes;
964
965 /* Initial states have the epsilon closure of the node which is
966 the first node of the regular expression. */
967 first = dfa->str_tree->first->node_idx;
968 dfa->init_node = first;
969 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
970 if (BE (err != REG_NOERROR, 0))
971 return err;
972
973 /* The back-references which are in initial states can epsilon transit,
974 since in this case all of the subexpressions can be null.
975 Then we add epsilon closures of the nodes which are the next nodes of
976 the back-references. */
977 if (dfa->nbackref > 0)
978 for (i = 0; i < init_nodes.nelem; ++i)
979 {
980 int node_idx = init_nodes.elems[i];
981 re_token_type_t type = dfa->nodes[node_idx].type;
982
983 int clexp_idx;
984 if (type != OP_BACK_REF)
985 continue;
986 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
987 {
988 re_token_t *clexp_node;
989 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
990 if (clexp_node->type == OP_CLOSE_SUBEXP
991 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
992 break;
993 }
994 if (clexp_idx == init_nodes.nelem)
995 continue;
996
997 if (type == OP_BACK_REF)
998 {
999 int dest_idx = dfa->edests[node_idx].elems[0];
1000 if (!re_node_set_contains (&init_nodes, dest_idx))
1001 {
1002 reg_errcode_t err = re_node_set_merge (&init_nodes,
1003 dfa->eclosures
1004 + dest_idx);
1005 if (err != REG_NOERROR)
1006 return err;
1007 i = 0;
1008 }
1009 }
1010 }
1011
1012 /* It must be the first time to invoke acquire_state. */
1013 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1014 /* We don't check ERR here, since the initial state must not be NULL. */
1015 if (BE (dfa->init_state == NULL, 0))
1016 return err;
1017 if (dfa->init_state->has_constraint)
1018 {
1019 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1020 CONTEXT_WORD);
1021 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1022 CONTEXT_NEWLINE);
1023 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1024 &init_nodes,
1025 CONTEXT_NEWLINE
1026 | CONTEXT_BEGBUF);
1027 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1028 || dfa->init_state_begbuf == NULL, 0))
1029 return err;
1030 }
1031 else
1032 dfa->init_state_word = dfa->init_state_nl
1033 = dfa->init_state_begbuf = dfa->init_state;
1034
1035 re_node_set_free (&init_nodes);
1036 return REG_NOERROR;
1037 }
1038 \f
1039 #ifdef RE_ENABLE_I18N
1040 /* If it is possible to do searching in single byte encoding instead of UTF-8
1041 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1042 DFA nodes where needed. */
1043
1044 static void
1045 optimize_utf8 (re_dfa_t *dfa)
1046 {
1047 int node, i, mb_chars = 0, has_period = 0;
1048
1049 for (node = 0; node < dfa->nodes_len; ++node)
1050 switch (dfa->nodes[node].type)
1051 {
1052 case CHARACTER:
1053 if (dfa->nodes[node].opr.c >= 0x80)
1054 mb_chars = 1;
1055 break;
1056 case ANCHOR:
1057 switch (dfa->nodes[node].opr.ctx_type)
1058 {
1059 case LINE_FIRST:
1060 case LINE_LAST:
1061 case BUF_FIRST:
1062 case BUF_LAST:
1063 break;
1064 default:
1065 /* Word anchors etc. cannot be handled. It's okay to test
1066 opr.ctx_type since constraints (for all DFA nodes) are
1067 created by ORing one or more opr.ctx_type values. */
1068 return;
1069 }
1070 break;
1071 case OP_PERIOD:
1072 has_period = 1;
1073 break;
1074 case OP_BACK_REF:
1075 case OP_ALT:
1076 case END_OF_RE:
1077 case OP_DUP_ASTERISK:
1078 case OP_OPEN_SUBEXP:
1079 case OP_CLOSE_SUBEXP:
1080 break;
1081 case COMPLEX_BRACKET:
1082 return;
1083 case SIMPLE_BRACKET:
1084 /* Just double check. The non-ASCII range starts at 0x80. */
1085 assert (0x80 % BITSET_WORD_BITS == 0);
1086 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1087 if (dfa->nodes[node].opr.sbcset[i])
1088 return;
1089 break;
1090 default:
1091 abort ();
1092 }
1093
1094 if (mb_chars || has_period)
1095 for (node = 0; node < dfa->nodes_len; ++node)
1096 {
1097 if (dfa->nodes[node].type == CHARACTER
1098 && dfa->nodes[node].opr.c >= 0x80)
1099 dfa->nodes[node].mb_partial = 0;
1100 else if (dfa->nodes[node].type == OP_PERIOD)
1101 dfa->nodes[node].type = OP_UTF8_PERIOD;
1102 }
1103
1104 /* The search can be in single byte locale. */
1105 dfa->mb_cur_max = 1;
1106 dfa->is_utf8 = 0;
1107 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1108 }
1109 #endif
1110 \f
1111 /* Analyze the structure tree, and calculate "first", "next", "edest",
1112 "eclosure", and "inveclosure". */
1113
1114 static reg_errcode_t
1115 analyze (regex_t *preg)
1116 {
1117 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1118 reg_errcode_t ret;
1119
1120 /* Allocate arrays. */
1121 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1122 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1123 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1124 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1125 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1126 || dfa->eclosures == NULL, 0))
1127 return REG_ESPACE;
1128
1129 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1130 if (dfa->subexp_map != NULL)
1131 {
1132 int i;
1133 for (i = 0; i < preg->re_nsub; i++)
1134 dfa->subexp_map[i] = i;
1135 preorder (dfa->str_tree, optimize_subexps, dfa);
1136 for (i = 0; i < preg->re_nsub; i++)
1137 if (dfa->subexp_map[i] != i)
1138 break;
1139 if (i == preg->re_nsub)
1140 {
1141 free (dfa->subexp_map);
1142 dfa->subexp_map = NULL;
1143 }
1144 }
1145
1146 ret = postorder (dfa->str_tree, lower_subexps, preg);
1147 if (BE (ret != REG_NOERROR, 0))
1148 return ret;
1149 ret = postorder (dfa->str_tree, calc_first, dfa);
1150 if (BE (ret != REG_NOERROR, 0))
1151 return ret;
1152 preorder (dfa->str_tree, calc_next, dfa);
1153 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1154 if (BE (ret != REG_NOERROR, 0))
1155 return ret;
1156 ret = calc_eclosure (dfa);
1157 if (BE (ret != REG_NOERROR, 0))
1158 return ret;
1159
1160 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1161 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1162 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1163 || dfa->nbackref)
1164 {
1165 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1166 if (BE (dfa->inveclosures == NULL, 0))
1167 return REG_ESPACE;
1168 ret = calc_inveclosure (dfa);
1169 }
1170
1171 return ret;
1172 }
1173
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175 implement parse tree visits. Instead, we use parent pointers and
1176 some hairy code in these two functions. */
1177 static reg_errcode_t
1178 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1179 void *extra)
1180 {
1181 bin_tree_t *node, *prev;
1182
1183 for (node = root; ; )
1184 {
1185 /* Descend down the tree, preferably to the left (or to the right
1186 if that's the only child). */
1187 while (node->left || node->right)
1188 if (node->left)
1189 node = node->left;
1190 else
1191 node = node->right;
1192
1193 do
1194 {
1195 reg_errcode_t err = fn (extra, node);
1196 if (BE (err != REG_NOERROR, 0))
1197 return err;
1198 if (node->parent == NULL)
1199 return REG_NOERROR;
1200 prev = node;
1201 node = node->parent;
1202 }
1203 /* Go up while we have a node that is reached from the right. */
1204 while (node->right == prev || node->right == NULL);
1205 node = node->right;
1206 }
1207 }
1208
1209 static reg_errcode_t
1210 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211 void *extra)
1212 {
1213 bin_tree_t *node;
1214
1215 for (node = root; ; )
1216 {
1217 reg_errcode_t err = fn (extra, node);
1218 if (BE (err != REG_NOERROR, 0))
1219 return err;
1220
1221 /* Go to the left node, or up and to the right. */
1222 if (node->left)
1223 node = node->left;
1224 else
1225 {
1226 bin_tree_t *prev = NULL;
1227 while (node->right == prev || node->right == NULL)
1228 {
1229 prev = node;
1230 node = node->parent;
1231 if (!node)
1232 return REG_NOERROR;
1233 }
1234 node = node->right;
1235 }
1236 }
1237 }
1238
1239 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1240 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1241 backreferences as well. Requires a preorder visit. */
1242 static reg_errcode_t
1243 optimize_subexps (void *extra, bin_tree_t *node)
1244 {
1245 re_dfa_t *dfa = (re_dfa_t *) extra;
1246
1247 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1248 {
1249 int idx = node->token.opr.idx;
1250 node->token.opr.idx = dfa->subexp_map[idx];
1251 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1252 }
1253
1254 else if (node->token.type == SUBEXP
1255 && node->left && node->left->token.type == SUBEXP)
1256 {
1257 int other_idx = node->left->token.opr.idx;
1258
1259 node->left = node->left->left;
1260 if (node->left)
1261 node->left->parent = node;
1262
1263 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1264 if (other_idx < BITSET_WORD_BITS)
1265 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1266 }
1267
1268 return REG_NOERROR;
1269 }
1270
1271 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1272 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1273 static reg_errcode_t
1274 lower_subexps (void *extra, bin_tree_t *node)
1275 {
1276 regex_t *preg = (regex_t *) extra;
1277 reg_errcode_t err = REG_NOERROR;
1278
1279 if (node->left && node->left->token.type == SUBEXP)
1280 {
1281 node->left = lower_subexp (&err, preg, node->left);
1282 if (node->left)
1283 node->left->parent = node;
1284 }
1285 if (node->right && node->right->token.type == SUBEXP)
1286 {
1287 node->right = lower_subexp (&err, preg, node->right);
1288 if (node->right)
1289 node->right->parent = node;
1290 }
1291
1292 return err;
1293 }
1294
1295 static bin_tree_t *
1296 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1297 {
1298 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1299 bin_tree_t *body = node->left;
1300 bin_tree_t *op, *cls, *tree1, *tree;
1301
1302 if (preg->no_sub
1303 /* We do not optimize empty subexpressions, because otherwise we may
1304 have bad CONCAT nodes with NULL children. This is obviously not
1305 very common, so we do not lose much. An example that triggers
1306 this case is the sed "script" /\(\)/x. */
1307 && node->left != NULL
1308 && (node->token.opr.idx >= BITSET_WORD_BITS
1309 || !(dfa->used_bkref_map
1310 & ((bitset_word_t) 1 << node->token.opr.idx))))
1311 return node->left;
1312
1313 /* Convert the SUBEXP node to the concatenation of an
1314 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1315 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1316 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1317 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1318 tree = create_tree (dfa, op, tree1, CONCAT);
1319 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1320 {
1321 *err = REG_ESPACE;
1322 return NULL;
1323 }
1324
1325 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1326 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1327 return tree;
1328 }
1329
1330 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1331 nodes. Requires a postorder visit. */
1332 static reg_errcode_t
1333 calc_first (void *extra, bin_tree_t *node)
1334 {
1335 re_dfa_t *dfa = (re_dfa_t *) extra;
1336 if (node->token.type == CONCAT)
1337 {
1338 node->first = node->left->first;
1339 node->node_idx = node->left->node_idx;
1340 }
1341 else
1342 {
1343 node->first = node;
1344 node->node_idx = re_dfa_add_node (dfa, node->token);
1345 if (BE (node->node_idx == -1, 0))
1346 return REG_ESPACE;
1347 if (node->token.type == ANCHOR)
1348 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1349 }
1350 return REG_NOERROR;
1351 }
1352
1353 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1354 static reg_errcode_t
1355 calc_next (void *extra, bin_tree_t *node)
1356 {
1357 switch (node->token.type)
1358 {
1359 case OP_DUP_ASTERISK:
1360 node->left->next = node;
1361 break;
1362 case CONCAT:
1363 node->left->next = node->right->first;
1364 node->right->next = node->next;
1365 break;
1366 default:
1367 if (node->left)
1368 node->left->next = node->next;
1369 if (node->right)
1370 node->right->next = node->next;
1371 break;
1372 }
1373 return REG_NOERROR;
1374 }
1375
1376 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1377 static reg_errcode_t
1378 link_nfa_nodes (void *extra, bin_tree_t *node)
1379 {
1380 re_dfa_t *dfa = (re_dfa_t *) extra;
1381 int idx = node->node_idx;
1382 reg_errcode_t err = REG_NOERROR;
1383
1384 switch (node->token.type)
1385 {
1386 case CONCAT:
1387 break;
1388
1389 case END_OF_RE:
1390 assert (node->next == NULL);
1391 break;
1392
1393 case OP_DUP_ASTERISK:
1394 case OP_ALT:
1395 {
1396 int left, right;
1397 dfa->has_plural_match = 1;
1398 if (node->left != NULL)
1399 left = node->left->first->node_idx;
1400 else
1401 left = node->next->node_idx;
1402 if (node->right != NULL)
1403 right = node->right->first->node_idx;
1404 else
1405 right = node->next->node_idx;
1406 assert (left > -1);
1407 assert (right > -1);
1408 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1409 }
1410 break;
1411
1412 case ANCHOR:
1413 case OP_OPEN_SUBEXP:
1414 case OP_CLOSE_SUBEXP:
1415 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1416 break;
1417
1418 case OP_BACK_REF:
1419 dfa->nexts[idx] = node->next->node_idx;
1420 if (node->token.type == OP_BACK_REF)
1421 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1422 break;
1423
1424 default:
1425 assert (!IS_EPSILON_NODE (node->token.type));
1426 dfa->nexts[idx] = node->next->node_idx;
1427 break;
1428 }
1429
1430 return err;
1431 }
1432
1433 /* Duplicate the epsilon closure of the node ROOT_NODE.
1434 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1435 to their own constraint. */
1436
1437 static reg_errcode_t
1438 internal_function
1439 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1440 int root_node, unsigned int init_constraint)
1441 {
1442 int org_node, clone_node, ret;
1443 unsigned int constraint = init_constraint;
1444 for (org_node = top_org_node, clone_node = top_clone_node;;)
1445 {
1446 int org_dest, clone_dest;
1447 if (dfa->nodes[org_node].type == OP_BACK_REF)
1448 {
1449 /* If the back reference epsilon-transit, its destination must
1450 also have the constraint. Then duplicate the epsilon closure
1451 of the destination of the back reference, and store it in
1452 edests of the back reference. */
1453 org_dest = dfa->nexts[org_node];
1454 re_node_set_empty (dfa->edests + clone_node);
1455 clone_dest = duplicate_node (dfa, org_dest, constraint);
1456 if (BE (clone_dest == -1, 0))
1457 return REG_ESPACE;
1458 dfa->nexts[clone_node] = dfa->nexts[org_node];
1459 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1460 if (BE (ret < 0, 0))
1461 return REG_ESPACE;
1462 }
1463 else if (dfa->edests[org_node].nelem == 0)
1464 {
1465 /* In case of the node can't epsilon-transit, don't duplicate the
1466 destination and store the original destination as the
1467 destination of the node. */
1468 dfa->nexts[clone_node] = dfa->nexts[org_node];
1469 break;
1470 }
1471 else if (dfa->edests[org_node].nelem == 1)
1472 {
1473 /* In case of the node can epsilon-transit, and it has only one
1474 destination. */
1475 org_dest = dfa->edests[org_node].elems[0];
1476 re_node_set_empty (dfa->edests + clone_node);
1477 /* If the node is root_node itself, it means the epsilon clsoure
1478 has a loop. Then tie it to the destination of the root_node. */
1479 if (org_node == root_node && clone_node != org_node)
1480 {
1481 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1482 if (BE (ret < 0, 0))
1483 return REG_ESPACE;
1484 break;
1485 }
1486 /* In case of the node has another constraint, add it. */
1487 constraint |= dfa->nodes[org_node].constraint;
1488 clone_dest = duplicate_node (dfa, org_dest, constraint);
1489 if (BE (clone_dest == -1, 0))
1490 return REG_ESPACE;
1491 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492 if (BE (ret < 0, 0))
1493 return REG_ESPACE;
1494 }
1495 else /* dfa->edests[org_node].nelem == 2 */
1496 {
1497 /* In case of the node can epsilon-transit, and it has two
1498 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1499 org_dest = dfa->edests[org_node].elems[0];
1500 re_node_set_empty (dfa->edests + clone_node);
1501 /* Search for a duplicated node which satisfies the constraint. */
1502 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1503 if (clone_dest == -1)
1504 {
1505 /* There is no such duplicated node, create a new one. */
1506 reg_errcode_t err;
1507 clone_dest = duplicate_node (dfa, org_dest, constraint);
1508 if (BE (clone_dest == -1, 0))
1509 return REG_ESPACE;
1510 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1511 if (BE (ret < 0, 0))
1512 return REG_ESPACE;
1513 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1514 root_node, constraint);
1515 if (BE (err != REG_NOERROR, 0))
1516 return err;
1517 }
1518 else
1519 {
1520 /* There is a duplicated node which satisfies the constraint,
1521 use it to avoid infinite loop. */
1522 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523 if (BE (ret < 0, 0))
1524 return REG_ESPACE;
1525 }
1526
1527 org_dest = dfa->edests[org_node].elems[1];
1528 clone_dest = duplicate_node (dfa, org_dest, constraint);
1529 if (BE (clone_dest == -1, 0))
1530 return REG_ESPACE;
1531 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1532 if (BE (ret < 0, 0))
1533 return REG_ESPACE;
1534 }
1535 org_node = org_dest;
1536 clone_node = clone_dest;
1537 }
1538 return REG_NOERROR;
1539 }
1540
1541 /* Search for a node which is duplicated from the node ORG_NODE, and
1542 satisfies the constraint CONSTRAINT. */
1543
1544 static int
1545 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1546 unsigned int constraint)
1547 {
1548 int idx;
1549 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1550 {
1551 if (org_node == dfa->org_indices[idx]
1552 && constraint == dfa->nodes[idx].constraint)
1553 return idx; /* Found. */
1554 }
1555 return -1; /* Not found. */
1556 }
1557
1558 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1559 Return the index of the new node, or -1 if insufficient storage is
1560 available. */
1561
1562 static int
1563 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1564 {
1565 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1566 if (BE (dup_idx != -1, 1))
1567 {
1568 dfa->nodes[dup_idx].constraint = constraint;
1569 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1570 dfa->nodes[dup_idx].duplicated = 1;
1571
1572 /* Store the index of the original node. */
1573 dfa->org_indices[dup_idx] = org_idx;
1574 }
1575 return dup_idx;
1576 }
1577
1578 static reg_errcode_t
1579 calc_inveclosure (re_dfa_t *dfa)
1580 {
1581 int src, idx, ret;
1582 for (idx = 0; idx < dfa->nodes_len; ++idx)
1583 re_node_set_init_empty (dfa->inveclosures + idx);
1584
1585 for (src = 0; src < dfa->nodes_len; ++src)
1586 {
1587 int *elems = dfa->eclosures[src].elems;
1588 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1589 {
1590 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1591 if (BE (ret == -1, 0))
1592 return REG_ESPACE;
1593 }
1594 }
1595
1596 return REG_NOERROR;
1597 }
1598
1599 /* Calculate "eclosure" for all the node in DFA. */
1600
1601 static reg_errcode_t
1602 calc_eclosure (re_dfa_t *dfa)
1603 {
1604 int node_idx, incomplete;
1605 #ifdef DEBUG
1606 assert (dfa->nodes_len > 0);
1607 #endif
1608 incomplete = 0;
1609 /* For each nodes, calculate epsilon closure. */
1610 for (node_idx = 0; ; ++node_idx)
1611 {
1612 reg_errcode_t err;
1613 re_node_set eclosure_elem;
1614 if (node_idx == dfa->nodes_len)
1615 {
1616 if (!incomplete)
1617 break;
1618 incomplete = 0;
1619 node_idx = 0;
1620 }
1621
1622 #ifdef DEBUG
1623 assert (dfa->eclosures[node_idx].nelem != -1);
1624 #endif
1625
1626 /* If we have already calculated, skip it. */
1627 if (dfa->eclosures[node_idx].nelem != 0)
1628 continue;
1629 /* Calculate epsilon closure of `node_idx'. */
1630 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1631 if (BE (err != REG_NOERROR, 0))
1632 return err;
1633
1634 if (dfa->eclosures[node_idx].nelem == 0)
1635 {
1636 incomplete = 1;
1637 re_node_set_free (&eclosure_elem);
1638 }
1639 }
1640 return REG_NOERROR;
1641 }
1642
1643 /* Calculate epsilon closure of NODE. */
1644
1645 static reg_errcode_t
1646 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1647 {
1648 reg_errcode_t err;
1649 int i;
1650 re_node_set eclosure;
1651 int ret;
1652 int incomplete = 0;
1653 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1654 if (BE (err != REG_NOERROR, 0))
1655 return err;
1656
1657 /* This indicates that we are calculating this node now.
1658 We reference this value to avoid infinite loop. */
1659 dfa->eclosures[node].nelem = -1;
1660
1661 /* If the current node has constraints, duplicate all nodes
1662 since they must inherit the constraints. */
1663 if (dfa->nodes[node].constraint
1664 && dfa->edests[node].nelem
1665 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1666 {
1667 err = duplicate_node_closure (dfa, node, node, node,
1668 dfa->nodes[node].constraint);
1669 if (BE (err != REG_NOERROR, 0))
1670 return err;
1671 }
1672
1673 /* Expand each epsilon destination nodes. */
1674 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1675 for (i = 0; i < dfa->edests[node].nelem; ++i)
1676 {
1677 re_node_set eclosure_elem;
1678 int edest = dfa->edests[node].elems[i];
1679 /* If calculating the epsilon closure of `edest' is in progress,
1680 return intermediate result. */
1681 if (dfa->eclosures[edest].nelem == -1)
1682 {
1683 incomplete = 1;
1684 continue;
1685 }
1686 /* If we haven't calculated the epsilon closure of `edest' yet,
1687 calculate now. Otherwise use calculated epsilon closure. */
1688 if (dfa->eclosures[edest].nelem == 0)
1689 {
1690 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1691 if (BE (err != REG_NOERROR, 0))
1692 return err;
1693 }
1694 else
1695 eclosure_elem = dfa->eclosures[edest];
1696 /* Merge the epsilon closure of `edest'. */
1697 err = re_node_set_merge (&eclosure, &eclosure_elem);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1700 /* If the epsilon closure of `edest' is incomplete,
1701 the epsilon closure of this node is also incomplete. */
1702 if (dfa->eclosures[edest].nelem == 0)
1703 {
1704 incomplete = 1;
1705 re_node_set_free (&eclosure_elem);
1706 }
1707 }
1708
1709 /* An epsilon closure includes itself. */
1710 ret = re_node_set_insert (&eclosure, node);
1711 if (BE (ret < 0, 0))
1712 return REG_ESPACE;
1713 if (incomplete && !root)
1714 dfa->eclosures[node].nelem = 0;
1715 else
1716 dfa->eclosures[node] = eclosure;
1717 *new_set = eclosure;
1718 return REG_NOERROR;
1719 }
1720 \f
1721 /* Functions for token which are used in the parser. */
1722
1723 /* Fetch a token from INPUT.
1724 We must not use this function inside bracket expressions. */
1725
1726 static void
1727 internal_function
1728 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1729 {
1730 re_string_skip_bytes (input, peek_token (result, input, syntax));
1731 }
1732
1733 /* Peek a token from INPUT, and return the length of the token.
1734 We must not use this function inside bracket expressions. */
1735
1736 static int
1737 internal_function
1738 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1739 {
1740 unsigned char c;
1741
1742 if (re_string_eoi (input))
1743 {
1744 token->type = END_OF_RE;
1745 return 0;
1746 }
1747
1748 c = re_string_peek_byte (input, 0);
1749 token->opr.c = c;
1750
1751 token->word_char = 0;
1752 #ifdef RE_ENABLE_I18N
1753 token->mb_partial = 0;
1754 if (input->mb_cur_max > 1 &&
1755 !re_string_first_byte (input, re_string_cur_idx (input)))
1756 {
1757 token->type = CHARACTER;
1758 token->mb_partial = 1;
1759 return 1;
1760 }
1761 #endif
1762 if (c == '\\')
1763 {
1764 unsigned char c2;
1765 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1766 {
1767 token->type = BACK_SLASH;
1768 return 1;
1769 }
1770
1771 c2 = re_string_peek_byte_case (input, 1);
1772 token->opr.c = c2;
1773 token->type = CHARACTER;
1774 #ifdef RE_ENABLE_I18N
1775 if (input->mb_cur_max > 1)
1776 {
1777 wint_t wc = re_string_wchar_at (input,
1778 re_string_cur_idx (input) + 1);
1779 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1780 }
1781 else
1782 #endif
1783 token->word_char = IS_WORD_CHAR (c2) != 0;
1784
1785 switch (c2)
1786 {
1787 case '|':
1788 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1789 token->type = OP_ALT;
1790 break;
1791 case '1': case '2': case '3': case '4': case '5':
1792 case '6': case '7': case '8': case '9':
1793 if (!(syntax & RE_NO_BK_REFS))
1794 {
1795 token->type = OP_BACK_REF;
1796 token->opr.idx = c2 - '1';
1797 }
1798 break;
1799 case '<':
1800 if (!(syntax & RE_NO_GNU_OPS))
1801 {
1802 token->type = ANCHOR;
1803 token->opr.ctx_type = WORD_FIRST;
1804 }
1805 break;
1806 case '>':
1807 if (!(syntax & RE_NO_GNU_OPS))
1808 {
1809 token->type = ANCHOR;
1810 token->opr.ctx_type = WORD_LAST;
1811 }
1812 break;
1813 case 'b':
1814 if (!(syntax & RE_NO_GNU_OPS))
1815 {
1816 token->type = ANCHOR;
1817 token->opr.ctx_type = WORD_DELIM;
1818 }
1819 break;
1820 case 'B':
1821 if (!(syntax & RE_NO_GNU_OPS))
1822 {
1823 token->type = ANCHOR;
1824 token->opr.ctx_type = NOT_WORD_DELIM;
1825 }
1826 break;
1827 case 'w':
1828 if (!(syntax & RE_NO_GNU_OPS))
1829 token->type = OP_WORD;
1830 break;
1831 case 'W':
1832 if (!(syntax & RE_NO_GNU_OPS))
1833 token->type = OP_NOTWORD;
1834 break;
1835 case 's':
1836 if (!(syntax & RE_NO_GNU_OPS))
1837 token->type = OP_SPACE;
1838 break;
1839 case 'S':
1840 if (!(syntax & RE_NO_GNU_OPS))
1841 token->type = OP_NOTSPACE;
1842 break;
1843 case '`':
1844 if (!(syntax & RE_NO_GNU_OPS))
1845 {
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = BUF_FIRST;
1848 }
1849 break;
1850 case '\'':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 {
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = BUF_LAST;
1855 }
1856 break;
1857 case '(':
1858 if (!(syntax & RE_NO_BK_PARENS))
1859 token->type = OP_OPEN_SUBEXP;
1860 break;
1861 case ')':
1862 if (!(syntax & RE_NO_BK_PARENS))
1863 token->type = OP_CLOSE_SUBEXP;
1864 break;
1865 case '+':
1866 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1867 token->type = OP_DUP_PLUS;
1868 break;
1869 case '?':
1870 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1871 token->type = OP_DUP_QUESTION;
1872 break;
1873 case '{':
1874 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1875 token->type = OP_OPEN_DUP_NUM;
1876 break;
1877 case '}':
1878 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1879 token->type = OP_CLOSE_DUP_NUM;
1880 break;
1881 default:
1882 break;
1883 }
1884 return 2;
1885 }
1886
1887 token->type = CHARACTER;
1888 #ifdef RE_ENABLE_I18N
1889 if (input->mb_cur_max > 1)
1890 {
1891 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893 }
1894 else
1895 #endif
1896 token->word_char = IS_WORD_CHAR (token->opr.c);
1897
1898 switch (c)
1899 {
1900 case '\n':
1901 if (syntax & RE_NEWLINE_ALT)
1902 token->type = OP_ALT;
1903 break;
1904 case '|':
1905 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1906 token->type = OP_ALT;
1907 break;
1908 case '*':
1909 token->type = OP_DUP_ASTERISK;
1910 break;
1911 case '+':
1912 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1913 token->type = OP_DUP_PLUS;
1914 break;
1915 case '?':
1916 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1917 token->type = OP_DUP_QUESTION;
1918 break;
1919 case '{':
1920 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1921 token->type = OP_OPEN_DUP_NUM;
1922 break;
1923 case '}':
1924 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1925 token->type = OP_CLOSE_DUP_NUM;
1926 break;
1927 case '(':
1928 if (syntax & RE_NO_BK_PARENS)
1929 token->type = OP_OPEN_SUBEXP;
1930 break;
1931 case ')':
1932 if (syntax & RE_NO_BK_PARENS)
1933 token->type = OP_CLOSE_SUBEXP;
1934 break;
1935 case '[':
1936 token->type = OP_OPEN_BRACKET;
1937 break;
1938 case '.':
1939 token->type = OP_PERIOD;
1940 break;
1941 case '^':
1942 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1943 re_string_cur_idx (input) != 0)
1944 {
1945 char prev = re_string_peek_byte (input, -1);
1946 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1947 break;
1948 }
1949 token->type = ANCHOR;
1950 token->opr.ctx_type = LINE_FIRST;
1951 break;
1952 case '$':
1953 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1954 re_string_cur_idx (input) + 1 != re_string_length (input))
1955 {
1956 re_token_t next;
1957 re_string_skip_bytes (input, 1);
1958 peek_token (&next, input, syntax);
1959 re_string_skip_bytes (input, -1);
1960 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1961 break;
1962 }
1963 token->type = ANCHOR;
1964 token->opr.ctx_type = LINE_LAST;
1965 break;
1966 default:
1967 break;
1968 }
1969 return 1;
1970 }
1971
1972 /* Peek a token from INPUT, and return the length of the token.
1973 We must not use this function out of bracket expressions. */
1974
1975 static int
1976 internal_function
1977 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1978 {
1979 unsigned char c;
1980 if (re_string_eoi (input))
1981 {
1982 token->type = END_OF_RE;
1983 return 0;
1984 }
1985 c = re_string_peek_byte (input, 0);
1986 token->opr.c = c;
1987
1988 #ifdef RE_ENABLE_I18N
1989 if (input->mb_cur_max > 1 &&
1990 !re_string_first_byte (input, re_string_cur_idx (input)))
1991 {
1992 token->type = CHARACTER;
1993 return 1;
1994 }
1995 #endif /* RE_ENABLE_I18N */
1996
1997 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1998 && re_string_cur_idx (input) + 1 < re_string_length (input))
1999 {
2000 /* In this case, '\' escape a character. */
2001 unsigned char c2;
2002 re_string_skip_bytes (input, 1);
2003 c2 = re_string_peek_byte (input, 0);
2004 token->opr.c = c2;
2005 token->type = CHARACTER;
2006 return 1;
2007 }
2008 if (c == '[') /* '[' is a special char in a bracket exps. */
2009 {
2010 unsigned char c2;
2011 int token_len;
2012 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2013 c2 = re_string_peek_byte (input, 1);
2014 else
2015 c2 = 0;
2016 token->opr.c = c2;
2017 token_len = 2;
2018 switch (c2)
2019 {
2020 case '.':
2021 token->type = OP_OPEN_COLL_ELEM;
2022 break;
2023 case '=':
2024 token->type = OP_OPEN_EQUIV_CLASS;
2025 break;
2026 case ':':
2027 if (syntax & RE_CHAR_CLASSES)
2028 {
2029 token->type = OP_OPEN_CHAR_CLASS;
2030 break;
2031 }
2032 /* else fall through. */
2033 default:
2034 token->type = CHARACTER;
2035 token->opr.c = c;
2036 token_len = 1;
2037 break;
2038 }
2039 return token_len;
2040 }
2041 switch (c)
2042 {
2043 case '-':
2044 token->type = OP_CHARSET_RANGE;
2045 break;
2046 case ']':
2047 token->type = OP_CLOSE_BRACKET;
2048 break;
2049 case '^':
2050 token->type = OP_NON_MATCH_LIST;
2051 break;
2052 default:
2053 token->type = CHARACTER;
2054 }
2055 return 1;
2056 }
2057 \f
2058 /* Functions for parser. */
2059
2060 /* Entry point of the parser.
2061 Parse the regular expression REGEXP and return the structure tree.
2062 If an error is occured, ERR is set by error code, and return NULL.
2063 This function build the following tree, from regular expression <reg_exp>:
2064 CAT
2065 / \
2066 / \
2067 <reg_exp> EOR
2068
2069 CAT means concatenation.
2070 EOR means end of regular expression. */
2071
2072 static bin_tree_t *
2073 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2074 reg_errcode_t *err)
2075 {
2076 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2077 bin_tree_t *tree, *eor, *root;
2078 re_token_t current_token;
2079 dfa->syntax = syntax;
2080 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2081 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2082 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2083 return NULL;
2084 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2085 if (tree != NULL)
2086 root = create_tree (dfa, tree, eor, CONCAT);
2087 else
2088 root = eor;
2089 if (BE (eor == NULL || root == NULL, 0))
2090 {
2091 *err = REG_ESPACE;
2092 return NULL;
2093 }
2094 return root;
2095 }
2096
2097 /* This function build the following tree, from regular expression
2098 <branch1>|<branch2>:
2099 ALT
2100 / \
2101 / \
2102 <branch1> <branch2>
2103
2104 ALT means alternative, which represents the operator `|'. */
2105
2106 static bin_tree_t *
2107 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2108 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2109 {
2110 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111 bin_tree_t *tree, *branch = NULL;
2112 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2113 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114 return NULL;
2115
2116 while (token->type == OP_ALT)
2117 {
2118 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2119 if (token->type != OP_ALT && token->type != END_OF_RE
2120 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2121 {
2122 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2123 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2124 return NULL;
2125 }
2126 else
2127 branch = NULL;
2128 tree = create_tree (dfa, tree, branch, OP_ALT);
2129 if (BE (tree == NULL, 0))
2130 {
2131 *err = REG_ESPACE;
2132 return NULL;
2133 }
2134 }
2135 return tree;
2136 }
2137
2138 /* This function build the following tree, from regular expression
2139 <exp1><exp2>:
2140 CAT
2141 / \
2142 / \
2143 <exp1> <exp2>
2144
2145 CAT means concatenation. */
2146
2147 static bin_tree_t *
2148 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2150 {
2151 bin_tree_t *tree, *exp;
2152 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2153 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2154 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155 return NULL;
2156
2157 while (token->type != OP_ALT && token->type != END_OF_RE
2158 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2159 {
2160 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2161 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2162 {
2163 return NULL;
2164 }
2165 if (tree != NULL && exp != NULL)
2166 {
2167 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2168 if (newtree == NULL)
2169 {
2170 postorder (exp, free_tree, NULL);
2171 postorder (tree, free_tree, NULL);
2172 *err = REG_ESPACE;
2173 return NULL;
2174 }
2175 tree = newtree;
2176 }
2177 else if (tree == NULL)
2178 tree = exp;
2179 /* Otherwise exp == NULL, we don't need to create new tree. */
2180 }
2181 return tree;
2182 }
2183
2184 /* This function build the following tree, from regular expression a*:
2185 *
2186 |
2187 a
2188 */
2189
2190 static bin_tree_t *
2191 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2192 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2193 {
2194 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2195 bin_tree_t *tree;
2196 switch (token->type)
2197 {
2198 case CHARACTER:
2199 tree = create_token_tree (dfa, NULL, NULL, token);
2200 if (BE (tree == NULL, 0))
2201 {
2202 *err = REG_ESPACE;
2203 return NULL;
2204 }
2205 #ifdef RE_ENABLE_I18N
2206 if (dfa->mb_cur_max > 1)
2207 {
2208 while (!re_string_eoi (regexp)
2209 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2210 {
2211 bin_tree_t *mbc_remain;
2212 fetch_token (token, regexp, syntax);
2213 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2214 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2215 if (BE (mbc_remain == NULL || tree == NULL, 0))
2216 {
2217 *err = REG_ESPACE;
2218 return NULL;
2219 }
2220 }
2221 }
2222 #endif
2223 break;
2224 case OP_OPEN_SUBEXP:
2225 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2226 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2227 return NULL;
2228 break;
2229 case OP_OPEN_BRACKET:
2230 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2231 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2232 return NULL;
2233 break;
2234 case OP_BACK_REF:
2235 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2236 {
2237 *err = REG_ESUBREG;
2238 return NULL;
2239 }
2240 dfa->used_bkref_map |= 1 << token->opr.idx;
2241 tree = create_token_tree (dfa, NULL, NULL, token);
2242 if (BE (tree == NULL, 0))
2243 {
2244 *err = REG_ESPACE;
2245 return NULL;
2246 }
2247 ++dfa->nbackref;
2248 dfa->has_mb_node = 1;
2249 break;
2250 case OP_OPEN_DUP_NUM:
2251 if (syntax & RE_CONTEXT_INVALID_DUP)
2252 {
2253 *err = REG_BADRPT;
2254 return NULL;
2255 }
2256 /* FALLTHROUGH */
2257 case OP_DUP_ASTERISK:
2258 case OP_DUP_PLUS:
2259 case OP_DUP_QUESTION:
2260 if (syntax & RE_CONTEXT_INVALID_OPS)
2261 {
2262 *err = REG_BADRPT;
2263 return NULL;
2264 }
2265 else if (syntax & RE_CONTEXT_INDEP_OPS)
2266 {
2267 fetch_token (token, regexp, syntax);
2268 return parse_expression (regexp, preg, token, syntax, nest, err);
2269 }
2270 /* else fall through */
2271 case OP_CLOSE_SUBEXP:
2272 if ((token->type == OP_CLOSE_SUBEXP) &&
2273 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2274 {
2275 *err = REG_ERPAREN;
2276 return NULL;
2277 }
2278 /* else fall through */
2279 case OP_CLOSE_DUP_NUM:
2280 /* We treat it as a normal character. */
2281
2282 /* Then we can these characters as normal characters. */
2283 token->type = CHARACTER;
2284 /* mb_partial and word_char bits should be initialized already
2285 by peek_token. */
2286 tree = create_token_tree (dfa, NULL, NULL, token);
2287 if (BE (tree == NULL, 0))
2288 {
2289 *err = REG_ESPACE;
2290 return NULL;
2291 }
2292 break;
2293 case ANCHOR:
2294 if ((token->opr.ctx_type
2295 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2296 && dfa->word_ops_used == 0)
2297 init_word_char (dfa);
2298 if (token->opr.ctx_type == WORD_DELIM
2299 || token->opr.ctx_type == NOT_WORD_DELIM)
2300 {
2301 bin_tree_t *tree_first, *tree_last;
2302 if (token->opr.ctx_type == WORD_DELIM)
2303 {
2304 token->opr.ctx_type = WORD_FIRST;
2305 tree_first = create_token_tree (dfa, NULL, NULL, token);
2306 token->opr.ctx_type = WORD_LAST;
2307 }
2308 else
2309 {
2310 token->opr.ctx_type = INSIDE_WORD;
2311 tree_first = create_token_tree (dfa, NULL, NULL, token);
2312 token->opr.ctx_type = INSIDE_NOTWORD;
2313 }
2314 tree_last = create_token_tree (dfa, NULL, NULL, token);
2315 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2316 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2317 {
2318 *err = REG_ESPACE;
2319 return NULL;
2320 }
2321 }
2322 else
2323 {
2324 tree = create_token_tree (dfa, NULL, NULL, token);
2325 if (BE (tree == NULL, 0))
2326 {
2327 *err = REG_ESPACE;
2328 return NULL;
2329 }
2330 }
2331 /* We must return here, since ANCHORs can't be followed
2332 by repetition operators.
2333 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2334 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2335 fetch_token (token, regexp, syntax);
2336 return tree;
2337 case OP_PERIOD:
2338 tree = create_token_tree (dfa, NULL, NULL, token);
2339 if (BE (tree == NULL, 0))
2340 {
2341 *err = REG_ESPACE;
2342 return NULL;
2343 }
2344 if (dfa->mb_cur_max > 1)
2345 dfa->has_mb_node = 1;
2346 break;
2347 case OP_WORD:
2348 case OP_NOTWORD:
2349 tree = build_charclass_op (dfa, regexp->trans,
2350 (const unsigned char *) "alnum",
2351 (const unsigned char *) "_",
2352 token->type == OP_NOTWORD, err);
2353 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2354 return NULL;
2355 break;
2356 case OP_SPACE:
2357 case OP_NOTSPACE:
2358 tree = build_charclass_op (dfa, regexp->trans,
2359 (const unsigned char *) "space",
2360 (const unsigned char *) "",
2361 token->type == OP_NOTSPACE, err);
2362 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2363 return NULL;
2364 break;
2365 case OP_ALT:
2366 case END_OF_RE:
2367 return NULL;
2368 case BACK_SLASH:
2369 *err = REG_EESCAPE;
2370 return NULL;
2371 default:
2372 /* Must not happen? */
2373 #ifdef DEBUG
2374 assert (0);
2375 #endif
2376 return NULL;
2377 }
2378 fetch_token (token, regexp, syntax);
2379
2380 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2381 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2382 {
2383 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2384 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385 return NULL;
2386 /* In BRE consecutive duplications are not allowed. */
2387 if ((syntax & RE_CONTEXT_INVALID_DUP)
2388 && (token->type == OP_DUP_ASTERISK
2389 || token->type == OP_OPEN_DUP_NUM))
2390 {
2391 *err = REG_BADRPT;
2392 return NULL;
2393 }
2394 }
2395
2396 return tree;
2397 }
2398
2399 /* This function build the following tree, from regular expression
2400 (<reg_exp>):
2401 SUBEXP
2402 |
2403 <reg_exp>
2404 */
2405
2406 static bin_tree_t *
2407 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2408 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2409 {
2410 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2411 bin_tree_t *tree;
2412 size_t cur_nsub;
2413 cur_nsub = preg->re_nsub++;
2414
2415 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2416
2417 /* The subexpression may be a null string. */
2418 if (token->type == OP_CLOSE_SUBEXP)
2419 tree = NULL;
2420 else
2421 {
2422 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2423 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2424 {
2425 if (tree != NULL)
2426 postorder (tree, free_tree, NULL);
2427 *err = REG_EPAREN;
2428 }
2429 if (BE (*err != REG_NOERROR, 0))
2430 return NULL;
2431 }
2432
2433 if (cur_nsub <= '9' - '1')
2434 dfa->completed_bkref_map |= 1 << cur_nsub;
2435
2436 tree = create_tree (dfa, tree, NULL, SUBEXP);
2437 if (BE (tree == NULL, 0))
2438 {
2439 *err = REG_ESPACE;
2440 return NULL;
2441 }
2442 tree->token.opr.idx = cur_nsub;
2443 return tree;
2444 }
2445
2446 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2447
2448 static bin_tree_t *
2449 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2450 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2451 {
2452 bin_tree_t *tree = NULL, *old_tree = NULL;
2453 int i, start, end, start_idx = re_string_cur_idx (regexp);
2454 re_token_t start_token = *token;
2455
2456 if (token->type == OP_OPEN_DUP_NUM)
2457 {
2458 end = 0;
2459 start = fetch_number (regexp, token, syntax);
2460 if (start == -1)
2461 {
2462 if (token->type == CHARACTER && token->opr.c == ',')
2463 start = 0; /* We treat "{,m}" as "{0,m}". */
2464 else
2465 {
2466 *err = REG_BADBR; /* <re>{} is invalid. */
2467 return NULL;
2468 }
2469 }
2470 if (BE (start != -2, 1))
2471 {
2472 /* We treat "{n}" as "{n,n}". */
2473 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2474 : ((token->type == CHARACTER && token->opr.c == ',')
2475 ? fetch_number (regexp, token, syntax) : -2));
2476 }
2477 if (BE (start == -2 || end == -2, 0))
2478 {
2479 /* Invalid sequence. */
2480 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481 {
2482 if (token->type == END_OF_RE)
2483 *err = REG_EBRACE;
2484 else
2485 *err = REG_BADBR;
2486
2487 return NULL;
2488 }
2489
2490 /* If the syntax bit is set, rollback. */
2491 re_string_set_index (regexp, start_idx);
2492 *token = start_token;
2493 token->type = CHARACTER;
2494 /* mb_partial and word_char bits should be already initialized by
2495 peek_token. */
2496 return elem;
2497 }
2498
2499 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2500 {
2501 /* First number greater than second. */
2502 *err = REG_BADBR;
2503 return NULL;
2504 }
2505 }
2506 else
2507 {
2508 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2509 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2510 }
2511
2512 fetch_token (token, regexp, syntax);
2513
2514 if (BE (elem == NULL, 0))
2515 return NULL;
2516 if (BE (start == 0 && end == 0, 0))
2517 {
2518 postorder (elem, free_tree, NULL);
2519 return NULL;
2520 }
2521
2522 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2523 if (BE (start > 0, 0))
2524 {
2525 tree = elem;
2526 for (i = 2; i <= start; ++i)
2527 {
2528 elem = duplicate_tree (elem, dfa);
2529 tree = create_tree (dfa, tree, elem, CONCAT);
2530 if (BE (elem == NULL || tree == NULL, 0))
2531 goto parse_dup_op_espace;
2532 }
2533
2534 if (start == end)
2535 return tree;
2536
2537 /* Duplicate ELEM before it is marked optional. */
2538 elem = duplicate_tree (elem, dfa);
2539 old_tree = tree;
2540 }
2541 else
2542 old_tree = NULL;
2543
2544 if (elem->token.type == SUBEXP)
2545 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2546
2547 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2548 if (BE (tree == NULL, 0))
2549 goto parse_dup_op_espace;
2550
2551 /* This loop is actually executed only when end != -1,
2552 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2553 already created the start+1-th copy. */
2554 for (i = start + 2; i <= end; ++i)
2555 {
2556 elem = duplicate_tree (elem, dfa);
2557 tree = create_tree (dfa, tree, elem, CONCAT);
2558 if (BE (elem == NULL || tree == NULL, 0))
2559 goto parse_dup_op_espace;
2560
2561 tree = create_tree (dfa, tree, NULL, OP_ALT);
2562 if (BE (tree == NULL, 0))
2563 goto parse_dup_op_espace;
2564 }
2565
2566 if (old_tree)
2567 tree = create_tree (dfa, old_tree, tree, CONCAT);
2568
2569 return tree;
2570
2571 parse_dup_op_espace:
2572 *err = REG_ESPACE;
2573 return NULL;
2574 }
2575
2576 /* Size of the names for collating symbol/equivalence_class/character_class.
2577 I'm not sure, but maybe enough. */
2578 #define BRACKET_NAME_BUF_SIZE 32
2579
2580 #ifndef _LIBC
2581 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2582 Build the range expression which starts from START_ELEM, and ends
2583 at END_ELEM. The result are written to MBCSET and SBCSET.
2584 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2585 mbcset->range_ends, is a pointer argument sinse we may
2586 update it. */
2587
2588 static reg_errcode_t
2589 internal_function
2590 # ifdef RE_ENABLE_I18N
2591 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2592 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2593 # else /* not RE_ENABLE_I18N */
2594 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2595 bracket_elem_t *end_elem)
2596 # endif /* not RE_ENABLE_I18N */
2597 {
2598 unsigned int start_ch, end_ch;
2599 /* Equivalence Classes and Character Classes can't be a range start/end. */
2600 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2601 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2602 0))
2603 return REG_ERANGE;
2604
2605 /* We can handle no multi character collating elements without libc
2606 support. */
2607 if (BE ((start_elem->type == COLL_SYM
2608 && strlen ((char *) start_elem->opr.name) > 1)
2609 || (end_elem->type == COLL_SYM
2610 && strlen ((char *) end_elem->opr.name) > 1), 0))
2611 return REG_ECOLLATE;
2612
2613 # ifdef RE_ENABLE_I18N
2614 {
2615 wchar_t wc;
2616 wint_t start_wc;
2617 wint_t end_wc;
2618 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2619
2620 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2621 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2622 : 0));
2623 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2624 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2625 : 0));
2626 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2627 ? __btowc (start_ch) : start_elem->opr.wch);
2628 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2629 ? __btowc (end_ch) : end_elem->opr.wch);
2630 if (start_wc == WEOF || end_wc == WEOF)
2631 return REG_ECOLLATE;
2632 cmp_buf[0] = start_wc;
2633 cmp_buf[4] = end_wc;
2634 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2635 return REG_ERANGE;
2636
2637 /* Got valid collation sequence values, add them as a new entry.
2638 However, for !_LIBC we have no collation elements: if the
2639 character set is single byte, the single byte character set
2640 that we build below suffices. parse_bracket_exp passes
2641 no MBCSET if dfa->mb_cur_max == 1. */
2642 if (mbcset)
2643 {
2644 /* Check the space of the arrays. */
2645 if (BE (*range_alloc == mbcset->nranges, 0))
2646 {
2647 /* There is not enough space, need realloc. */
2648 wchar_t *new_array_start, *new_array_end;
2649 int new_nranges;
2650
2651 /* +1 in case of mbcset->nranges is 0. */
2652 new_nranges = 2 * mbcset->nranges + 1;
2653 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2654 are NULL if *range_alloc == 0. */
2655 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2656 new_nranges);
2657 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2658 new_nranges);
2659
2660 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2661 return REG_ESPACE;
2662
2663 mbcset->range_starts = new_array_start;
2664 mbcset->range_ends = new_array_end;
2665 *range_alloc = new_nranges;
2666 }
2667
2668 mbcset->range_starts[mbcset->nranges] = start_wc;
2669 mbcset->range_ends[mbcset->nranges++] = end_wc;
2670 }
2671
2672 /* Build the table for single byte characters. */
2673 for (wc = 0; wc < SBC_MAX; ++wc)
2674 {
2675 cmp_buf[2] = wc;
2676 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2677 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2678 bitset_set (sbcset, wc);
2679 }
2680 }
2681 # else /* not RE_ENABLE_I18N */
2682 {
2683 unsigned int ch;
2684 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2685 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2686 : 0));
2687 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2688 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2689 : 0));
2690 if (start_ch > end_ch)
2691 return REG_ERANGE;
2692 /* Build the table for single byte characters. */
2693 for (ch = 0; ch < SBC_MAX; ++ch)
2694 if (start_ch <= ch && ch <= end_ch)
2695 bitset_set (sbcset, ch);
2696 }
2697 # endif /* not RE_ENABLE_I18N */
2698 return REG_NOERROR;
2699 }
2700 #endif /* not _LIBC */
2701
2702 #ifndef _LIBC
2703 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2704 Build the collating element which is represented by NAME.
2705 The result are written to MBCSET and SBCSET.
2706 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2707 pointer argument since we may update it. */
2708
2709 static reg_errcode_t
2710 internal_function
2711 # ifdef RE_ENABLE_I18N
2712 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2713 int *coll_sym_alloc, const unsigned char *name)
2714 # else /* not RE_ENABLE_I18N */
2715 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2716 # endif /* not RE_ENABLE_I18N */
2717 {
2718 size_t name_len = strlen ((const char *) name);
2719 if (BE (name_len != 1, 0))
2720 return REG_ECOLLATE;
2721 else
2722 {
2723 bitset_set (sbcset, name[0]);
2724 return REG_NOERROR;
2725 }
2726 }
2727 #endif /* not _LIBC */
2728
2729 /* This function parse bracket expression like "[abc]", "[a-c]",
2730 "[[.a-a.]]" etc. */
2731
2732 static bin_tree_t *
2733 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2734 reg_syntax_t syntax, reg_errcode_t *err)
2735 {
2736 #ifdef _LIBC
2737 const unsigned char *collseqmb;
2738 const char *collseqwc;
2739 uint32_t nrules;
2740 int32_t table_size;
2741 const int32_t *symb_table;
2742 const unsigned char *extra;
2743
2744 /* Local function for parse_bracket_exp used in _LIBC environement.
2745 Seek the collating symbol entry correspondings to NAME.
2746 Return the index of the symbol in the SYMB_TABLE. */
2747
2748 auto inline int32_t
2749 __attribute ((always_inline))
2750 seek_collating_symbol_entry (name, name_len)
2751 const unsigned char *name;
2752 size_t name_len;
2753 {
2754 int32_t hash = elem_hash ((const char *) name, name_len);
2755 int32_t elem = hash % table_size;
2756 if (symb_table[2 * elem] != 0)
2757 {
2758 int32_t second = hash % (table_size - 2) + 1;
2759
2760 do
2761 {
2762 /* First compare the hashing value. */
2763 if (symb_table[2 * elem] == hash
2764 /* Compare the length of the name. */
2765 && name_len == extra[symb_table[2 * elem + 1]]
2766 /* Compare the name. */
2767 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2768 name_len) == 0)
2769 {
2770 /* Yep, this is the entry. */
2771 break;
2772 }
2773
2774 /* Next entry. */
2775 elem += second;
2776 }
2777 while (symb_table[2 * elem] != 0);
2778 }
2779 return elem;
2780 }
2781
2782 /* Local function for parse_bracket_exp used in _LIBC environment.
2783 Look up the collation sequence value of BR_ELEM.
2784 Return the value if succeeded, UINT_MAX otherwise. */
2785
2786 auto inline unsigned int
2787 __attribute ((always_inline))
2788 lookup_collation_sequence_value (br_elem)
2789 bracket_elem_t *br_elem;
2790 {
2791 if (br_elem->type == SB_CHAR)
2792 {
2793 /*
2794 if (MB_CUR_MAX == 1)
2795 */
2796 if (nrules == 0)
2797 return collseqmb[br_elem->opr.ch];
2798 else
2799 {
2800 wint_t wc = __btowc (br_elem->opr.ch);
2801 return __collseq_table_lookup (collseqwc, wc);
2802 }
2803 }
2804 else if (br_elem->type == MB_CHAR)
2805 {
2806 if (nrules != 0)
2807 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2808 }
2809 else if (br_elem->type == COLL_SYM)
2810 {
2811 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2812 if (nrules != 0)
2813 {
2814 int32_t elem, idx;
2815 elem = seek_collating_symbol_entry (br_elem->opr.name,
2816 sym_name_len);
2817 if (symb_table[2 * elem] != 0)
2818 {
2819 /* We found the entry. */
2820 idx = symb_table[2 * elem + 1];
2821 /* Skip the name of collating element name. */
2822 idx += 1 + extra[idx];
2823 /* Skip the byte sequence of the collating element. */
2824 idx += 1 + extra[idx];
2825 /* Adjust for the alignment. */
2826 idx = (idx + 3) & ~3;
2827 /* Skip the multibyte collation sequence value. */
2828 idx += sizeof (unsigned int);
2829 /* Skip the wide char sequence of the collating element. */
2830 idx += sizeof (unsigned int) *
2831 (1 + *(unsigned int *) (extra + idx));
2832 /* Return the collation sequence value. */
2833 return *(unsigned int *) (extra + idx);
2834 }
2835 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2836 {
2837 /* No valid character. Match it as a single byte
2838 character. */
2839 return collseqmb[br_elem->opr.name[0]];
2840 }
2841 }
2842 else if (sym_name_len == 1)
2843 return collseqmb[br_elem->opr.name[0]];
2844 }
2845 return UINT_MAX;
2846 }
2847
2848 /* Local function for parse_bracket_exp used in _LIBC environement.
2849 Build the range expression which starts from START_ELEM, and ends
2850 at END_ELEM. The result are written to MBCSET and SBCSET.
2851 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2852 mbcset->range_ends, is a pointer argument sinse we may
2853 update it. */
2854
2855 auto inline reg_errcode_t
2856 __attribute ((always_inline))
2857 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2858 re_charset_t *mbcset;
2859 int *range_alloc;
2860 bitset_t sbcset;
2861 bracket_elem_t *start_elem, *end_elem;
2862 {
2863 unsigned int ch;
2864 uint32_t start_collseq;
2865 uint32_t end_collseq;
2866
2867 /* Equivalence Classes and Character Classes can't be a range
2868 start/end. */
2869 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2870 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2871 0))
2872 return REG_ERANGE;
2873
2874 start_collseq = lookup_collation_sequence_value (start_elem);
2875 end_collseq = lookup_collation_sequence_value (end_elem);
2876 /* Check start/end collation sequence values. */
2877 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2878 return REG_ECOLLATE;
2879 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2880 return REG_ERANGE;
2881
2882 /* Got valid collation sequence values, add them as a new entry.
2883 However, if we have no collation elements, and the character set
2884 is single byte, the single byte character set that we
2885 build below suffices. */
2886 if (nrules > 0 || dfa->mb_cur_max > 1)
2887 {
2888 /* Check the space of the arrays. */
2889 if (BE (*range_alloc == mbcset->nranges, 0))
2890 {
2891 /* There is not enough space, need realloc. */
2892 uint32_t *new_array_start;
2893 uint32_t *new_array_end;
2894 int new_nranges;
2895
2896 /* +1 in case of mbcset->nranges is 0. */
2897 new_nranges = 2 * mbcset->nranges + 1;
2898 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2899 new_nranges);
2900 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2901 new_nranges);
2902
2903 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2904 return REG_ESPACE;
2905
2906 mbcset->range_starts = new_array_start;
2907 mbcset->range_ends = new_array_end;
2908 *range_alloc = new_nranges;
2909 }
2910
2911 mbcset->range_starts[mbcset->nranges] = start_collseq;
2912 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2913 }
2914
2915 /* Build the table for single byte characters. */
2916 for (ch = 0; ch < SBC_MAX; ch++)
2917 {
2918 uint32_t ch_collseq;
2919 /*
2920 if (MB_CUR_MAX == 1)
2921 */
2922 if (nrules == 0)
2923 ch_collseq = collseqmb[ch];
2924 else
2925 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2926 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2927 bitset_set (sbcset, ch);
2928 }
2929 return REG_NOERROR;
2930 }
2931
2932 /* Local function for parse_bracket_exp used in _LIBC environement.
2933 Build the collating element which is represented by NAME.
2934 The result are written to MBCSET and SBCSET.
2935 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2936 pointer argument sinse we may update it. */
2937
2938 auto inline reg_errcode_t
2939 __attribute ((always_inline))
2940 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2941 re_charset_t *mbcset;
2942 int *coll_sym_alloc;
2943 bitset_t sbcset;
2944 const unsigned char *name;
2945 {
2946 int32_t elem, idx;
2947 size_t name_len = strlen ((const char *) name);
2948 if (nrules != 0)
2949 {
2950 elem = seek_collating_symbol_entry (name, name_len);
2951 if (symb_table[2 * elem] != 0)
2952 {
2953 /* We found the entry. */
2954 idx = symb_table[2 * elem + 1];
2955 /* Skip the name of collating element name. */
2956 idx += 1 + extra[idx];
2957 }
2958 else if (symb_table[2 * elem] == 0 && name_len == 1)
2959 {
2960 /* No valid character, treat it as a normal
2961 character. */
2962 bitset_set (sbcset, name[0]);
2963 return REG_NOERROR;
2964 }
2965 else
2966 return REG_ECOLLATE;
2967
2968 /* Got valid collation sequence, add it as a new entry. */
2969 /* Check the space of the arrays. */
2970 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2971 {
2972 /* Not enough, realloc it. */
2973 /* +1 in case of mbcset->ncoll_syms is 0. */
2974 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2975 /* Use realloc since mbcset->coll_syms is NULL
2976 if *alloc == 0. */
2977 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2978 new_coll_sym_alloc);
2979 if (BE (new_coll_syms == NULL, 0))
2980 return REG_ESPACE;
2981 mbcset->coll_syms = new_coll_syms;
2982 *coll_sym_alloc = new_coll_sym_alloc;
2983 }
2984 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2985 return REG_NOERROR;
2986 }
2987 else
2988 {
2989 if (BE (name_len != 1, 0))
2990 return REG_ECOLLATE;
2991 else
2992 {
2993 bitset_set (sbcset, name[0]);
2994 return REG_NOERROR;
2995 }
2996 }
2997 }
2998 #endif
2999
3000 re_token_t br_token;
3001 re_bitset_ptr_t sbcset;
3002 #ifdef RE_ENABLE_I18N
3003 re_charset_t *mbcset;
3004 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3005 int equiv_class_alloc = 0, char_class_alloc = 0;
3006 #endif /* not RE_ENABLE_I18N */
3007 int non_match = 0;
3008 bin_tree_t *work_tree;
3009 int token_len;
3010 int first_round = 1;
3011 #ifdef _LIBC
3012 collseqmb = (const unsigned char *)
3013 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3014 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3015 if (nrules)
3016 {
3017 /*
3018 if (MB_CUR_MAX > 1)
3019 */
3020 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3021 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3022 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3023 _NL_COLLATE_SYMB_TABLEMB);
3024 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3025 _NL_COLLATE_SYMB_EXTRAMB);
3026 }
3027 #endif
3028 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3029 #ifdef RE_ENABLE_I18N
3030 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3031 #endif /* RE_ENABLE_I18N */
3032 #ifdef RE_ENABLE_I18N
3033 if (BE (sbcset == NULL || mbcset == NULL, 0))
3034 #else
3035 if (BE (sbcset == NULL, 0))
3036 #endif /* RE_ENABLE_I18N */
3037 {
3038 re_free (sbcset);
3039 #ifdef RE_ENABLE_I18N
3040 re_free (mbcset);
3041 #endif
3042 *err = REG_ESPACE;
3043 return NULL;
3044 }
3045
3046 token_len = peek_token_bracket (token, regexp, syntax);
3047 if (BE (token->type == END_OF_RE, 0))
3048 {
3049 *err = REG_BADPAT;
3050 goto parse_bracket_exp_free_return;
3051 }
3052 if (token->type == OP_NON_MATCH_LIST)
3053 {
3054 #ifdef RE_ENABLE_I18N
3055 mbcset->non_match = 1;
3056 #endif /* not RE_ENABLE_I18N */
3057 non_match = 1;
3058 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3059 bitset_set (sbcset, '\n');
3060 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3061 token_len = peek_token_bracket (token, regexp, syntax);
3062 if (BE (token->type == END_OF_RE, 0))
3063 {
3064 *err = REG_BADPAT;
3065 goto parse_bracket_exp_free_return;
3066 }
3067 }
3068
3069 /* We treat the first ']' as a normal character. */
3070 if (token->type == OP_CLOSE_BRACKET)
3071 token->type = CHARACTER;
3072
3073 while (1)
3074 {
3075 bracket_elem_t start_elem, end_elem;
3076 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3077 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3078 reg_errcode_t ret;
3079 int token_len2 = 0, is_range_exp = 0;
3080 re_token_t token2;
3081
3082 start_elem.opr.name = start_name_buf;
3083 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3084 syntax, first_round);
3085 if (BE (ret != REG_NOERROR, 0))
3086 {
3087 *err = ret;
3088 goto parse_bracket_exp_free_return;
3089 }
3090 first_round = 0;
3091
3092 /* Get information about the next token. We need it in any case. */
3093 token_len = peek_token_bracket (token, regexp, syntax);
3094
3095 /* Do not check for ranges if we know they are not allowed. */
3096 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3097 {
3098 if (BE (token->type == END_OF_RE, 0))
3099 {
3100 *err = REG_EBRACK;
3101 goto parse_bracket_exp_free_return;
3102 }
3103 if (token->type == OP_CHARSET_RANGE)
3104 {
3105 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3106 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3107 if (BE (token2.type == END_OF_RE, 0))
3108 {
3109 *err = REG_EBRACK;
3110 goto parse_bracket_exp_free_return;
3111 }
3112 if (token2.type == OP_CLOSE_BRACKET)
3113 {
3114 /* We treat the last '-' as a normal character. */
3115 re_string_skip_bytes (regexp, -token_len);
3116 token->type = CHARACTER;
3117 }
3118 else
3119 is_range_exp = 1;
3120 }
3121 }
3122
3123 if (is_range_exp == 1)
3124 {
3125 end_elem.opr.name = end_name_buf;
3126 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3127 dfa, syntax, 1);
3128 if (BE (ret != REG_NOERROR, 0))
3129 {
3130 *err = ret;
3131 goto parse_bracket_exp_free_return;
3132 }
3133
3134 token_len = peek_token_bracket (token, regexp, syntax);
3135
3136 #ifdef _LIBC
3137 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3138 &start_elem, &end_elem);
3139 #else
3140 # ifdef RE_ENABLE_I18N
3141 *err = build_range_exp (sbcset,
3142 dfa->mb_cur_max > 1 ? mbcset : NULL,
3143 &range_alloc, &start_elem, &end_elem);
3144 # else
3145 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3146 # endif
3147 #endif /* RE_ENABLE_I18N */
3148 if (BE (*err != REG_NOERROR, 0))
3149 goto parse_bracket_exp_free_return;
3150 }
3151 else
3152 {
3153 switch (start_elem.type)
3154 {
3155 case SB_CHAR:
3156 bitset_set (sbcset, start_elem.opr.ch);
3157 break;
3158 #ifdef RE_ENABLE_I18N
3159 case MB_CHAR:
3160 /* Check whether the array has enough space. */
3161 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3162 {
3163 wchar_t *new_mbchars;
3164 /* Not enough, realloc it. */
3165 /* +1 in case of mbcset->nmbchars is 0. */
3166 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3167 /* Use realloc since array is NULL if *alloc == 0. */
3168 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3169 mbchar_alloc);
3170 if (BE (new_mbchars == NULL, 0))
3171 goto parse_bracket_exp_espace;
3172 mbcset->mbchars = new_mbchars;
3173 }
3174 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3175 break;
3176 #endif /* RE_ENABLE_I18N */
3177 case EQUIV_CLASS:
3178 *err = build_equiv_class (sbcset,
3179 #ifdef RE_ENABLE_I18N
3180 mbcset, &equiv_class_alloc,
3181 #endif /* RE_ENABLE_I18N */
3182 start_elem.opr.name);
3183 if (BE (*err != REG_NOERROR, 0))
3184 goto parse_bracket_exp_free_return;
3185 break;
3186 case COLL_SYM:
3187 *err = build_collating_symbol (sbcset,
3188 #ifdef RE_ENABLE_I18N
3189 mbcset, &coll_sym_alloc,
3190 #endif /* RE_ENABLE_I18N */
3191 start_elem.opr.name);
3192 if (BE (*err != REG_NOERROR, 0))
3193 goto parse_bracket_exp_free_return;
3194 break;
3195 case CHAR_CLASS:
3196 *err = build_charclass (regexp->trans, sbcset,
3197 #ifdef RE_ENABLE_I18N
3198 mbcset, &char_class_alloc,
3199 #endif /* RE_ENABLE_I18N */
3200 start_elem.opr.name, syntax);
3201 if (BE (*err != REG_NOERROR, 0))
3202 goto parse_bracket_exp_free_return;
3203 break;
3204 default:
3205 assert (0);
3206 break;
3207 }
3208 }
3209 if (BE (token->type == END_OF_RE, 0))
3210 {
3211 *err = REG_EBRACK;
3212 goto parse_bracket_exp_free_return;
3213 }
3214 if (token->type == OP_CLOSE_BRACKET)
3215 break;
3216 }
3217
3218 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3219
3220 /* If it is non-matching list. */
3221 if (non_match)
3222 bitset_not (sbcset);
3223
3224 #ifdef RE_ENABLE_I18N
3225 /* Ensure only single byte characters are set. */
3226 if (dfa->mb_cur_max > 1)
3227 bitset_mask (sbcset, dfa->sb_char);
3228
3229 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3230 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3231 || mbcset->non_match)))
3232 {
3233 bin_tree_t *mbc_tree;
3234 int sbc_idx;
3235 /* Build a tree for complex bracket. */
3236 dfa->has_mb_node = 1;
3237 br_token.type = COMPLEX_BRACKET;
3238 br_token.opr.mbcset = mbcset;
3239 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3240 if (BE (mbc_tree == NULL, 0))
3241 goto parse_bracket_exp_espace;
3242 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3243 if (sbcset[sbc_idx])
3244 break;
3245 /* If there are no bits set in sbcset, there is no point
3246 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3247 if (sbc_idx < BITSET_WORDS)
3248 {
3249 /* Build a tree for simple bracket. */
3250 br_token.type = SIMPLE_BRACKET;
3251 br_token.opr.sbcset = sbcset;
3252 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3253 if (BE (work_tree == NULL, 0))
3254 goto parse_bracket_exp_espace;
3255
3256 /* Then join them by ALT node. */
3257 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3258 if (BE (work_tree == NULL, 0))
3259 goto parse_bracket_exp_espace;
3260 }
3261 else
3262 {
3263 re_free (sbcset);
3264 work_tree = mbc_tree;
3265 }
3266 }
3267 else
3268 #endif /* not RE_ENABLE_I18N */
3269 {
3270 #ifdef RE_ENABLE_I18N
3271 free_charset (mbcset);
3272 #endif
3273 /* Build a tree for simple bracket. */
3274 br_token.type = SIMPLE_BRACKET;
3275 br_token.opr.sbcset = sbcset;
3276 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277 if (BE (work_tree == NULL, 0))
3278 goto parse_bracket_exp_espace;
3279 }
3280 return work_tree;
3281
3282 parse_bracket_exp_espace:
3283 *err = REG_ESPACE;
3284 parse_bracket_exp_free_return:
3285 re_free (sbcset);
3286 #ifdef RE_ENABLE_I18N
3287 free_charset (mbcset);
3288 #endif /* RE_ENABLE_I18N */
3289 return NULL;
3290 }
3291
3292 /* Parse an element in the bracket expression. */
3293
3294 static reg_errcode_t
3295 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3296 re_token_t *token, int token_len, re_dfa_t *dfa,
3297 reg_syntax_t syntax, int accept_hyphen)
3298 {
3299 #ifdef RE_ENABLE_I18N
3300 int cur_char_size;
3301 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3302 if (cur_char_size > 1)
3303 {
3304 elem->type = MB_CHAR;
3305 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3306 re_string_skip_bytes (regexp, cur_char_size);
3307 return REG_NOERROR;
3308 }
3309 #endif /* RE_ENABLE_I18N */
3310 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3311 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3312 || token->type == OP_OPEN_EQUIV_CLASS)
3313 return parse_bracket_symbol (elem, regexp, token);
3314 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3315 {
3316 /* A '-' must only appear as anything but a range indicator before
3317 the closing bracket. Everything else is an error. */
3318 re_token_t token2;
3319 (void) peek_token_bracket (&token2, regexp, syntax);
3320 if (token2.type != OP_CLOSE_BRACKET)
3321 /* The actual error value is not standardized since this whole
3322 case is undefined. But ERANGE makes good sense. */
3323 return REG_ERANGE;
3324 }
3325 elem->type = SB_CHAR;
3326 elem->opr.ch = token->opr.c;
3327 return REG_NOERROR;
3328 }
3329
3330 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3331 such as [:<character_class>:], [.<collating_element>.], and
3332 [=<equivalent_class>=]. */
3333
3334 static reg_errcode_t
3335 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3336 re_token_t *token)
3337 {
3338 unsigned char ch, delim = token->opr.c;
3339 int i = 0;
3340 if (re_string_eoi(regexp))
3341 return REG_EBRACK;
3342 for (;; ++i)
3343 {
3344 if (i >= BRACKET_NAME_BUF_SIZE)
3345 return REG_EBRACK;
3346 if (token->type == OP_OPEN_CHAR_CLASS)
3347 ch = re_string_fetch_byte_case (regexp);
3348 else
3349 ch = re_string_fetch_byte (regexp);
3350 if (re_string_eoi(regexp))
3351 return REG_EBRACK;
3352 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3353 break;
3354 elem->opr.name[i] = ch;
3355 }
3356 re_string_skip_bytes (regexp, 1);
3357 elem->opr.name[i] = '\0';
3358 switch (token->type)
3359 {
3360 case OP_OPEN_COLL_ELEM:
3361 elem->type = COLL_SYM;
3362 break;
3363 case OP_OPEN_EQUIV_CLASS:
3364 elem->type = EQUIV_CLASS;
3365 break;
3366 case OP_OPEN_CHAR_CLASS:
3367 elem->type = CHAR_CLASS;
3368 break;
3369 default:
3370 break;
3371 }
3372 return REG_NOERROR;
3373 }
3374
3375 /* Helper function for parse_bracket_exp.
3376 Build the equivalence class which is represented by NAME.
3377 The result are written to MBCSET and SBCSET.
3378 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3379 is a pointer argument sinse we may update it. */
3380
3381 static reg_errcode_t
3382 #ifdef RE_ENABLE_I18N
3383 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3384 int *equiv_class_alloc, const unsigned char *name)
3385 #else /* not RE_ENABLE_I18N */
3386 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3387 #endif /* not RE_ENABLE_I18N */
3388 {
3389 #ifdef _LIBC
3390 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3391 if (nrules != 0)
3392 {
3393 const int32_t *table, *indirect;
3394 const unsigned char *weights, *extra, *cp;
3395 unsigned char char_buf[2];
3396 int32_t idx1, idx2;
3397 unsigned int ch;
3398 size_t len;
3399 /* This #include defines a local function! */
3400 # include <locale/weight.h>
3401 /* Calculate the index for equivalence class. */
3402 cp = name;
3403 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3404 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3405 _NL_COLLATE_WEIGHTMB);
3406 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3407 _NL_COLLATE_EXTRAMB);
3408 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3409 _NL_COLLATE_INDIRECTMB);
3410 idx1 = findidx (&cp);
3411 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3412 /* This isn't a valid character. */
3413 return REG_ECOLLATE;
3414
3415 /* Build single byte matcing table for this equivalence class. */
3416 char_buf[1] = (unsigned char) '\0';
3417 len = weights[idx1 & 0xffffff];
3418 for (ch = 0; ch < SBC_MAX; ++ch)
3419 {
3420 char_buf[0] = ch;
3421 cp = char_buf;
3422 idx2 = findidx (&cp);
3423 /*
3424 idx2 = table[ch];
3425 */
3426 if (idx2 == 0)
3427 /* This isn't a valid character. */
3428 continue;
3429 /* Compare only if the length matches and the collation rule
3430 index is the same. */
3431 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3432 {
3433 int cnt = 0;
3434
3435 while (cnt <= len &&
3436 weights[(idx1 & 0xffffff) + 1 + cnt]
3437 == weights[(idx2 & 0xffffff) + 1 + cnt])
3438 ++cnt;
3439
3440 if (cnt > len)
3441 bitset_set (sbcset, ch);
3442 }
3443 }
3444 /* Check whether the array has enough space. */
3445 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3446 {
3447 /* Not enough, realloc it. */
3448 /* +1 in case of mbcset->nequiv_classes is 0. */
3449 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3450 /* Use realloc since the array is NULL if *alloc == 0. */
3451 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3452 int32_t,
3453 new_equiv_class_alloc);
3454 if (BE (new_equiv_classes == NULL, 0))
3455 return REG_ESPACE;
3456 mbcset->equiv_classes = new_equiv_classes;
3457 *equiv_class_alloc = new_equiv_class_alloc;
3458 }
3459 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3460 }
3461 else
3462 #endif /* _LIBC */
3463 {
3464 if (BE (strlen ((const char *) name) != 1, 0))
3465 return REG_ECOLLATE;
3466 bitset_set (sbcset, *name);
3467 }
3468 return REG_NOERROR;
3469 }
3470
3471 /* Helper function for parse_bracket_exp.
3472 Build the character class which is represented by NAME.
3473 The result are written to MBCSET and SBCSET.
3474 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3475 is a pointer argument sinse we may update it. */
3476
3477 static reg_errcode_t
3478 #ifdef RE_ENABLE_I18N
3479 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3480 re_charset_t *mbcset, int *char_class_alloc,
3481 const unsigned char *class_name, reg_syntax_t syntax)
3482 #else /* not RE_ENABLE_I18N */
3483 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3484 const unsigned char *class_name, reg_syntax_t syntax)
3485 #endif /* not RE_ENABLE_I18N */
3486 {
3487 int i;
3488 const char *name = (const char *) class_name;
3489
3490 /* In case of REG_ICASE "upper" and "lower" match the both of
3491 upper and lower cases. */
3492 if ((syntax & RE_ICASE)
3493 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3494 name = "alpha";
3495
3496 #ifdef RE_ENABLE_I18N
3497 /* Check the space of the arrays. */
3498 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3499 {
3500 /* Not enough, realloc it. */
3501 /* +1 in case of mbcset->nchar_classes is 0. */
3502 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3503 /* Use realloc since array is NULL if *alloc == 0. */
3504 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3505 new_char_class_alloc);
3506 if (BE (new_char_classes == NULL, 0))
3507 return REG_ESPACE;
3508 mbcset->char_classes = new_char_classes;
3509 *char_class_alloc = new_char_class_alloc;
3510 }
3511 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3512 #endif /* RE_ENABLE_I18N */
3513
3514 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3515 do { \
3516 if (BE (trans != NULL, 0)) \
3517 { \
3518 for (i = 0; i < SBC_MAX; ++i) \
3519 if (ctype_func (i)) \
3520 bitset_set (sbcset, trans[i]); \
3521 } \
3522 else \
3523 { \
3524 for (i = 0; i < SBC_MAX; ++i) \
3525 if (ctype_func (i)) \
3526 bitset_set (sbcset, i); \
3527 } \
3528 } while (0)
3529
3530 if (strcmp (name, "alnum") == 0)
3531 BUILD_CHARCLASS_LOOP (isalnum);
3532 else if (strcmp (name, "cntrl") == 0)
3533 BUILD_CHARCLASS_LOOP (iscntrl);
3534 else if (strcmp (name, "lower") == 0)
3535 BUILD_CHARCLASS_LOOP (islower);
3536 else if (strcmp (name, "space") == 0)
3537 BUILD_CHARCLASS_LOOP (isspace);
3538 else if (strcmp (name, "alpha") == 0)
3539 BUILD_CHARCLASS_LOOP (isalpha);
3540 else if (strcmp (name, "digit") == 0)
3541 BUILD_CHARCLASS_LOOP (isdigit);
3542 else if (strcmp (name, "print") == 0)
3543 BUILD_CHARCLASS_LOOP (isprint);
3544 else if (strcmp (name, "upper") == 0)
3545 BUILD_CHARCLASS_LOOP (isupper);
3546 else if (strcmp (name, "blank") == 0)
3547 BUILD_CHARCLASS_LOOP (isblank);
3548 else if (strcmp (name, "graph") == 0)
3549 BUILD_CHARCLASS_LOOP (isgraph);
3550 else if (strcmp (name, "punct") == 0)
3551 BUILD_CHARCLASS_LOOP (ispunct);
3552 else if (strcmp (name, "xdigit") == 0)
3553 BUILD_CHARCLASS_LOOP (isxdigit);
3554 else
3555 return REG_ECTYPE;
3556
3557 return REG_NOERROR;
3558 }
3559
3560 static bin_tree_t *
3561 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3562 const unsigned char *class_name,
3563 const unsigned char *extra, int non_match,
3564 reg_errcode_t *err)
3565 {
3566 re_bitset_ptr_t sbcset;
3567 #ifdef RE_ENABLE_I18N
3568 re_charset_t *mbcset;
3569 int alloc = 0;
3570 #endif /* not RE_ENABLE_I18N */
3571 reg_errcode_t ret;
3572 re_token_t br_token;
3573 bin_tree_t *tree;
3574
3575 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3576 #ifdef RE_ENABLE_I18N
3577 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3578 #endif /* RE_ENABLE_I18N */
3579
3580 #ifdef RE_ENABLE_I18N
3581 if (BE (sbcset == NULL || mbcset == NULL, 0))
3582 #else /* not RE_ENABLE_I18N */
3583 if (BE (sbcset == NULL, 0))
3584 #endif /* not RE_ENABLE_I18N */
3585 {
3586 *err = REG_ESPACE;
3587 return NULL;
3588 }
3589
3590 if (non_match)
3591 {
3592 #ifdef RE_ENABLE_I18N
3593 mbcset->non_match = 1;
3594 #endif /* not RE_ENABLE_I18N */
3595 }
3596
3597 /* We don't care the syntax in this case. */
3598 ret = build_charclass (trans, sbcset,
3599 #ifdef RE_ENABLE_I18N
3600 mbcset, &alloc,
3601 #endif /* RE_ENABLE_I18N */
3602 class_name, 0);
3603
3604 if (BE (ret != REG_NOERROR, 0))
3605 {
3606 re_free (sbcset);
3607 #ifdef RE_ENABLE_I18N
3608 free_charset (mbcset);
3609 #endif /* RE_ENABLE_I18N */
3610 *err = ret;
3611 return NULL;
3612 }
3613 /* \w match '_' also. */
3614 for (; *extra; extra++)
3615 bitset_set (sbcset, *extra);
3616
3617 /* If it is non-matching list. */
3618 if (non_match)
3619 bitset_not (sbcset);
3620
3621 #ifdef RE_ENABLE_I18N
3622 /* Ensure only single byte characters are set. */
3623 if (dfa->mb_cur_max > 1)
3624 bitset_mask (sbcset, dfa->sb_char);
3625 #endif
3626
3627 /* Build a tree for simple bracket. */
3628 br_token.type = SIMPLE_BRACKET;
3629 br_token.opr.sbcset = sbcset;
3630 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3631 if (BE (tree == NULL, 0))
3632 goto build_word_op_espace;
3633
3634 #ifdef RE_ENABLE_I18N
3635 if (dfa->mb_cur_max > 1)
3636 {
3637 bin_tree_t *mbc_tree;
3638 /* Build a tree for complex bracket. */
3639 br_token.type = COMPLEX_BRACKET;
3640 br_token.opr.mbcset = mbcset;
3641 dfa->has_mb_node = 1;
3642 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3643 if (BE (mbc_tree == NULL, 0))
3644 goto build_word_op_espace;
3645 /* Then join them by ALT node. */
3646 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3647 if (BE (mbc_tree != NULL, 1))
3648 return tree;
3649 }
3650 else
3651 {
3652 free_charset (mbcset);
3653 return tree;
3654 }
3655 #else /* not RE_ENABLE_I18N */
3656 return tree;
3657 #endif /* not RE_ENABLE_I18N */
3658
3659 build_word_op_espace:
3660 re_free (sbcset);
3661 #ifdef RE_ENABLE_I18N
3662 free_charset (mbcset);
3663 #endif /* RE_ENABLE_I18N */
3664 *err = REG_ESPACE;
3665 return NULL;
3666 }
3667
3668 /* This is intended for the expressions like "a{1,3}".
3669 Fetch a number from `input', and return the number.
3670 Return -1, if the number field is empty like "{,1}".
3671 Return -2, If an error is occured. */
3672
3673 static int
3674 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3675 {
3676 int num = -1;
3677 unsigned char c;
3678 while (1)
3679 {
3680 fetch_token (token, input, syntax);
3681 c = token->opr.c;
3682 if (BE (token->type == END_OF_RE, 0))
3683 return -2;
3684 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3685 break;
3686 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3687 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3688 num = (num > RE_DUP_MAX) ? -2 : num;
3689 }
3690 return num;
3691 }
3692 \f
3693 #ifdef RE_ENABLE_I18N
3694 static void
3695 free_charset (re_charset_t *cset)
3696 {
3697 re_free (cset->mbchars);
3698 # ifdef _LIBC
3699 re_free (cset->coll_syms);
3700 re_free (cset->equiv_classes);
3701 re_free (cset->range_starts);
3702 re_free (cset->range_ends);
3703 # endif
3704 re_free (cset->char_classes);
3705 re_free (cset);
3706 }
3707 #endif /* RE_ENABLE_I18N */
3708 \f
3709 /* Functions for binary tree operation. */
3710
3711 /* Create a tree node. */
3712
3713 static bin_tree_t *
3714 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3715 re_token_type_t type)
3716 {
3717 re_token_t t;
3718 t.type = type;
3719 return create_token_tree (dfa, left, right, &t);
3720 }
3721
3722 static bin_tree_t *
3723 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3724 const re_token_t *token)
3725 {
3726 bin_tree_t *tree;
3727 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3728 {
3729 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3730
3731 if (storage == NULL)
3732 return NULL;
3733 storage->next = dfa->str_tree_storage;
3734 dfa->str_tree_storage = storage;
3735 dfa->str_tree_storage_idx = 0;
3736 }
3737 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3738
3739 tree->parent = NULL;
3740 tree->left = left;
3741 tree->right = right;
3742 tree->token = *token;
3743 tree->token.duplicated = 0;
3744 tree->token.opt_subexp = 0;
3745 tree->first = NULL;
3746 tree->next = NULL;
3747 tree->node_idx = -1;
3748
3749 if (left != NULL)
3750 left->parent = tree;
3751 if (right != NULL)
3752 right->parent = tree;
3753 return tree;
3754 }
3755
3756 /* Mark the tree SRC as an optional subexpression.
3757 To be called from preorder or postorder. */
3758
3759 static reg_errcode_t
3760 mark_opt_subexp (void *extra, bin_tree_t *node)
3761 {
3762 int idx = (int) (long) extra;
3763 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3764 node->token.opt_subexp = 1;
3765
3766 return REG_NOERROR;
3767 }
3768
3769 /* Free the allocated memory inside NODE. */
3770
3771 static void
3772 free_token (re_token_t *node)
3773 {
3774 #ifdef RE_ENABLE_I18N
3775 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3776 free_charset (node->opr.mbcset);
3777 else
3778 #endif /* RE_ENABLE_I18N */
3779 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3780 re_free (node->opr.sbcset);
3781 }
3782
3783 /* Worker function for tree walking. Free the allocated memory inside NODE
3784 and its children. */
3785
3786 static reg_errcode_t
3787 free_tree (void *extra, bin_tree_t *node)
3788 {
3789 free_token (&node->token);
3790 return REG_NOERROR;
3791 }
3792
3793
3794 /* Duplicate the node SRC, and return new node. This is a preorder
3795 visit similar to the one implemented by the generic visitor, but
3796 we need more infrastructure to maintain two parallel trees --- so,
3797 it's easier to duplicate. */
3798
3799 static bin_tree_t *
3800 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3801 {
3802 const bin_tree_t *node;
3803 bin_tree_t *dup_root;
3804 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3805
3806 for (node = root; ; )
3807 {
3808 /* Create a new tree and link it back to the current parent. */
3809 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3810 if (*p_new == NULL)
3811 return NULL;
3812 (*p_new)->parent = dup_node;
3813 (*p_new)->token.duplicated = 1;
3814 dup_node = *p_new;
3815
3816 /* Go to the left node, or up and to the right. */
3817 if (node->left)
3818 {
3819 node = node->left;
3820 p_new = &dup_node->left;
3821 }
3822 else
3823 {
3824 const bin_tree_t *prev = NULL;
3825 while (node->right == prev || node->right == NULL)
3826 {
3827 prev = node;
3828 node = node->parent;
3829 dup_node = dup_node->parent;
3830 if (!node)
3831 return dup_root;
3832 }
3833 node = node->right;
3834 p_new = &dup_node->right;
3835 }
3836 }
3837 }