]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
regcomp.c: do not ignore memory allocation failure
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 \f
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129 const char __re_error_msgid[] attribute_hidden =
130 {
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203 \f
204 /* Entry points for GNU code. */
205
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
209
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
212
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218 {
219 reg_errcode_t ret;
220
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225
226 /* Match anchors at newline. */
227 bufp->newline_anchor = 1;
228
229 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230
231 if (!ret)
232 return NULL;
233 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 }
235 #ifdef _LIBC
236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
238
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options;
245
246
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
250
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
253
254 reg_syntax_t
255 re_set_syntax (syntax)
256 reg_syntax_t syntax;
257 {
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262 }
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266
267 int
268 re_compile_fastmap (bufp)
269 struct re_pattern_buffer *bufp;
270 {
271 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
272 char *fastmap = bufp->fastmap;
273
274 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
275 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
276 if (dfa->init_state != dfa->init_state_word)
277 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
278 if (dfa->init_state != dfa->init_state_nl)
279 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
280 if (dfa->init_state != dfa->init_state_begbuf)
281 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
282 bufp->fastmap_accurate = 1;
283 return 0;
284 }
285 #ifdef _LIBC
286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
287 #endif
288
289 static inline void
290 __attribute ((always_inline))
291 re_set_fastmap (char *fastmap, int icase, int ch)
292 {
293 fastmap[ch] = 1;
294 if (icase)
295 fastmap[tolower (ch)] = 1;
296 }
297
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
300
301 static void
302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
303 char *fastmap)
304 {
305 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306 int node_cnt;
307 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
308 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 {
310 int node = init_state->nodes.elems[node_cnt];
311 re_token_type_t type = dfa->nodes[node].type;
312
313 if (type == CHARACTER)
314 {
315 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 {
319 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
320 wchar_t wc;
321 mbstate_t state;
322
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, 0, buf[0]);
335 }
336 #endif
337 }
338 else if (type == SIMPLE_BRACKET)
339 {
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 {
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
348 }
349 }
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
352 {
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 int i;
355
356 # ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
365 {
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
371 }
372 # endif /* _LIBC */
373
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 || cset->nequiv_classes
382 # endif /* _LIBC */
383 ))
384 {
385 unsigned char c = 0;
386 do
387 {
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
392 }
393 while (++c != 0);
394 }
395
396 else
397 {
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
400 {
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 {
408 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 }
412 }
413 }
414 }
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
421 {
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
426 }
427 }
428 }
429 \f
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
432
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
435
436 `buffer' to the compiled pattern;
437 `used' to the length of the compiled pattern;
438 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 `fastmap' to an allocated space for the fastmap;
443 `fastmap_accurate' to zero;
444 `re_nsub' to the number of subexpressions in PATTERN.
445
446 PATTERN is the address of the pattern string.
447
448 CFLAGS is a series of bits which affect compilation.
449
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
452
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
455
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
458
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
462
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
465
466 int
467 regcomp (preg, pattern, cflags)
468 regex_t *__restrict preg;
469 const char *__restrict pattern;
470 int cflags;
471 {
472 reg_errcode_t ret;
473 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
474 : RE_SYNTAX_POSIX_BASIC);
475
476 preg->buffer = NULL;
477 preg->allocated = 0;
478 preg->used = 0;
479
480 /* Try to allocate space for the fastmap. */
481 preg->fastmap = re_malloc (char, SBC_MAX);
482 if (BE (preg->fastmap == NULL, 0))
483 return REG_ESPACE;
484
485 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
486
487 /* If REG_NEWLINE is set, newlines are treated differently. */
488 if (cflags & REG_NEWLINE)
489 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
490 syntax &= ~RE_DOT_NEWLINE;
491 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
492 /* It also changes the matching behavior. */
493 preg->newline_anchor = 1;
494 }
495 else
496 preg->newline_anchor = 0;
497 preg->no_sub = !!(cflags & REG_NOSUB);
498 preg->translate = NULL;
499
500 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
501
502 /* POSIX doesn't distinguish between an unmatched open-group and an
503 unmatched close-group: both are REG_EPAREN. */
504 if (ret == REG_ERPAREN)
505 ret = REG_EPAREN;
506
507 /* We have already checked preg->fastmap != NULL. */
508 if (BE (ret == REG_NOERROR, 1))
509 /* Compute the fastmap now, since regexec cannot modify the pattern
510 buffer. This function never fails in this implementation. */
511 (void) re_compile_fastmap (preg);
512 else
513 {
514 /* Some error occurred while compiling the expression. */
515 re_free (preg->fastmap);
516 preg->fastmap = NULL;
517 }
518
519 return (int) ret;
520 }
521 #ifdef _LIBC
522 weak_alias (__regcomp, regcomp)
523 #endif
524
525 /* Returns a message corresponding to an error code, ERRCODE, returned
526 from either regcomp or regexec. We don't use PREG here. */
527
528 size_t
529 regerror (errcode, preg, errbuf, errbuf_size)
530 int errcode;
531 const regex_t *__restrict preg;
532 char *__restrict errbuf;
533 size_t errbuf_size;
534 {
535 const char *msg;
536 size_t msg_size;
537
538 if (BE (errcode < 0
539 || errcode >= (int) (sizeof (__re_error_msgid_idx)
540 / sizeof (__re_error_msgid_idx[0])), 0))
541 /* Only error codes returned by the rest of the code should be passed
542 to this routine. If we are given anything else, or if other regex
543 code generates an invalid error code, then the program has a bug.
544 Dump core so we can fix it. */
545 abort ();
546
547 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
548
549 msg_size = strlen (msg) + 1; /* Includes the null. */
550
551 if (BE (errbuf_size != 0, 1))
552 {
553 if (BE (msg_size > errbuf_size, 0))
554 {
555 #if defined HAVE_MEMPCPY || defined _LIBC
556 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
557 #else
558 memcpy (errbuf, msg, errbuf_size - 1);
559 errbuf[errbuf_size - 1] = 0;
560 #endif
561 }
562 else
563 memcpy (errbuf, msg, msg_size);
564 }
565
566 return msg_size;
567 }
568 #ifdef _LIBC
569 weak_alias (__regerror, regerror)
570 #endif
571
572
573 #ifdef RE_ENABLE_I18N
574 /* This static array is used for the map to single-byte characters when
575 UTF-8 is used. Otherwise we would allocate memory just to initialize
576 it the same all the time. UTF-8 is the preferred encoding so this is
577 a worthwhile optimization. */
578 static const bitset_t utf8_sb_map =
579 {
580 /* Set the first 128 bits. */
581 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
582 };
583 #endif
584
585
586 static void
587 free_dfa_content (re_dfa_t *dfa)
588 {
589 int i, j;
590
591 if (dfa->nodes)
592 for (i = 0; i < dfa->nodes_len; ++i)
593 free_token (dfa->nodes + i);
594 re_free (dfa->nexts);
595 for (i = 0; i < dfa->nodes_len; ++i)
596 {
597 if (dfa->eclosures != NULL)
598 re_node_set_free (dfa->eclosures + i);
599 if (dfa->inveclosures != NULL)
600 re_node_set_free (dfa->inveclosures + i);
601 if (dfa->edests != NULL)
602 re_node_set_free (dfa->edests + i);
603 }
604 re_free (dfa->edests);
605 re_free (dfa->eclosures);
606 re_free (dfa->inveclosures);
607 re_free (dfa->nodes);
608
609 if (dfa->state_table)
610 for (i = 0; i <= dfa->state_hash_mask; ++i)
611 {
612 struct re_state_table_entry *entry = dfa->state_table + i;
613 for (j = 0; j < entry->num; ++j)
614 {
615 re_dfastate_t *state = entry->array[j];
616 free_state (state);
617 }
618 re_free (entry->array);
619 }
620 re_free (dfa->state_table);
621 #ifdef RE_ENABLE_I18N
622 if (dfa->sb_char != utf8_sb_map)
623 re_free (dfa->sb_char);
624 #endif
625 re_free (dfa->subexp_map);
626 #ifdef DEBUG
627 re_free (dfa->re_str);
628 #endif
629
630 re_free (dfa);
631 }
632
633
634 /* Free dynamically allocated space used by PREG. */
635
636 void
637 regfree (preg)
638 regex_t *preg;
639 {
640 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
641 if (BE (dfa != NULL, 1))
642 free_dfa_content (dfa);
643 preg->buffer = NULL;
644 preg->allocated = 0;
645
646 re_free (preg->fastmap);
647 preg->fastmap = NULL;
648
649 re_free (preg->translate);
650 preg->translate = NULL;
651 }
652 #ifdef _LIBC
653 weak_alias (__regfree, regfree)
654 #endif
655 \f
656 /* Entry points compatible with 4.2 BSD regex library. We don't define
657 them unless specifically requested. */
658
659 #if defined _REGEX_RE_COMP || defined _LIBC
660
661 /* BSD has one and only one pattern buffer. */
662 static struct re_pattern_buffer re_comp_buf;
663
664 char *
665 # ifdef _LIBC
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667 these names if they don't use our functions, and still use
668 regcomp/regexec above without link errors. */
669 weak_function
670 # endif
671 re_comp (s)
672 const char *s;
673 {
674 reg_errcode_t ret;
675 char *fastmap;
676
677 if (!s)
678 {
679 if (!re_comp_buf.buffer)
680 return gettext ("No previous regular expression");
681 return 0;
682 }
683
684 if (re_comp_buf.buffer)
685 {
686 fastmap = re_comp_buf.fastmap;
687 re_comp_buf.fastmap = NULL;
688 __regfree (&re_comp_buf);
689 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690 re_comp_buf.fastmap = fastmap;
691 }
692
693 if (re_comp_buf.fastmap == NULL)
694 {
695 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
696 if (re_comp_buf.fastmap == NULL)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx[(int) REG_ESPACE]);
699 }
700
701 /* Since `re_exec' always passes NULL for the `regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
703
704 /* Match anchors at newlines. */
705 re_comp_buf.newline_anchor = 1;
706
707 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708
709 if (!ret)
710 return NULL;
711
712 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714 }
715
716 #ifdef _LIBC
717 libc_freeres_fn (free_mem)
718 {
719 __regfree (&re_comp_buf);
720 }
721 #endif
722
723 #endif /* _REGEX_RE_COMP */
724 \f
725 /* Internal entry point.
726 Compile the regular expression PATTERN, whose length is LENGTH.
727 SYNTAX indicate regular expression's syntax. */
728
729 static reg_errcode_t
730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
731 reg_syntax_t syntax)
732 {
733 reg_errcode_t err = REG_NOERROR;
734 re_dfa_t *dfa;
735 re_string_t regexp;
736
737 /* Initialize the pattern buffer. */
738 preg->fastmap_accurate = 0;
739 preg->syntax = syntax;
740 preg->not_bol = preg->not_eol = 0;
741 preg->used = 0;
742 preg->re_nsub = 0;
743 preg->can_be_null = 0;
744 preg->regs_allocated = REGS_UNALLOCATED;
745
746 /* Initialize the dfa. */
747 dfa = (re_dfa_t *) preg->buffer;
748 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749 {
750 /* If zero allocated, but buffer is non-null, try to realloc
751 enough space. This loses if buffer's address is bogus, but
752 that is the user's responsibility. If ->buffer is NULL this
753 is a simple allocation. */
754 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
755 if (dfa == NULL)
756 return REG_ESPACE;
757 preg->allocated = sizeof (re_dfa_t);
758 preg->buffer = (unsigned char *) dfa;
759 }
760 preg->used = sizeof (re_dfa_t);
761
762 err = init_dfa (dfa, length);
763 if (BE (err != REG_NOERROR, 0))
764 {
765 free_dfa_content (dfa);
766 preg->buffer = NULL;
767 preg->allocated = 0;
768 return err;
769 }
770 #ifdef DEBUG
771 /* Note: length+1 will not overflow since it is checked in init_dfa. */
772 dfa->re_str = re_malloc (char, length + 1);
773 strncpy (dfa->re_str, pattern, length + 1);
774 #endif
775
776 __libc_lock_init (dfa->lock);
777
778 err = re_string_construct (&regexp, pattern, length, preg->translate,
779 syntax & RE_ICASE, dfa);
780 if (BE (err != REG_NOERROR, 0))
781 {
782 re_compile_internal_free_return:
783 free_workarea_compile (preg);
784 re_string_destruct (&regexp);
785 free_dfa_content (dfa);
786 preg->buffer = NULL;
787 preg->allocated = 0;
788 return err;
789 }
790
791 /* Parse the regular expression, and build a structure tree. */
792 preg->re_nsub = 0;
793 dfa->str_tree = parse (&regexp, preg, syntax, &err);
794 if (BE (dfa->str_tree == NULL, 0))
795 goto re_compile_internal_free_return;
796
797 /* Analyze the tree and create the nfa. */
798 err = analyze (preg);
799 if (BE (err != REG_NOERROR, 0))
800 goto re_compile_internal_free_return;
801
802 #ifdef RE_ENABLE_I18N
803 /* If possible, do searching in single byte encoding to speed things up. */
804 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
805 optimize_utf8 (dfa);
806 #endif
807
808 /* Then create the initial state of the dfa. */
809 err = create_initial_state (dfa);
810
811 /* Release work areas. */
812 free_workarea_compile (preg);
813 re_string_destruct (&regexp);
814
815 if (BE (err != REG_NOERROR, 0))
816 {
817 free_dfa_content (dfa);
818 preg->buffer = NULL;
819 preg->allocated = 0;
820 }
821
822 return err;
823 }
824
825 /* Initialize DFA. We use the length of the regular expression PAT_LEN
826 as the initial length of some arrays. */
827
828 static reg_errcode_t
829 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 {
831 unsigned int table_size;
832 #ifndef _LIBC
833 char *codeset_name;
834 #endif
835
836 memset (dfa, '\0', sizeof (re_dfa_t));
837
838 /* Force allocation of str_tree_storage the first time. */
839 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
840
841 /* Avoid overflows. */
842 if (pat_len == SIZE_MAX)
843 return REG_ESPACE;
844
845 dfa->nodes_alloc = pat_len + 1;
846 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
847
848 /* table_size = 2 ^ ceil(log pat_len) */
849 for (table_size = 1; ; table_size <<= 1)
850 if (table_size > pat_len)
851 break;
852
853 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
854 dfa->state_hash_mask = table_size - 1;
855
856 dfa->mb_cur_max = MB_CUR_MAX;
857 #ifdef _LIBC
858 if (dfa->mb_cur_max == 6
859 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
860 dfa->is_utf8 = 1;
861 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
862 != 0);
863 #else
864 # ifdef HAVE_LANGINFO_CODESET
865 codeset_name = nl_langinfo (CODESET);
866 # else
867 codeset_name = getenv ("LC_ALL");
868 if (codeset_name == NULL || codeset_name[0] == '\0')
869 codeset_name = getenv ("LC_CTYPE");
870 if (codeset_name == NULL || codeset_name[0] == '\0')
871 codeset_name = getenv ("LANG");
872 if (codeset_name == NULL)
873 codeset_name = "";
874 else if (strchr (codeset_name, '.') != NULL)
875 codeset_name = strchr (codeset_name, '.') + 1;
876 # endif
877
878 if (strcasecmp (codeset_name, "UTF-8") == 0
879 || strcasecmp (codeset_name, "UTF8") == 0)
880 dfa->is_utf8 = 1;
881
882 /* We check exhaustively in the loop below if this charset is a
883 superset of ASCII. */
884 dfa->map_notascii = 0;
885 #endif
886
887 #ifdef RE_ENABLE_I18N
888 if (dfa->mb_cur_max > 1)
889 {
890 if (dfa->is_utf8)
891 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
892 else
893 {
894 int i, j, ch;
895
896 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
897 if (BE (dfa->sb_char == NULL, 0))
898 return REG_ESPACE;
899
900 /* Set the bits corresponding to single byte chars. */
901 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
902 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
903 {
904 wint_t wch = __btowc (ch);
905 if (wch != WEOF)
906 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
907 # ifndef _LIBC
908 if (isascii (ch) && wch != ch)
909 dfa->map_notascii = 1;
910 # endif
911 }
912 }
913 }
914 #endif
915
916 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
917 return REG_ESPACE;
918 return REG_NOERROR;
919 }
920
921 /* Initialize WORD_CHAR table, which indicate which character is
922 "word". In this case "word" means that it is the word construction
923 character used by some operators like "\<", "\>", etc. */
924
925 static void
926 internal_function
927 init_word_char (re_dfa_t *dfa)
928 {
929 int i, j, ch;
930 dfa->word_ops_used = 1;
931 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
932 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
933 if (isalnum (ch) || ch == '_')
934 dfa->word_char[i] |= (bitset_word_t) 1 << j;
935 }
936
937 /* Free the work area which are only used while compiling. */
938
939 static void
940 free_workarea_compile (regex_t *preg)
941 {
942 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
943 bin_tree_storage_t *storage, *next;
944 for (storage = dfa->str_tree_storage; storage; storage = next)
945 {
946 next = storage->next;
947 re_free (storage);
948 }
949 dfa->str_tree_storage = NULL;
950 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
951 dfa->str_tree = NULL;
952 re_free (dfa->org_indices);
953 dfa->org_indices = NULL;
954 }
955
956 /* Create initial states for all contexts. */
957
958 static reg_errcode_t
959 create_initial_state (re_dfa_t *dfa)
960 {
961 int first, i;
962 reg_errcode_t err;
963 re_node_set init_nodes;
964
965 /* Initial states have the epsilon closure of the node which is
966 the first node of the regular expression. */
967 first = dfa->str_tree->first->node_idx;
968 dfa->init_node = first;
969 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
970 if (BE (err != REG_NOERROR, 0))
971 return err;
972
973 /* The back-references which are in initial states can epsilon transit,
974 since in this case all of the subexpressions can be null.
975 Then we add epsilon closures of the nodes which are the next nodes of
976 the back-references. */
977 if (dfa->nbackref > 0)
978 for (i = 0; i < init_nodes.nelem; ++i)
979 {
980 int node_idx = init_nodes.elems[i];
981 re_token_type_t type = dfa->nodes[node_idx].type;
982
983 int clexp_idx;
984 if (type != OP_BACK_REF)
985 continue;
986 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
987 {
988 re_token_t *clexp_node;
989 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
990 if (clexp_node->type == OP_CLOSE_SUBEXP
991 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
992 break;
993 }
994 if (clexp_idx == init_nodes.nelem)
995 continue;
996
997 if (type == OP_BACK_REF)
998 {
999 int dest_idx = dfa->edests[node_idx].elems[0];
1000 if (!re_node_set_contains (&init_nodes, dest_idx))
1001 {
1002 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1003 i = 0;
1004 }
1005 }
1006 }
1007
1008 /* It must be the first time to invoke acquire_state. */
1009 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1010 /* We don't check ERR here, since the initial state must not be NULL. */
1011 if (BE (dfa->init_state == NULL, 0))
1012 return err;
1013 if (dfa->init_state->has_constraint)
1014 {
1015 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1016 CONTEXT_WORD);
1017 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1018 CONTEXT_NEWLINE);
1019 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1020 &init_nodes,
1021 CONTEXT_NEWLINE
1022 | CONTEXT_BEGBUF);
1023 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1024 || dfa->init_state_begbuf == NULL, 0))
1025 return err;
1026 }
1027 else
1028 dfa->init_state_word = dfa->init_state_nl
1029 = dfa->init_state_begbuf = dfa->init_state;
1030
1031 re_node_set_free (&init_nodes);
1032 return REG_NOERROR;
1033 }
1034 \f
1035 #ifdef RE_ENABLE_I18N
1036 /* If it is possible to do searching in single byte encoding instead of UTF-8
1037 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1038 DFA nodes where needed. */
1039
1040 static void
1041 optimize_utf8 (re_dfa_t *dfa)
1042 {
1043 int node, i, mb_chars = 0, has_period = 0;
1044
1045 for (node = 0; node < dfa->nodes_len; ++node)
1046 switch (dfa->nodes[node].type)
1047 {
1048 case CHARACTER:
1049 if (dfa->nodes[node].opr.c >= 0x80)
1050 mb_chars = 1;
1051 break;
1052 case ANCHOR:
1053 switch (dfa->nodes[node].opr.ctx_type)
1054 {
1055 case LINE_FIRST:
1056 case LINE_LAST:
1057 case BUF_FIRST:
1058 case BUF_LAST:
1059 break;
1060 default:
1061 /* Word anchors etc. cannot be handled. It's okay to test
1062 opr.ctx_type since constraints (for all DFA nodes) are
1063 created by ORing one or more opr.ctx_type values. */
1064 return;
1065 }
1066 break;
1067 case OP_PERIOD:
1068 has_period = 1;
1069 break;
1070 case OP_BACK_REF:
1071 case OP_ALT:
1072 case END_OF_RE:
1073 case OP_DUP_ASTERISK:
1074 case OP_OPEN_SUBEXP:
1075 case OP_CLOSE_SUBEXP:
1076 break;
1077 case COMPLEX_BRACKET:
1078 return;
1079 case SIMPLE_BRACKET:
1080 /* Just double check. The non-ASCII range starts at 0x80. */
1081 assert (0x80 % BITSET_WORD_BITS == 0);
1082 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1083 if (dfa->nodes[node].opr.sbcset[i])
1084 return;
1085 break;
1086 default:
1087 abort ();
1088 }
1089
1090 if (mb_chars || has_period)
1091 for (node = 0; node < dfa->nodes_len; ++node)
1092 {
1093 if (dfa->nodes[node].type == CHARACTER
1094 && dfa->nodes[node].opr.c >= 0x80)
1095 dfa->nodes[node].mb_partial = 0;
1096 else if (dfa->nodes[node].type == OP_PERIOD)
1097 dfa->nodes[node].type = OP_UTF8_PERIOD;
1098 }
1099
1100 /* The search can be in single byte locale. */
1101 dfa->mb_cur_max = 1;
1102 dfa->is_utf8 = 0;
1103 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1104 }
1105 #endif
1106 \f
1107 /* Analyze the structure tree, and calculate "first", "next", "edest",
1108 "eclosure", and "inveclosure". */
1109
1110 static reg_errcode_t
1111 analyze (regex_t *preg)
1112 {
1113 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1114 reg_errcode_t ret;
1115
1116 /* Allocate arrays. */
1117 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1118 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1119 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1120 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1121 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1122 || dfa->eclosures == NULL, 0))
1123 return REG_ESPACE;
1124
1125 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1126 if (dfa->subexp_map != NULL)
1127 {
1128 int i;
1129 for (i = 0; i < preg->re_nsub; i++)
1130 dfa->subexp_map[i] = i;
1131 preorder (dfa->str_tree, optimize_subexps, dfa);
1132 for (i = 0; i < preg->re_nsub; i++)
1133 if (dfa->subexp_map[i] != i)
1134 break;
1135 if (i == preg->re_nsub)
1136 {
1137 free (dfa->subexp_map);
1138 dfa->subexp_map = NULL;
1139 }
1140 }
1141
1142 ret = postorder (dfa->str_tree, lower_subexps, preg);
1143 if (BE (ret != REG_NOERROR, 0))
1144 return ret;
1145 ret = postorder (dfa->str_tree, calc_first, dfa);
1146 if (BE (ret != REG_NOERROR, 0))
1147 return ret;
1148 preorder (dfa->str_tree, calc_next, dfa);
1149 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1150 if (BE (ret != REG_NOERROR, 0))
1151 return ret;
1152 ret = calc_eclosure (dfa);
1153 if (BE (ret != REG_NOERROR, 0))
1154 return ret;
1155
1156 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1157 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1158 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1159 || dfa->nbackref)
1160 {
1161 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1162 if (BE (dfa->inveclosures == NULL, 0))
1163 return REG_ESPACE;
1164 ret = calc_inveclosure (dfa);
1165 }
1166
1167 return ret;
1168 }
1169
1170 /* Our parse trees are very unbalanced, so we cannot use a stack to
1171 implement parse tree visits. Instead, we use parent pointers and
1172 some hairy code in these two functions. */
1173 static reg_errcode_t
1174 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1175 void *extra)
1176 {
1177 bin_tree_t *node, *prev;
1178
1179 for (node = root; ; )
1180 {
1181 /* Descend down the tree, preferably to the left (or to the right
1182 if that's the only child). */
1183 while (node->left || node->right)
1184 if (node->left)
1185 node = node->left;
1186 else
1187 node = node->right;
1188
1189 do
1190 {
1191 reg_errcode_t err = fn (extra, node);
1192 if (BE (err != REG_NOERROR, 0))
1193 return err;
1194 if (node->parent == NULL)
1195 return REG_NOERROR;
1196 prev = node;
1197 node = node->parent;
1198 }
1199 /* Go up while we have a node that is reached from the right. */
1200 while (node->right == prev || node->right == NULL);
1201 node = node->right;
1202 }
1203 }
1204
1205 static reg_errcode_t
1206 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1207 void *extra)
1208 {
1209 bin_tree_t *node;
1210
1211 for (node = root; ; )
1212 {
1213 reg_errcode_t err = fn (extra, node);
1214 if (BE (err != REG_NOERROR, 0))
1215 return err;
1216
1217 /* Go to the left node, or up and to the right. */
1218 if (node->left)
1219 node = node->left;
1220 else
1221 {
1222 bin_tree_t *prev = NULL;
1223 while (node->right == prev || node->right == NULL)
1224 {
1225 prev = node;
1226 node = node->parent;
1227 if (!node)
1228 return REG_NOERROR;
1229 }
1230 node = node->right;
1231 }
1232 }
1233 }
1234
1235 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1236 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1237 backreferences as well. Requires a preorder visit. */
1238 static reg_errcode_t
1239 optimize_subexps (void *extra, bin_tree_t *node)
1240 {
1241 re_dfa_t *dfa = (re_dfa_t *) extra;
1242
1243 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1244 {
1245 int idx = node->token.opr.idx;
1246 node->token.opr.idx = dfa->subexp_map[idx];
1247 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1248 }
1249
1250 else if (node->token.type == SUBEXP
1251 && node->left && node->left->token.type == SUBEXP)
1252 {
1253 int other_idx = node->left->token.opr.idx;
1254
1255 node->left = node->left->left;
1256 if (node->left)
1257 node->left->parent = node;
1258
1259 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1260 if (other_idx < BITSET_WORD_BITS)
1261 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1262 }
1263
1264 return REG_NOERROR;
1265 }
1266
1267 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1268 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1269 static reg_errcode_t
1270 lower_subexps (void *extra, bin_tree_t *node)
1271 {
1272 regex_t *preg = (regex_t *) extra;
1273 reg_errcode_t err = REG_NOERROR;
1274
1275 if (node->left && node->left->token.type == SUBEXP)
1276 {
1277 node->left = lower_subexp (&err, preg, node->left);
1278 if (node->left)
1279 node->left->parent = node;
1280 }
1281 if (node->right && node->right->token.type == SUBEXP)
1282 {
1283 node->right = lower_subexp (&err, preg, node->right);
1284 if (node->right)
1285 node->right->parent = node;
1286 }
1287
1288 return err;
1289 }
1290
1291 static bin_tree_t *
1292 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1293 {
1294 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1295 bin_tree_t *body = node->left;
1296 bin_tree_t *op, *cls, *tree1, *tree;
1297
1298 if (preg->no_sub
1299 /* We do not optimize empty subexpressions, because otherwise we may
1300 have bad CONCAT nodes with NULL children. This is obviously not
1301 very common, so we do not lose much. An example that triggers
1302 this case is the sed "script" /\(\)/x. */
1303 && node->left != NULL
1304 && (node->token.opr.idx >= BITSET_WORD_BITS
1305 || !(dfa->used_bkref_map
1306 & ((bitset_word_t) 1 << node->token.opr.idx))))
1307 return node->left;
1308
1309 /* Convert the SUBEXP node to the concatenation of an
1310 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1311 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1312 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1313 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1314 tree = create_tree (dfa, op, tree1, CONCAT);
1315 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1316 {
1317 *err = REG_ESPACE;
1318 return NULL;
1319 }
1320
1321 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1322 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1323 return tree;
1324 }
1325
1326 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1327 nodes. Requires a postorder visit. */
1328 static reg_errcode_t
1329 calc_first (void *extra, bin_tree_t *node)
1330 {
1331 re_dfa_t *dfa = (re_dfa_t *) extra;
1332 if (node->token.type == CONCAT)
1333 {
1334 node->first = node->left->first;
1335 node->node_idx = node->left->node_idx;
1336 }
1337 else
1338 {
1339 node->first = node;
1340 node->node_idx = re_dfa_add_node (dfa, node->token);
1341 if (BE (node->node_idx == -1, 0))
1342 return REG_ESPACE;
1343 if (node->token.type == ANCHOR)
1344 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1345 }
1346 return REG_NOERROR;
1347 }
1348
1349 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1350 static reg_errcode_t
1351 calc_next (void *extra, bin_tree_t *node)
1352 {
1353 switch (node->token.type)
1354 {
1355 case OP_DUP_ASTERISK:
1356 node->left->next = node;
1357 break;
1358 case CONCAT:
1359 node->left->next = node->right->first;
1360 node->right->next = node->next;
1361 break;
1362 default:
1363 if (node->left)
1364 node->left->next = node->next;
1365 if (node->right)
1366 node->right->next = node->next;
1367 break;
1368 }
1369 return REG_NOERROR;
1370 }
1371
1372 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1373 static reg_errcode_t
1374 link_nfa_nodes (void *extra, bin_tree_t *node)
1375 {
1376 re_dfa_t *dfa = (re_dfa_t *) extra;
1377 int idx = node->node_idx;
1378 reg_errcode_t err = REG_NOERROR;
1379
1380 switch (node->token.type)
1381 {
1382 case CONCAT:
1383 break;
1384
1385 case END_OF_RE:
1386 assert (node->next == NULL);
1387 break;
1388
1389 case OP_DUP_ASTERISK:
1390 case OP_ALT:
1391 {
1392 int left, right;
1393 dfa->has_plural_match = 1;
1394 if (node->left != NULL)
1395 left = node->left->first->node_idx;
1396 else
1397 left = node->next->node_idx;
1398 if (node->right != NULL)
1399 right = node->right->first->node_idx;
1400 else
1401 right = node->next->node_idx;
1402 assert (left > -1);
1403 assert (right > -1);
1404 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1405 }
1406 break;
1407
1408 case ANCHOR:
1409 case OP_OPEN_SUBEXP:
1410 case OP_CLOSE_SUBEXP:
1411 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1412 break;
1413
1414 case OP_BACK_REF:
1415 dfa->nexts[idx] = node->next->node_idx;
1416 if (node->token.type == OP_BACK_REF)
1417 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1418 break;
1419
1420 default:
1421 assert (!IS_EPSILON_NODE (node->token.type));
1422 dfa->nexts[idx] = node->next->node_idx;
1423 break;
1424 }
1425
1426 return err;
1427 }
1428
1429 /* Duplicate the epsilon closure of the node ROOT_NODE.
1430 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1431 to their own constraint. */
1432
1433 static reg_errcode_t
1434 internal_function
1435 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1436 int root_node, unsigned int init_constraint)
1437 {
1438 int org_node, clone_node, ret;
1439 unsigned int constraint = init_constraint;
1440 for (org_node = top_org_node, clone_node = top_clone_node;;)
1441 {
1442 int org_dest, clone_dest;
1443 if (dfa->nodes[org_node].type == OP_BACK_REF)
1444 {
1445 /* If the back reference epsilon-transit, its destination must
1446 also have the constraint. Then duplicate the epsilon closure
1447 of the destination of the back reference, and store it in
1448 edests of the back reference. */
1449 org_dest = dfa->nexts[org_node];
1450 re_node_set_empty (dfa->edests + clone_node);
1451 clone_dest = duplicate_node (dfa, org_dest, constraint);
1452 if (BE (clone_dest == -1, 0))
1453 return REG_ESPACE;
1454 dfa->nexts[clone_node] = dfa->nexts[org_node];
1455 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1456 if (BE (ret < 0, 0))
1457 return REG_ESPACE;
1458 }
1459 else if (dfa->edests[org_node].nelem == 0)
1460 {
1461 /* In case of the node can't epsilon-transit, don't duplicate the
1462 destination and store the original destination as the
1463 destination of the node. */
1464 dfa->nexts[clone_node] = dfa->nexts[org_node];
1465 break;
1466 }
1467 else if (dfa->edests[org_node].nelem == 1)
1468 {
1469 /* In case of the node can epsilon-transit, and it has only one
1470 destination. */
1471 org_dest = dfa->edests[org_node].elems[0];
1472 re_node_set_empty (dfa->edests + clone_node);
1473 /* If the node is root_node itself, it means the epsilon clsoure
1474 has a loop. Then tie it to the destination of the root_node. */
1475 if (org_node == root_node && clone_node != org_node)
1476 {
1477 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1478 if (BE (ret < 0, 0))
1479 return REG_ESPACE;
1480 break;
1481 }
1482 /* In case of the node has another constraint, add it. */
1483 constraint |= dfa->nodes[org_node].constraint;
1484 clone_dest = duplicate_node (dfa, org_dest, constraint);
1485 if (BE (clone_dest == -1, 0))
1486 return REG_ESPACE;
1487 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488 if (BE (ret < 0, 0))
1489 return REG_ESPACE;
1490 }
1491 else /* dfa->edests[org_node].nelem == 2 */
1492 {
1493 /* In case of the node can epsilon-transit, and it has two
1494 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1495 org_dest = dfa->edests[org_node].elems[0];
1496 re_node_set_empty (dfa->edests + clone_node);
1497 /* Search for a duplicated node which satisfies the constraint. */
1498 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1499 if (clone_dest == -1)
1500 {
1501 /* There is no such duplicated node, create a new one. */
1502 reg_errcode_t err;
1503 clone_dest = duplicate_node (dfa, org_dest, constraint);
1504 if (BE (clone_dest == -1, 0))
1505 return REG_ESPACE;
1506 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1507 if (BE (ret < 0, 0))
1508 return REG_ESPACE;
1509 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1510 root_node, constraint);
1511 if (BE (err != REG_NOERROR, 0))
1512 return err;
1513 }
1514 else
1515 {
1516 /* There is a duplicated node which satisfies the constraint,
1517 use it to avoid infinite loop. */
1518 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (ret < 0, 0))
1520 return REG_ESPACE;
1521 }
1522
1523 org_dest = dfa->edests[org_node].elems[1];
1524 clone_dest = duplicate_node (dfa, org_dest, constraint);
1525 if (BE (clone_dest == -1, 0))
1526 return REG_ESPACE;
1527 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 if (BE (ret < 0, 0))
1529 return REG_ESPACE;
1530 }
1531 org_node = org_dest;
1532 clone_node = clone_dest;
1533 }
1534 return REG_NOERROR;
1535 }
1536
1537 /* Search for a node which is duplicated from the node ORG_NODE, and
1538 satisfies the constraint CONSTRAINT. */
1539
1540 static int
1541 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1542 unsigned int constraint)
1543 {
1544 int idx;
1545 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1546 {
1547 if (org_node == dfa->org_indices[idx]
1548 && constraint == dfa->nodes[idx].constraint)
1549 return idx; /* Found. */
1550 }
1551 return -1; /* Not found. */
1552 }
1553
1554 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1555 Return the index of the new node, or -1 if insufficient storage is
1556 available. */
1557
1558 static int
1559 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1560 {
1561 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1562 if (BE (dup_idx != -1, 1))
1563 {
1564 dfa->nodes[dup_idx].constraint = constraint;
1565 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1566 dfa->nodes[dup_idx].duplicated = 1;
1567
1568 /* Store the index of the original node. */
1569 dfa->org_indices[dup_idx] = org_idx;
1570 }
1571 return dup_idx;
1572 }
1573
1574 static reg_errcode_t
1575 calc_inveclosure (re_dfa_t *dfa)
1576 {
1577 int src, idx, ret;
1578 for (idx = 0; idx < dfa->nodes_len; ++idx)
1579 re_node_set_init_empty (dfa->inveclosures + idx);
1580
1581 for (src = 0; src < dfa->nodes_len; ++src)
1582 {
1583 int *elems = dfa->eclosures[src].elems;
1584 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1585 {
1586 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1587 if (BE (ret == -1, 0))
1588 return REG_ESPACE;
1589 }
1590 }
1591
1592 return REG_NOERROR;
1593 }
1594
1595 /* Calculate "eclosure" for all the node in DFA. */
1596
1597 static reg_errcode_t
1598 calc_eclosure (re_dfa_t *dfa)
1599 {
1600 int node_idx, incomplete;
1601 #ifdef DEBUG
1602 assert (dfa->nodes_len > 0);
1603 #endif
1604 incomplete = 0;
1605 /* For each nodes, calculate epsilon closure. */
1606 for (node_idx = 0; ; ++node_idx)
1607 {
1608 reg_errcode_t err;
1609 re_node_set eclosure_elem;
1610 if (node_idx == dfa->nodes_len)
1611 {
1612 if (!incomplete)
1613 break;
1614 incomplete = 0;
1615 node_idx = 0;
1616 }
1617
1618 #ifdef DEBUG
1619 assert (dfa->eclosures[node_idx].nelem != -1);
1620 #endif
1621
1622 /* If we have already calculated, skip it. */
1623 if (dfa->eclosures[node_idx].nelem != 0)
1624 continue;
1625 /* Calculate epsilon closure of `node_idx'. */
1626 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1627 if (BE (err != REG_NOERROR, 0))
1628 return err;
1629
1630 if (dfa->eclosures[node_idx].nelem == 0)
1631 {
1632 incomplete = 1;
1633 re_node_set_free (&eclosure_elem);
1634 }
1635 }
1636 return REG_NOERROR;
1637 }
1638
1639 /* Calculate epsilon closure of NODE. */
1640
1641 static reg_errcode_t
1642 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1643 {
1644 reg_errcode_t err;
1645 int i;
1646 re_node_set eclosure;
1647 int ret;
1648 int incomplete = 0;
1649 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1650 if (BE (err != REG_NOERROR, 0))
1651 return err;
1652
1653 /* This indicates that we are calculating this node now.
1654 We reference this value to avoid infinite loop. */
1655 dfa->eclosures[node].nelem = -1;
1656
1657 /* If the current node has constraints, duplicate all nodes
1658 since they must inherit the constraints. */
1659 if (dfa->nodes[node].constraint
1660 && dfa->edests[node].nelem
1661 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1662 {
1663 err = duplicate_node_closure (dfa, node, node, node,
1664 dfa->nodes[node].constraint);
1665 if (BE (err != REG_NOERROR, 0))
1666 return err;
1667 }
1668
1669 /* Expand each epsilon destination nodes. */
1670 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1671 for (i = 0; i < dfa->edests[node].nelem; ++i)
1672 {
1673 re_node_set eclosure_elem;
1674 int edest = dfa->edests[node].elems[i];
1675 /* If calculating the epsilon closure of `edest' is in progress,
1676 return intermediate result. */
1677 if (dfa->eclosures[edest].nelem == -1)
1678 {
1679 incomplete = 1;
1680 continue;
1681 }
1682 /* If we haven't calculated the epsilon closure of `edest' yet,
1683 calculate now. Otherwise use calculated epsilon closure. */
1684 if (dfa->eclosures[edest].nelem == 0)
1685 {
1686 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1687 if (BE (err != REG_NOERROR, 0))
1688 return err;
1689 }
1690 else
1691 eclosure_elem = dfa->eclosures[edest];
1692 /* Merge the epsilon closure of `edest'. */
1693 re_node_set_merge (&eclosure, &eclosure_elem);
1694 /* If the epsilon closure of `edest' is incomplete,
1695 the epsilon closure of this node is also incomplete. */
1696 if (dfa->eclosures[edest].nelem == 0)
1697 {
1698 incomplete = 1;
1699 re_node_set_free (&eclosure_elem);
1700 }
1701 }
1702
1703 /* An epsilon closure includes itself. */
1704 ret = re_node_set_insert (&eclosure, node);
1705 if (BE (ret < 0, 0))
1706 return REG_ESPACE;
1707 if (incomplete && !root)
1708 dfa->eclosures[node].nelem = 0;
1709 else
1710 dfa->eclosures[node] = eclosure;
1711 *new_set = eclosure;
1712 return REG_NOERROR;
1713 }
1714 \f
1715 /* Functions for token which are used in the parser. */
1716
1717 /* Fetch a token from INPUT.
1718 We must not use this function inside bracket expressions. */
1719
1720 static void
1721 internal_function
1722 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1723 {
1724 re_string_skip_bytes (input, peek_token (result, input, syntax));
1725 }
1726
1727 /* Peek a token from INPUT, and return the length of the token.
1728 We must not use this function inside bracket expressions. */
1729
1730 static int
1731 internal_function
1732 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1733 {
1734 unsigned char c;
1735
1736 if (re_string_eoi (input))
1737 {
1738 token->type = END_OF_RE;
1739 return 0;
1740 }
1741
1742 c = re_string_peek_byte (input, 0);
1743 token->opr.c = c;
1744
1745 token->word_char = 0;
1746 #ifdef RE_ENABLE_I18N
1747 token->mb_partial = 0;
1748 if (input->mb_cur_max > 1 &&
1749 !re_string_first_byte (input, re_string_cur_idx (input)))
1750 {
1751 token->type = CHARACTER;
1752 token->mb_partial = 1;
1753 return 1;
1754 }
1755 #endif
1756 if (c == '\\')
1757 {
1758 unsigned char c2;
1759 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1760 {
1761 token->type = BACK_SLASH;
1762 return 1;
1763 }
1764
1765 c2 = re_string_peek_byte_case (input, 1);
1766 token->opr.c = c2;
1767 token->type = CHARACTER;
1768 #ifdef RE_ENABLE_I18N
1769 if (input->mb_cur_max > 1)
1770 {
1771 wint_t wc = re_string_wchar_at (input,
1772 re_string_cur_idx (input) + 1);
1773 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1774 }
1775 else
1776 #endif
1777 token->word_char = IS_WORD_CHAR (c2) != 0;
1778
1779 switch (c2)
1780 {
1781 case '|':
1782 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1783 token->type = OP_ALT;
1784 break;
1785 case '1': case '2': case '3': case '4': case '5':
1786 case '6': case '7': case '8': case '9':
1787 if (!(syntax & RE_NO_BK_REFS))
1788 {
1789 token->type = OP_BACK_REF;
1790 token->opr.idx = c2 - '1';
1791 }
1792 break;
1793 case '<':
1794 if (!(syntax & RE_NO_GNU_OPS))
1795 {
1796 token->type = ANCHOR;
1797 token->opr.ctx_type = WORD_FIRST;
1798 }
1799 break;
1800 case '>':
1801 if (!(syntax & RE_NO_GNU_OPS))
1802 {
1803 token->type = ANCHOR;
1804 token->opr.ctx_type = WORD_LAST;
1805 }
1806 break;
1807 case 'b':
1808 if (!(syntax & RE_NO_GNU_OPS))
1809 {
1810 token->type = ANCHOR;
1811 token->opr.ctx_type = WORD_DELIM;
1812 }
1813 break;
1814 case 'B':
1815 if (!(syntax & RE_NO_GNU_OPS))
1816 {
1817 token->type = ANCHOR;
1818 token->opr.ctx_type = NOT_WORD_DELIM;
1819 }
1820 break;
1821 case 'w':
1822 if (!(syntax & RE_NO_GNU_OPS))
1823 token->type = OP_WORD;
1824 break;
1825 case 'W':
1826 if (!(syntax & RE_NO_GNU_OPS))
1827 token->type = OP_NOTWORD;
1828 break;
1829 case 's':
1830 if (!(syntax & RE_NO_GNU_OPS))
1831 token->type = OP_SPACE;
1832 break;
1833 case 'S':
1834 if (!(syntax & RE_NO_GNU_OPS))
1835 token->type = OP_NOTSPACE;
1836 break;
1837 case '`':
1838 if (!(syntax & RE_NO_GNU_OPS))
1839 {
1840 token->type = ANCHOR;
1841 token->opr.ctx_type = BUF_FIRST;
1842 }
1843 break;
1844 case '\'':
1845 if (!(syntax & RE_NO_GNU_OPS))
1846 {
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = BUF_LAST;
1849 }
1850 break;
1851 case '(':
1852 if (!(syntax & RE_NO_BK_PARENS))
1853 token->type = OP_OPEN_SUBEXP;
1854 break;
1855 case ')':
1856 if (!(syntax & RE_NO_BK_PARENS))
1857 token->type = OP_CLOSE_SUBEXP;
1858 break;
1859 case '+':
1860 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1861 token->type = OP_DUP_PLUS;
1862 break;
1863 case '?':
1864 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1865 token->type = OP_DUP_QUESTION;
1866 break;
1867 case '{':
1868 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1869 token->type = OP_OPEN_DUP_NUM;
1870 break;
1871 case '}':
1872 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1873 token->type = OP_CLOSE_DUP_NUM;
1874 break;
1875 default:
1876 break;
1877 }
1878 return 2;
1879 }
1880
1881 token->type = CHARACTER;
1882 #ifdef RE_ENABLE_I18N
1883 if (input->mb_cur_max > 1)
1884 {
1885 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1886 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1887 }
1888 else
1889 #endif
1890 token->word_char = IS_WORD_CHAR (token->opr.c);
1891
1892 switch (c)
1893 {
1894 case '\n':
1895 if (syntax & RE_NEWLINE_ALT)
1896 token->type = OP_ALT;
1897 break;
1898 case '|':
1899 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1900 token->type = OP_ALT;
1901 break;
1902 case '*':
1903 token->type = OP_DUP_ASTERISK;
1904 break;
1905 case '+':
1906 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_PLUS;
1908 break;
1909 case '?':
1910 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1911 token->type = OP_DUP_QUESTION;
1912 break;
1913 case '{':
1914 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1915 token->type = OP_OPEN_DUP_NUM;
1916 break;
1917 case '}':
1918 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1919 token->type = OP_CLOSE_DUP_NUM;
1920 break;
1921 case '(':
1922 if (syntax & RE_NO_BK_PARENS)
1923 token->type = OP_OPEN_SUBEXP;
1924 break;
1925 case ')':
1926 if (syntax & RE_NO_BK_PARENS)
1927 token->type = OP_CLOSE_SUBEXP;
1928 break;
1929 case '[':
1930 token->type = OP_OPEN_BRACKET;
1931 break;
1932 case '.':
1933 token->type = OP_PERIOD;
1934 break;
1935 case '^':
1936 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1937 re_string_cur_idx (input) != 0)
1938 {
1939 char prev = re_string_peek_byte (input, -1);
1940 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1941 break;
1942 }
1943 token->type = ANCHOR;
1944 token->opr.ctx_type = LINE_FIRST;
1945 break;
1946 case '$':
1947 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1948 re_string_cur_idx (input) + 1 != re_string_length (input))
1949 {
1950 re_token_t next;
1951 re_string_skip_bytes (input, 1);
1952 peek_token (&next, input, syntax);
1953 re_string_skip_bytes (input, -1);
1954 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1955 break;
1956 }
1957 token->type = ANCHOR;
1958 token->opr.ctx_type = LINE_LAST;
1959 break;
1960 default:
1961 break;
1962 }
1963 return 1;
1964 }
1965
1966 /* Peek a token from INPUT, and return the length of the token.
1967 We must not use this function out of bracket expressions. */
1968
1969 static int
1970 internal_function
1971 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1972 {
1973 unsigned char c;
1974 if (re_string_eoi (input))
1975 {
1976 token->type = END_OF_RE;
1977 return 0;
1978 }
1979 c = re_string_peek_byte (input, 0);
1980 token->opr.c = c;
1981
1982 #ifdef RE_ENABLE_I18N
1983 if (input->mb_cur_max > 1 &&
1984 !re_string_first_byte (input, re_string_cur_idx (input)))
1985 {
1986 token->type = CHARACTER;
1987 return 1;
1988 }
1989 #endif /* RE_ENABLE_I18N */
1990
1991 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1992 && re_string_cur_idx (input) + 1 < re_string_length (input))
1993 {
1994 /* In this case, '\' escape a character. */
1995 unsigned char c2;
1996 re_string_skip_bytes (input, 1);
1997 c2 = re_string_peek_byte (input, 0);
1998 token->opr.c = c2;
1999 token->type = CHARACTER;
2000 return 1;
2001 }
2002 if (c == '[') /* '[' is a special char in a bracket exps. */
2003 {
2004 unsigned char c2;
2005 int token_len;
2006 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2007 c2 = re_string_peek_byte (input, 1);
2008 else
2009 c2 = 0;
2010 token->opr.c = c2;
2011 token_len = 2;
2012 switch (c2)
2013 {
2014 case '.':
2015 token->type = OP_OPEN_COLL_ELEM;
2016 break;
2017 case '=':
2018 token->type = OP_OPEN_EQUIV_CLASS;
2019 break;
2020 case ':':
2021 if (syntax & RE_CHAR_CLASSES)
2022 {
2023 token->type = OP_OPEN_CHAR_CLASS;
2024 break;
2025 }
2026 /* else fall through. */
2027 default:
2028 token->type = CHARACTER;
2029 token->opr.c = c;
2030 token_len = 1;
2031 break;
2032 }
2033 return token_len;
2034 }
2035 switch (c)
2036 {
2037 case '-':
2038 token->type = OP_CHARSET_RANGE;
2039 break;
2040 case ']':
2041 token->type = OP_CLOSE_BRACKET;
2042 break;
2043 case '^':
2044 token->type = OP_NON_MATCH_LIST;
2045 break;
2046 default:
2047 token->type = CHARACTER;
2048 }
2049 return 1;
2050 }
2051 \f
2052 /* Functions for parser. */
2053
2054 /* Entry point of the parser.
2055 Parse the regular expression REGEXP and return the structure tree.
2056 If an error is occured, ERR is set by error code, and return NULL.
2057 This function build the following tree, from regular expression <reg_exp>:
2058 CAT
2059 / \
2060 / \
2061 <reg_exp> EOR
2062
2063 CAT means concatenation.
2064 EOR means end of regular expression. */
2065
2066 static bin_tree_t *
2067 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2068 reg_errcode_t *err)
2069 {
2070 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2071 bin_tree_t *tree, *eor, *root;
2072 re_token_t current_token;
2073 dfa->syntax = syntax;
2074 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2075 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2076 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2077 return NULL;
2078 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2079 if (tree != NULL)
2080 root = create_tree (dfa, tree, eor, CONCAT);
2081 else
2082 root = eor;
2083 if (BE (eor == NULL || root == NULL, 0))
2084 {
2085 *err = REG_ESPACE;
2086 return NULL;
2087 }
2088 return root;
2089 }
2090
2091 /* This function build the following tree, from regular expression
2092 <branch1>|<branch2>:
2093 ALT
2094 / \
2095 / \
2096 <branch1> <branch2>
2097
2098 ALT means alternative, which represents the operator `|'. */
2099
2100 static bin_tree_t *
2101 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2102 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2103 {
2104 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2105 bin_tree_t *tree, *branch = NULL;
2106 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2107 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2108 return NULL;
2109
2110 while (token->type == OP_ALT)
2111 {
2112 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2113 if (token->type != OP_ALT && token->type != END_OF_RE
2114 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2115 {
2116 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2117 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2118 return NULL;
2119 }
2120 else
2121 branch = NULL;
2122 tree = create_tree (dfa, tree, branch, OP_ALT);
2123 if (BE (tree == NULL, 0))
2124 {
2125 *err = REG_ESPACE;
2126 return NULL;
2127 }
2128 }
2129 return tree;
2130 }
2131
2132 /* This function build the following tree, from regular expression
2133 <exp1><exp2>:
2134 CAT
2135 / \
2136 / \
2137 <exp1> <exp2>
2138
2139 CAT means concatenation. */
2140
2141 static bin_tree_t *
2142 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2144 {
2145 bin_tree_t *tree, *exp;
2146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2148 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149 return NULL;
2150
2151 while (token->type != OP_ALT && token->type != END_OF_RE
2152 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2153 {
2154 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2155 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2156 {
2157 return NULL;
2158 }
2159 if (tree != NULL && exp != NULL)
2160 {
2161 tree = create_tree (dfa, tree, exp, CONCAT);
2162 if (tree == NULL)
2163 {
2164 *err = REG_ESPACE;
2165 return NULL;
2166 }
2167 }
2168 else if (tree == NULL)
2169 tree = exp;
2170 /* Otherwise exp == NULL, we don't need to create new tree. */
2171 }
2172 return tree;
2173 }
2174
2175 /* This function build the following tree, from regular expression a*:
2176 *
2177 |
2178 a
2179 */
2180
2181 static bin_tree_t *
2182 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2183 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2184 {
2185 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2186 bin_tree_t *tree;
2187 switch (token->type)
2188 {
2189 case CHARACTER:
2190 tree = create_token_tree (dfa, NULL, NULL, token);
2191 if (BE (tree == NULL, 0))
2192 {
2193 *err = REG_ESPACE;
2194 return NULL;
2195 }
2196 #ifdef RE_ENABLE_I18N
2197 if (dfa->mb_cur_max > 1)
2198 {
2199 while (!re_string_eoi (regexp)
2200 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2201 {
2202 bin_tree_t *mbc_remain;
2203 fetch_token (token, regexp, syntax);
2204 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2205 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2206 if (BE (mbc_remain == NULL || tree == NULL, 0))
2207 {
2208 *err = REG_ESPACE;
2209 return NULL;
2210 }
2211 }
2212 }
2213 #endif
2214 break;
2215 case OP_OPEN_SUBEXP:
2216 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2217 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2218 return NULL;
2219 break;
2220 case OP_OPEN_BRACKET:
2221 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2222 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223 return NULL;
2224 break;
2225 case OP_BACK_REF:
2226 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2227 {
2228 *err = REG_ESUBREG;
2229 return NULL;
2230 }
2231 dfa->used_bkref_map |= 1 << token->opr.idx;
2232 tree = create_token_tree (dfa, NULL, NULL, token);
2233 if (BE (tree == NULL, 0))
2234 {
2235 *err = REG_ESPACE;
2236 return NULL;
2237 }
2238 ++dfa->nbackref;
2239 dfa->has_mb_node = 1;
2240 break;
2241 case OP_OPEN_DUP_NUM:
2242 if (syntax & RE_CONTEXT_INVALID_DUP)
2243 {
2244 *err = REG_BADRPT;
2245 return NULL;
2246 }
2247 /* FALLTHROUGH */
2248 case OP_DUP_ASTERISK:
2249 case OP_DUP_PLUS:
2250 case OP_DUP_QUESTION:
2251 if (syntax & RE_CONTEXT_INVALID_OPS)
2252 {
2253 *err = REG_BADRPT;
2254 return NULL;
2255 }
2256 else if (syntax & RE_CONTEXT_INDEP_OPS)
2257 {
2258 fetch_token (token, regexp, syntax);
2259 return parse_expression (regexp, preg, token, syntax, nest, err);
2260 }
2261 /* else fall through */
2262 case OP_CLOSE_SUBEXP:
2263 if ((token->type == OP_CLOSE_SUBEXP) &&
2264 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2265 {
2266 *err = REG_ERPAREN;
2267 return NULL;
2268 }
2269 /* else fall through */
2270 case OP_CLOSE_DUP_NUM:
2271 /* We treat it as a normal character. */
2272
2273 /* Then we can these characters as normal characters. */
2274 token->type = CHARACTER;
2275 /* mb_partial and word_char bits should be initialized already
2276 by peek_token. */
2277 tree = create_token_tree (dfa, NULL, NULL, token);
2278 if (BE (tree == NULL, 0))
2279 {
2280 *err = REG_ESPACE;
2281 return NULL;
2282 }
2283 break;
2284 case ANCHOR:
2285 if ((token->opr.ctx_type
2286 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2287 && dfa->word_ops_used == 0)
2288 init_word_char (dfa);
2289 if (token->opr.ctx_type == WORD_DELIM
2290 || token->opr.ctx_type == NOT_WORD_DELIM)
2291 {
2292 bin_tree_t *tree_first, *tree_last;
2293 if (token->opr.ctx_type == WORD_DELIM)
2294 {
2295 token->opr.ctx_type = WORD_FIRST;
2296 tree_first = create_token_tree (dfa, NULL, NULL, token);
2297 token->opr.ctx_type = WORD_LAST;
2298 }
2299 else
2300 {
2301 token->opr.ctx_type = INSIDE_WORD;
2302 tree_first = create_token_tree (dfa, NULL, NULL, token);
2303 token->opr.ctx_type = INSIDE_NOTWORD;
2304 }
2305 tree_last = create_token_tree (dfa, NULL, NULL, token);
2306 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2307 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2308 {
2309 *err = REG_ESPACE;
2310 return NULL;
2311 }
2312 }
2313 else
2314 {
2315 tree = create_token_tree (dfa, NULL, NULL, token);
2316 if (BE (tree == NULL, 0))
2317 {
2318 *err = REG_ESPACE;
2319 return NULL;
2320 }
2321 }
2322 /* We must return here, since ANCHORs can't be followed
2323 by repetition operators.
2324 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2325 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2326 fetch_token (token, regexp, syntax);
2327 return tree;
2328 case OP_PERIOD:
2329 tree = create_token_tree (dfa, NULL, NULL, token);
2330 if (BE (tree == NULL, 0))
2331 {
2332 *err = REG_ESPACE;
2333 return NULL;
2334 }
2335 if (dfa->mb_cur_max > 1)
2336 dfa->has_mb_node = 1;
2337 break;
2338 case OP_WORD:
2339 case OP_NOTWORD:
2340 tree = build_charclass_op (dfa, regexp->trans,
2341 (const unsigned char *) "alnum",
2342 (const unsigned char *) "_",
2343 token->type == OP_NOTWORD, err);
2344 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2345 return NULL;
2346 break;
2347 case OP_SPACE:
2348 case OP_NOTSPACE:
2349 tree = build_charclass_op (dfa, regexp->trans,
2350 (const unsigned char *) "space",
2351 (const unsigned char *) "",
2352 token->type == OP_NOTSPACE, err);
2353 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2354 return NULL;
2355 break;
2356 case OP_ALT:
2357 case END_OF_RE:
2358 return NULL;
2359 case BACK_SLASH:
2360 *err = REG_EESCAPE;
2361 return NULL;
2362 default:
2363 /* Must not happen? */
2364 #ifdef DEBUG
2365 assert (0);
2366 #endif
2367 return NULL;
2368 }
2369 fetch_token (token, regexp, syntax);
2370
2371 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2372 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2373 {
2374 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2375 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2376 return NULL;
2377 /* In BRE consecutive duplications are not allowed. */
2378 if ((syntax & RE_CONTEXT_INVALID_DUP)
2379 && (token->type == OP_DUP_ASTERISK
2380 || token->type == OP_OPEN_DUP_NUM))
2381 {
2382 *err = REG_BADRPT;
2383 return NULL;
2384 }
2385 }
2386
2387 return tree;
2388 }
2389
2390 /* This function build the following tree, from regular expression
2391 (<reg_exp>):
2392 SUBEXP
2393 |
2394 <reg_exp>
2395 */
2396
2397 static bin_tree_t *
2398 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2399 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2400 {
2401 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2402 bin_tree_t *tree;
2403 size_t cur_nsub;
2404 cur_nsub = preg->re_nsub++;
2405
2406 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2407
2408 /* The subexpression may be a null string. */
2409 if (token->type == OP_CLOSE_SUBEXP)
2410 tree = NULL;
2411 else
2412 {
2413 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2414 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2415 *err = REG_EPAREN;
2416 if (BE (*err != REG_NOERROR, 0))
2417 return NULL;
2418 }
2419
2420 if (cur_nsub <= '9' - '1')
2421 dfa->completed_bkref_map |= 1 << cur_nsub;
2422
2423 tree = create_tree (dfa, tree, NULL, SUBEXP);
2424 if (BE (tree == NULL, 0))
2425 {
2426 *err = REG_ESPACE;
2427 return NULL;
2428 }
2429 tree->token.opr.idx = cur_nsub;
2430 return tree;
2431 }
2432
2433 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2434
2435 static bin_tree_t *
2436 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2437 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2438 {
2439 bin_tree_t *tree = NULL, *old_tree = NULL;
2440 int i, start, end, start_idx = re_string_cur_idx (regexp);
2441 re_token_t start_token = *token;
2442
2443 if (token->type == OP_OPEN_DUP_NUM)
2444 {
2445 end = 0;
2446 start = fetch_number (regexp, token, syntax);
2447 if (start == -1)
2448 {
2449 if (token->type == CHARACTER && token->opr.c == ',')
2450 start = 0; /* We treat "{,m}" as "{0,m}". */
2451 else
2452 {
2453 *err = REG_BADBR; /* <re>{} is invalid. */
2454 return NULL;
2455 }
2456 }
2457 if (BE (start != -2, 1))
2458 {
2459 /* We treat "{n}" as "{n,n}". */
2460 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2461 : ((token->type == CHARACTER && token->opr.c == ',')
2462 ? fetch_number (regexp, token, syntax) : -2));
2463 }
2464 if (BE (start == -2 || end == -2, 0))
2465 {
2466 /* Invalid sequence. */
2467 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2468 {
2469 if (token->type == END_OF_RE)
2470 *err = REG_EBRACE;
2471 else
2472 *err = REG_BADBR;
2473
2474 return NULL;
2475 }
2476
2477 /* If the syntax bit is set, rollback. */
2478 re_string_set_index (regexp, start_idx);
2479 *token = start_token;
2480 token->type = CHARACTER;
2481 /* mb_partial and word_char bits should be already initialized by
2482 peek_token. */
2483 return elem;
2484 }
2485
2486 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2487 {
2488 /* First number greater than second. */
2489 *err = REG_BADBR;
2490 return NULL;
2491 }
2492 }
2493 else
2494 {
2495 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2496 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2497 }
2498
2499 fetch_token (token, regexp, syntax);
2500
2501 if (BE (elem == NULL, 0))
2502 return NULL;
2503 if (BE (start == 0 && end == 0, 0))
2504 {
2505 postorder (elem, free_tree, NULL);
2506 return NULL;
2507 }
2508
2509 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2510 if (BE (start > 0, 0))
2511 {
2512 tree = elem;
2513 for (i = 2; i <= start; ++i)
2514 {
2515 elem = duplicate_tree (elem, dfa);
2516 tree = create_tree (dfa, tree, elem, CONCAT);
2517 if (BE (elem == NULL || tree == NULL, 0))
2518 goto parse_dup_op_espace;
2519 }
2520
2521 if (start == end)
2522 return tree;
2523
2524 /* Duplicate ELEM before it is marked optional. */
2525 elem = duplicate_tree (elem, dfa);
2526 old_tree = tree;
2527 }
2528 else
2529 old_tree = NULL;
2530
2531 if (elem->token.type == SUBEXP)
2532 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2533
2534 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2535 if (BE (tree == NULL, 0))
2536 goto parse_dup_op_espace;
2537
2538 /* This loop is actually executed only when end != -1,
2539 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2540 already created the start+1-th copy. */
2541 for (i = start + 2; i <= end; ++i)
2542 {
2543 elem = duplicate_tree (elem, dfa);
2544 tree = create_tree (dfa, tree, elem, CONCAT);
2545 if (BE (elem == NULL || tree == NULL, 0))
2546 goto parse_dup_op_espace;
2547
2548 tree = create_tree (dfa, tree, NULL, OP_ALT);
2549 if (BE (tree == NULL, 0))
2550 goto parse_dup_op_espace;
2551 }
2552
2553 if (old_tree)
2554 tree = create_tree (dfa, old_tree, tree, CONCAT);
2555
2556 return tree;
2557
2558 parse_dup_op_espace:
2559 *err = REG_ESPACE;
2560 return NULL;
2561 }
2562
2563 /* Size of the names for collating symbol/equivalence_class/character_class.
2564 I'm not sure, but maybe enough. */
2565 #define BRACKET_NAME_BUF_SIZE 32
2566
2567 #ifndef _LIBC
2568 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2569 Build the range expression which starts from START_ELEM, and ends
2570 at END_ELEM. The result are written to MBCSET and SBCSET.
2571 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2572 mbcset->range_ends, is a pointer argument sinse we may
2573 update it. */
2574
2575 static reg_errcode_t
2576 internal_function
2577 # ifdef RE_ENABLE_I18N
2578 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2579 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2580 # else /* not RE_ENABLE_I18N */
2581 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2582 bracket_elem_t *end_elem)
2583 # endif /* not RE_ENABLE_I18N */
2584 {
2585 unsigned int start_ch, end_ch;
2586 /* Equivalence Classes and Character Classes can't be a range start/end. */
2587 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2588 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2589 0))
2590 return REG_ERANGE;
2591
2592 /* We can handle no multi character collating elements without libc
2593 support. */
2594 if (BE ((start_elem->type == COLL_SYM
2595 && strlen ((char *) start_elem->opr.name) > 1)
2596 || (end_elem->type == COLL_SYM
2597 && strlen ((char *) end_elem->opr.name) > 1), 0))
2598 return REG_ECOLLATE;
2599
2600 # ifdef RE_ENABLE_I18N
2601 {
2602 wchar_t wc;
2603 wint_t start_wc;
2604 wint_t end_wc;
2605 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2606
2607 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2608 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2609 : 0));
2610 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2611 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2612 : 0));
2613 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2614 ? __btowc (start_ch) : start_elem->opr.wch);
2615 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2616 ? __btowc (end_ch) : end_elem->opr.wch);
2617 if (start_wc == WEOF || end_wc == WEOF)
2618 return REG_ECOLLATE;
2619 cmp_buf[0] = start_wc;
2620 cmp_buf[4] = end_wc;
2621 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2622 return REG_ERANGE;
2623
2624 /* Got valid collation sequence values, add them as a new entry.
2625 However, for !_LIBC we have no collation elements: if the
2626 character set is single byte, the single byte character set
2627 that we build below suffices. parse_bracket_exp passes
2628 no MBCSET if dfa->mb_cur_max == 1. */
2629 if (mbcset)
2630 {
2631 /* Check the space of the arrays. */
2632 if (BE (*range_alloc == mbcset->nranges, 0))
2633 {
2634 /* There is not enough space, need realloc. */
2635 wchar_t *new_array_start, *new_array_end;
2636 int new_nranges;
2637
2638 /* +1 in case of mbcset->nranges is 0. */
2639 new_nranges = 2 * mbcset->nranges + 1;
2640 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2641 are NULL if *range_alloc == 0. */
2642 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2643 new_nranges);
2644 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2645 new_nranges);
2646
2647 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2648 return REG_ESPACE;
2649
2650 mbcset->range_starts = new_array_start;
2651 mbcset->range_ends = new_array_end;
2652 *range_alloc = new_nranges;
2653 }
2654
2655 mbcset->range_starts[mbcset->nranges] = start_wc;
2656 mbcset->range_ends[mbcset->nranges++] = end_wc;
2657 }
2658
2659 /* Build the table for single byte characters. */
2660 for (wc = 0; wc < SBC_MAX; ++wc)
2661 {
2662 cmp_buf[2] = wc;
2663 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2664 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2665 bitset_set (sbcset, wc);
2666 }
2667 }
2668 # else /* not RE_ENABLE_I18N */
2669 {
2670 unsigned int ch;
2671 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2672 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2673 : 0));
2674 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2675 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2676 : 0));
2677 if (start_ch > end_ch)
2678 return REG_ERANGE;
2679 /* Build the table for single byte characters. */
2680 for (ch = 0; ch < SBC_MAX; ++ch)
2681 if (start_ch <= ch && ch <= end_ch)
2682 bitset_set (sbcset, ch);
2683 }
2684 # endif /* not RE_ENABLE_I18N */
2685 return REG_NOERROR;
2686 }
2687 #endif /* not _LIBC */
2688
2689 #ifndef _LIBC
2690 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2691 Build the collating element which is represented by NAME.
2692 The result are written to MBCSET and SBCSET.
2693 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2694 pointer argument since we may update it. */
2695
2696 static reg_errcode_t
2697 internal_function
2698 # ifdef RE_ENABLE_I18N
2699 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2700 int *coll_sym_alloc, const unsigned char *name)
2701 # else /* not RE_ENABLE_I18N */
2702 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2703 # endif /* not RE_ENABLE_I18N */
2704 {
2705 size_t name_len = strlen ((const char *) name);
2706 if (BE (name_len != 1, 0))
2707 return REG_ECOLLATE;
2708 else
2709 {
2710 bitset_set (sbcset, name[0]);
2711 return REG_NOERROR;
2712 }
2713 }
2714 #endif /* not _LIBC */
2715
2716 /* This function parse bracket expression like "[abc]", "[a-c]",
2717 "[[.a-a.]]" etc. */
2718
2719 static bin_tree_t *
2720 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2721 reg_syntax_t syntax, reg_errcode_t *err)
2722 {
2723 #ifdef _LIBC
2724 const unsigned char *collseqmb;
2725 const char *collseqwc;
2726 uint32_t nrules;
2727 int32_t table_size;
2728 const int32_t *symb_table;
2729 const unsigned char *extra;
2730
2731 /* Local function for parse_bracket_exp used in _LIBC environement.
2732 Seek the collating symbol entry correspondings to NAME.
2733 Return the index of the symbol in the SYMB_TABLE. */
2734
2735 auto inline int32_t
2736 __attribute ((always_inline))
2737 seek_collating_symbol_entry (name, name_len)
2738 const unsigned char *name;
2739 size_t name_len;
2740 {
2741 int32_t hash = elem_hash ((const char *) name, name_len);
2742 int32_t elem = hash % table_size;
2743 if (symb_table[2 * elem] != 0)
2744 {
2745 int32_t second = hash % (table_size - 2) + 1;
2746
2747 do
2748 {
2749 /* First compare the hashing value. */
2750 if (symb_table[2 * elem] == hash
2751 /* Compare the length of the name. */
2752 && name_len == extra[symb_table[2 * elem + 1]]
2753 /* Compare the name. */
2754 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2755 name_len) == 0)
2756 {
2757 /* Yep, this is the entry. */
2758 break;
2759 }
2760
2761 /* Next entry. */
2762 elem += second;
2763 }
2764 while (symb_table[2 * elem] != 0);
2765 }
2766 return elem;
2767 }
2768
2769 /* Local function for parse_bracket_exp used in _LIBC environment.
2770 Look up the collation sequence value of BR_ELEM.
2771 Return the value if succeeded, UINT_MAX otherwise. */
2772
2773 auto inline unsigned int
2774 __attribute ((always_inline))
2775 lookup_collation_sequence_value (br_elem)
2776 bracket_elem_t *br_elem;
2777 {
2778 if (br_elem->type == SB_CHAR)
2779 {
2780 /*
2781 if (MB_CUR_MAX == 1)
2782 */
2783 if (nrules == 0)
2784 return collseqmb[br_elem->opr.ch];
2785 else
2786 {
2787 wint_t wc = __btowc (br_elem->opr.ch);
2788 return __collseq_table_lookup (collseqwc, wc);
2789 }
2790 }
2791 else if (br_elem->type == MB_CHAR)
2792 {
2793 if (nrules != 0)
2794 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2795 }
2796 else if (br_elem->type == COLL_SYM)
2797 {
2798 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2799 if (nrules != 0)
2800 {
2801 int32_t elem, idx;
2802 elem = seek_collating_symbol_entry (br_elem->opr.name,
2803 sym_name_len);
2804 if (symb_table[2 * elem] != 0)
2805 {
2806 /* We found the entry. */
2807 idx = symb_table[2 * elem + 1];
2808 /* Skip the name of collating element name. */
2809 idx += 1 + extra[idx];
2810 /* Skip the byte sequence of the collating element. */
2811 idx += 1 + extra[idx];
2812 /* Adjust for the alignment. */
2813 idx = (idx + 3) & ~3;
2814 /* Skip the multibyte collation sequence value. */
2815 idx += sizeof (unsigned int);
2816 /* Skip the wide char sequence of the collating element. */
2817 idx += sizeof (unsigned int) *
2818 (1 + *(unsigned int *) (extra + idx));
2819 /* Return the collation sequence value. */
2820 return *(unsigned int *) (extra + idx);
2821 }
2822 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2823 {
2824 /* No valid character. Match it as a single byte
2825 character. */
2826 return collseqmb[br_elem->opr.name[0]];
2827 }
2828 }
2829 else if (sym_name_len == 1)
2830 return collseqmb[br_elem->opr.name[0]];
2831 }
2832 return UINT_MAX;
2833 }
2834
2835 /* Local function for parse_bracket_exp used in _LIBC environement.
2836 Build the range expression which starts from START_ELEM, and ends
2837 at END_ELEM. The result are written to MBCSET and SBCSET.
2838 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2839 mbcset->range_ends, is a pointer argument sinse we may
2840 update it. */
2841
2842 auto inline reg_errcode_t
2843 __attribute ((always_inline))
2844 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2845 re_charset_t *mbcset;
2846 int *range_alloc;
2847 bitset_t sbcset;
2848 bracket_elem_t *start_elem, *end_elem;
2849 {
2850 unsigned int ch;
2851 uint32_t start_collseq;
2852 uint32_t end_collseq;
2853
2854 /* Equivalence Classes and Character Classes can't be a range
2855 start/end. */
2856 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2857 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2858 0))
2859 return REG_ERANGE;
2860
2861 start_collseq = lookup_collation_sequence_value (start_elem);
2862 end_collseq = lookup_collation_sequence_value (end_elem);
2863 /* Check start/end collation sequence values. */
2864 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2865 return REG_ECOLLATE;
2866 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2867 return REG_ERANGE;
2868
2869 /* Got valid collation sequence values, add them as a new entry.
2870 However, if we have no collation elements, and the character set
2871 is single byte, the single byte character set that we
2872 build below suffices. */
2873 if (nrules > 0 || dfa->mb_cur_max > 1)
2874 {
2875 /* Check the space of the arrays. */
2876 if (BE (*range_alloc == mbcset->nranges, 0))
2877 {
2878 /* There is not enough space, need realloc. */
2879 uint32_t *new_array_start;
2880 uint32_t *new_array_end;
2881 int new_nranges;
2882
2883 /* +1 in case of mbcset->nranges is 0. */
2884 new_nranges = 2 * mbcset->nranges + 1;
2885 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2886 new_nranges);
2887 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2888 new_nranges);
2889
2890 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2891 return REG_ESPACE;
2892
2893 mbcset->range_starts = new_array_start;
2894 mbcset->range_ends = new_array_end;
2895 *range_alloc = new_nranges;
2896 }
2897
2898 mbcset->range_starts[mbcset->nranges] = start_collseq;
2899 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2900 }
2901
2902 /* Build the table for single byte characters. */
2903 for (ch = 0; ch < SBC_MAX; ch++)
2904 {
2905 uint32_t ch_collseq;
2906 /*
2907 if (MB_CUR_MAX == 1)
2908 */
2909 if (nrules == 0)
2910 ch_collseq = collseqmb[ch];
2911 else
2912 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2913 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2914 bitset_set (sbcset, ch);
2915 }
2916 return REG_NOERROR;
2917 }
2918
2919 /* Local function for parse_bracket_exp used in _LIBC environement.
2920 Build the collating element which is represented by NAME.
2921 The result are written to MBCSET and SBCSET.
2922 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2923 pointer argument sinse we may update it. */
2924
2925 auto inline reg_errcode_t
2926 __attribute ((always_inline))
2927 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2928 re_charset_t *mbcset;
2929 int *coll_sym_alloc;
2930 bitset_t sbcset;
2931 const unsigned char *name;
2932 {
2933 int32_t elem, idx;
2934 size_t name_len = strlen ((const char *) name);
2935 if (nrules != 0)
2936 {
2937 elem = seek_collating_symbol_entry (name, name_len);
2938 if (symb_table[2 * elem] != 0)
2939 {
2940 /* We found the entry. */
2941 idx = symb_table[2 * elem + 1];
2942 /* Skip the name of collating element name. */
2943 idx += 1 + extra[idx];
2944 }
2945 else if (symb_table[2 * elem] == 0 && name_len == 1)
2946 {
2947 /* No valid character, treat it as a normal
2948 character. */
2949 bitset_set (sbcset, name[0]);
2950 return REG_NOERROR;
2951 }
2952 else
2953 return REG_ECOLLATE;
2954
2955 /* Got valid collation sequence, add it as a new entry. */
2956 /* Check the space of the arrays. */
2957 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2958 {
2959 /* Not enough, realloc it. */
2960 /* +1 in case of mbcset->ncoll_syms is 0. */
2961 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2962 /* Use realloc since mbcset->coll_syms is NULL
2963 if *alloc == 0. */
2964 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2965 new_coll_sym_alloc);
2966 if (BE (new_coll_syms == NULL, 0))
2967 return REG_ESPACE;
2968 mbcset->coll_syms = new_coll_syms;
2969 *coll_sym_alloc = new_coll_sym_alloc;
2970 }
2971 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2972 return REG_NOERROR;
2973 }
2974 else
2975 {
2976 if (BE (name_len != 1, 0))
2977 return REG_ECOLLATE;
2978 else
2979 {
2980 bitset_set (sbcset, name[0]);
2981 return REG_NOERROR;
2982 }
2983 }
2984 }
2985 #endif
2986
2987 re_token_t br_token;
2988 re_bitset_ptr_t sbcset;
2989 #ifdef RE_ENABLE_I18N
2990 re_charset_t *mbcset;
2991 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2992 int equiv_class_alloc = 0, char_class_alloc = 0;
2993 #endif /* not RE_ENABLE_I18N */
2994 int non_match = 0;
2995 bin_tree_t *work_tree;
2996 int token_len;
2997 int first_round = 1;
2998 #ifdef _LIBC
2999 collseqmb = (const unsigned char *)
3000 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3001 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3002 if (nrules)
3003 {
3004 /*
3005 if (MB_CUR_MAX > 1)
3006 */
3007 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3008 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3009 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3010 _NL_COLLATE_SYMB_TABLEMB);
3011 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3012 _NL_COLLATE_SYMB_EXTRAMB);
3013 }
3014 #endif
3015 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3016 #ifdef RE_ENABLE_I18N
3017 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3018 #endif /* RE_ENABLE_I18N */
3019 #ifdef RE_ENABLE_I18N
3020 if (BE (sbcset == NULL || mbcset == NULL, 0))
3021 #else
3022 if (BE (sbcset == NULL, 0))
3023 #endif /* RE_ENABLE_I18N */
3024 {
3025 *err = REG_ESPACE;
3026 return NULL;
3027 }
3028
3029 token_len = peek_token_bracket (token, regexp, syntax);
3030 if (BE (token->type == END_OF_RE, 0))
3031 {
3032 *err = REG_BADPAT;
3033 goto parse_bracket_exp_free_return;
3034 }
3035 if (token->type == OP_NON_MATCH_LIST)
3036 {
3037 #ifdef RE_ENABLE_I18N
3038 mbcset->non_match = 1;
3039 #endif /* not RE_ENABLE_I18N */
3040 non_match = 1;
3041 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3042 bitset_set (sbcset, '\n');
3043 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3044 token_len = peek_token_bracket (token, regexp, syntax);
3045 if (BE (token->type == END_OF_RE, 0))
3046 {
3047 *err = REG_BADPAT;
3048 goto parse_bracket_exp_free_return;
3049 }
3050 }
3051
3052 /* We treat the first ']' as a normal character. */
3053 if (token->type == OP_CLOSE_BRACKET)
3054 token->type = CHARACTER;
3055
3056 while (1)
3057 {
3058 bracket_elem_t start_elem, end_elem;
3059 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3060 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3061 reg_errcode_t ret;
3062 int token_len2 = 0, is_range_exp = 0;
3063 re_token_t token2;
3064
3065 start_elem.opr.name = start_name_buf;
3066 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3067 syntax, first_round);
3068 if (BE (ret != REG_NOERROR, 0))
3069 {
3070 *err = ret;
3071 goto parse_bracket_exp_free_return;
3072 }
3073 first_round = 0;
3074
3075 /* Get information about the next token. We need it in any case. */
3076 token_len = peek_token_bracket (token, regexp, syntax);
3077
3078 /* Do not check for ranges if we know they are not allowed. */
3079 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3080 {
3081 if (BE (token->type == END_OF_RE, 0))
3082 {
3083 *err = REG_EBRACK;
3084 goto parse_bracket_exp_free_return;
3085 }
3086 if (token->type == OP_CHARSET_RANGE)
3087 {
3088 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3089 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3090 if (BE (token2.type == END_OF_RE, 0))
3091 {
3092 *err = REG_EBRACK;
3093 goto parse_bracket_exp_free_return;
3094 }
3095 if (token2.type == OP_CLOSE_BRACKET)
3096 {
3097 /* We treat the last '-' as a normal character. */
3098 re_string_skip_bytes (regexp, -token_len);
3099 token->type = CHARACTER;
3100 }
3101 else
3102 is_range_exp = 1;
3103 }
3104 }
3105
3106 if (is_range_exp == 1)
3107 {
3108 end_elem.opr.name = end_name_buf;
3109 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3110 dfa, syntax, 1);
3111 if (BE (ret != REG_NOERROR, 0))
3112 {
3113 *err = ret;
3114 goto parse_bracket_exp_free_return;
3115 }
3116
3117 token_len = peek_token_bracket (token, regexp, syntax);
3118
3119 #ifdef _LIBC
3120 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3121 &start_elem, &end_elem);
3122 #else
3123 # ifdef RE_ENABLE_I18N
3124 *err = build_range_exp (sbcset,
3125 dfa->mb_cur_max > 1 ? mbcset : NULL,
3126 &range_alloc, &start_elem, &end_elem);
3127 # else
3128 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3129 # endif
3130 #endif /* RE_ENABLE_I18N */
3131 if (BE (*err != REG_NOERROR, 0))
3132 goto parse_bracket_exp_free_return;
3133 }
3134 else
3135 {
3136 switch (start_elem.type)
3137 {
3138 case SB_CHAR:
3139 bitset_set (sbcset, start_elem.opr.ch);
3140 break;
3141 #ifdef RE_ENABLE_I18N
3142 case MB_CHAR:
3143 /* Check whether the array has enough space. */
3144 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3145 {
3146 wchar_t *new_mbchars;
3147 /* Not enough, realloc it. */
3148 /* +1 in case of mbcset->nmbchars is 0. */
3149 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3150 /* Use realloc since array is NULL if *alloc == 0. */
3151 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3152 mbchar_alloc);
3153 if (BE (new_mbchars == NULL, 0))
3154 goto parse_bracket_exp_espace;
3155 mbcset->mbchars = new_mbchars;
3156 }
3157 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3158 break;
3159 #endif /* RE_ENABLE_I18N */
3160 case EQUIV_CLASS:
3161 *err = build_equiv_class (sbcset,
3162 #ifdef RE_ENABLE_I18N
3163 mbcset, &equiv_class_alloc,
3164 #endif /* RE_ENABLE_I18N */
3165 start_elem.opr.name);
3166 if (BE (*err != REG_NOERROR, 0))
3167 goto parse_bracket_exp_free_return;
3168 break;
3169 case COLL_SYM:
3170 *err = build_collating_symbol (sbcset,
3171 #ifdef RE_ENABLE_I18N
3172 mbcset, &coll_sym_alloc,
3173 #endif /* RE_ENABLE_I18N */
3174 start_elem.opr.name);
3175 if (BE (*err != REG_NOERROR, 0))
3176 goto parse_bracket_exp_free_return;
3177 break;
3178 case CHAR_CLASS:
3179 *err = build_charclass (regexp->trans, sbcset,
3180 #ifdef RE_ENABLE_I18N
3181 mbcset, &char_class_alloc,
3182 #endif /* RE_ENABLE_I18N */
3183 start_elem.opr.name, syntax);
3184 if (BE (*err != REG_NOERROR, 0))
3185 goto parse_bracket_exp_free_return;
3186 break;
3187 default:
3188 assert (0);
3189 break;
3190 }
3191 }
3192 if (BE (token->type == END_OF_RE, 0))
3193 {
3194 *err = REG_EBRACK;
3195 goto parse_bracket_exp_free_return;
3196 }
3197 if (token->type == OP_CLOSE_BRACKET)
3198 break;
3199 }
3200
3201 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3202
3203 /* If it is non-matching list. */
3204 if (non_match)
3205 bitset_not (sbcset);
3206
3207 #ifdef RE_ENABLE_I18N
3208 /* Ensure only single byte characters are set. */
3209 if (dfa->mb_cur_max > 1)
3210 bitset_mask (sbcset, dfa->sb_char);
3211
3212 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3213 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3214 || mbcset->non_match)))
3215 {
3216 bin_tree_t *mbc_tree;
3217 int sbc_idx;
3218 /* Build a tree for complex bracket. */
3219 dfa->has_mb_node = 1;
3220 br_token.type = COMPLEX_BRACKET;
3221 br_token.opr.mbcset = mbcset;
3222 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3223 if (BE (mbc_tree == NULL, 0))
3224 goto parse_bracket_exp_espace;
3225 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3226 if (sbcset[sbc_idx])
3227 break;
3228 /* If there are no bits set in sbcset, there is no point
3229 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3230 if (sbc_idx < BITSET_WORDS)
3231 {
3232 /* Build a tree for simple bracket. */
3233 br_token.type = SIMPLE_BRACKET;
3234 br_token.opr.sbcset = sbcset;
3235 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3236 if (BE (work_tree == NULL, 0))
3237 goto parse_bracket_exp_espace;
3238
3239 /* Then join them by ALT node. */
3240 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3241 if (BE (work_tree == NULL, 0))
3242 goto parse_bracket_exp_espace;
3243 }
3244 else
3245 {
3246 re_free (sbcset);
3247 work_tree = mbc_tree;
3248 }
3249 }
3250 else
3251 #endif /* not RE_ENABLE_I18N */
3252 {
3253 #ifdef RE_ENABLE_I18N
3254 free_charset (mbcset);
3255 #endif
3256 /* Build a tree for simple bracket. */
3257 br_token.type = SIMPLE_BRACKET;
3258 br_token.opr.sbcset = sbcset;
3259 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3260 if (BE (work_tree == NULL, 0))
3261 goto parse_bracket_exp_espace;
3262 }
3263 return work_tree;
3264
3265 parse_bracket_exp_espace:
3266 *err = REG_ESPACE;
3267 parse_bracket_exp_free_return:
3268 re_free (sbcset);
3269 #ifdef RE_ENABLE_I18N
3270 free_charset (mbcset);
3271 #endif /* RE_ENABLE_I18N */
3272 return NULL;
3273 }
3274
3275 /* Parse an element in the bracket expression. */
3276
3277 static reg_errcode_t
3278 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3279 re_token_t *token, int token_len, re_dfa_t *dfa,
3280 reg_syntax_t syntax, int accept_hyphen)
3281 {
3282 #ifdef RE_ENABLE_I18N
3283 int cur_char_size;
3284 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3285 if (cur_char_size > 1)
3286 {
3287 elem->type = MB_CHAR;
3288 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3289 re_string_skip_bytes (regexp, cur_char_size);
3290 return REG_NOERROR;
3291 }
3292 #endif /* RE_ENABLE_I18N */
3293 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3294 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3295 || token->type == OP_OPEN_EQUIV_CLASS)
3296 return parse_bracket_symbol (elem, regexp, token);
3297 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3298 {
3299 /* A '-' must only appear as anything but a range indicator before
3300 the closing bracket. Everything else is an error. */
3301 re_token_t token2;
3302 (void) peek_token_bracket (&token2, regexp, syntax);
3303 if (token2.type != OP_CLOSE_BRACKET)
3304 /* The actual error value is not standardized since this whole
3305 case is undefined. But ERANGE makes good sense. */
3306 return REG_ERANGE;
3307 }
3308 elem->type = SB_CHAR;
3309 elem->opr.ch = token->opr.c;
3310 return REG_NOERROR;
3311 }
3312
3313 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3314 such as [:<character_class>:], [.<collating_element>.], and
3315 [=<equivalent_class>=]. */
3316
3317 static reg_errcode_t
3318 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3319 re_token_t *token)
3320 {
3321 unsigned char ch, delim = token->opr.c;
3322 int i = 0;
3323 if (re_string_eoi(regexp))
3324 return REG_EBRACK;
3325 for (;; ++i)
3326 {
3327 if (i >= BRACKET_NAME_BUF_SIZE)
3328 return REG_EBRACK;
3329 if (token->type == OP_OPEN_CHAR_CLASS)
3330 ch = re_string_fetch_byte_case (regexp);
3331 else
3332 ch = re_string_fetch_byte (regexp);
3333 if (re_string_eoi(regexp))
3334 return REG_EBRACK;
3335 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3336 break;
3337 elem->opr.name[i] = ch;
3338 }
3339 re_string_skip_bytes (regexp, 1);
3340 elem->opr.name[i] = '\0';
3341 switch (token->type)
3342 {
3343 case OP_OPEN_COLL_ELEM:
3344 elem->type = COLL_SYM;
3345 break;
3346 case OP_OPEN_EQUIV_CLASS:
3347 elem->type = EQUIV_CLASS;
3348 break;
3349 case OP_OPEN_CHAR_CLASS:
3350 elem->type = CHAR_CLASS;
3351 break;
3352 default:
3353 break;
3354 }
3355 return REG_NOERROR;
3356 }
3357
3358 /* Helper function for parse_bracket_exp.
3359 Build the equivalence class which is represented by NAME.
3360 The result are written to MBCSET and SBCSET.
3361 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3362 is a pointer argument sinse we may update it. */
3363
3364 static reg_errcode_t
3365 #ifdef RE_ENABLE_I18N
3366 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3367 int *equiv_class_alloc, const unsigned char *name)
3368 #else /* not RE_ENABLE_I18N */
3369 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3370 #endif /* not RE_ENABLE_I18N */
3371 {
3372 #ifdef _LIBC
3373 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3374 if (nrules != 0)
3375 {
3376 const int32_t *table, *indirect;
3377 const unsigned char *weights, *extra, *cp;
3378 unsigned char char_buf[2];
3379 int32_t idx1, idx2;
3380 unsigned int ch;
3381 size_t len;
3382 /* This #include defines a local function! */
3383 # include <locale/weight.h>
3384 /* Calculate the index for equivalence class. */
3385 cp = name;
3386 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3387 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3388 _NL_COLLATE_WEIGHTMB);
3389 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3390 _NL_COLLATE_EXTRAMB);
3391 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3392 _NL_COLLATE_INDIRECTMB);
3393 idx1 = findidx (&cp);
3394 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3395 /* This isn't a valid character. */
3396 return REG_ECOLLATE;
3397
3398 /* Build single byte matcing table for this equivalence class. */
3399 char_buf[1] = (unsigned char) '\0';
3400 len = weights[idx1 & 0xffffff];
3401 for (ch = 0; ch < SBC_MAX; ++ch)
3402 {
3403 char_buf[0] = ch;
3404 cp = char_buf;
3405 idx2 = findidx (&cp);
3406 /*
3407 idx2 = table[ch];
3408 */
3409 if (idx2 == 0)
3410 /* This isn't a valid character. */
3411 continue;
3412 /* Compare only if the length matches and the collation rule
3413 index is the same. */
3414 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3415 {
3416 int cnt = 0;
3417
3418 while (cnt <= len &&
3419 weights[(idx1 & 0xffffff) + 1 + cnt]
3420 == weights[(idx2 & 0xffffff) + 1 + cnt])
3421 ++cnt;
3422
3423 if (cnt > len)
3424 bitset_set (sbcset, ch);
3425 }
3426 }
3427 /* Check whether the array has enough space. */
3428 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3429 {
3430 /* Not enough, realloc it. */
3431 /* +1 in case of mbcset->nequiv_classes is 0. */
3432 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3433 /* Use realloc since the array is NULL if *alloc == 0. */
3434 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3435 int32_t,
3436 new_equiv_class_alloc);
3437 if (BE (new_equiv_classes == NULL, 0))
3438 return REG_ESPACE;
3439 mbcset->equiv_classes = new_equiv_classes;
3440 *equiv_class_alloc = new_equiv_class_alloc;
3441 }
3442 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3443 }
3444 else
3445 #endif /* _LIBC */
3446 {
3447 if (BE (strlen ((const char *) name) != 1, 0))
3448 return REG_ECOLLATE;
3449 bitset_set (sbcset, *name);
3450 }
3451 return REG_NOERROR;
3452 }
3453
3454 /* Helper function for parse_bracket_exp.
3455 Build the character class which is represented by NAME.
3456 The result are written to MBCSET and SBCSET.
3457 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3458 is a pointer argument sinse we may update it. */
3459
3460 static reg_errcode_t
3461 #ifdef RE_ENABLE_I18N
3462 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3463 re_charset_t *mbcset, int *char_class_alloc,
3464 const unsigned char *class_name, reg_syntax_t syntax)
3465 #else /* not RE_ENABLE_I18N */
3466 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3467 const unsigned char *class_name, reg_syntax_t syntax)
3468 #endif /* not RE_ENABLE_I18N */
3469 {
3470 int i;
3471 const char *name = (const char *) class_name;
3472
3473 /* In case of REG_ICASE "upper" and "lower" match the both of
3474 upper and lower cases. */
3475 if ((syntax & RE_ICASE)
3476 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3477 name = "alpha";
3478
3479 #ifdef RE_ENABLE_I18N
3480 /* Check the space of the arrays. */
3481 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3482 {
3483 /* Not enough, realloc it. */
3484 /* +1 in case of mbcset->nchar_classes is 0. */
3485 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3486 /* Use realloc since array is NULL if *alloc == 0. */
3487 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3488 new_char_class_alloc);
3489 if (BE (new_char_classes == NULL, 0))
3490 return REG_ESPACE;
3491 mbcset->char_classes = new_char_classes;
3492 *char_class_alloc = new_char_class_alloc;
3493 }
3494 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3495 #endif /* RE_ENABLE_I18N */
3496
3497 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3498 do { \
3499 if (BE (trans != NULL, 0)) \
3500 { \
3501 for (i = 0; i < SBC_MAX; ++i) \
3502 if (ctype_func (i)) \
3503 bitset_set (sbcset, trans[i]); \
3504 } \
3505 else \
3506 { \
3507 for (i = 0; i < SBC_MAX; ++i) \
3508 if (ctype_func (i)) \
3509 bitset_set (sbcset, i); \
3510 } \
3511 } while (0)
3512
3513 if (strcmp (name, "alnum") == 0)
3514 BUILD_CHARCLASS_LOOP (isalnum);
3515 else if (strcmp (name, "cntrl") == 0)
3516 BUILD_CHARCLASS_LOOP (iscntrl);
3517 else if (strcmp (name, "lower") == 0)
3518 BUILD_CHARCLASS_LOOP (islower);
3519 else if (strcmp (name, "space") == 0)
3520 BUILD_CHARCLASS_LOOP (isspace);
3521 else if (strcmp (name, "alpha") == 0)
3522 BUILD_CHARCLASS_LOOP (isalpha);
3523 else if (strcmp (name, "digit") == 0)
3524 BUILD_CHARCLASS_LOOP (isdigit);
3525 else if (strcmp (name, "print") == 0)
3526 BUILD_CHARCLASS_LOOP (isprint);
3527 else if (strcmp (name, "upper") == 0)
3528 BUILD_CHARCLASS_LOOP (isupper);
3529 else if (strcmp (name, "blank") == 0)
3530 BUILD_CHARCLASS_LOOP (isblank);
3531 else if (strcmp (name, "graph") == 0)
3532 BUILD_CHARCLASS_LOOP (isgraph);
3533 else if (strcmp (name, "punct") == 0)
3534 BUILD_CHARCLASS_LOOP (ispunct);
3535 else if (strcmp (name, "xdigit") == 0)
3536 BUILD_CHARCLASS_LOOP (isxdigit);
3537 else
3538 return REG_ECTYPE;
3539
3540 return REG_NOERROR;
3541 }
3542
3543 static bin_tree_t *
3544 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3545 const unsigned char *class_name,
3546 const unsigned char *extra, int non_match,
3547 reg_errcode_t *err)
3548 {
3549 re_bitset_ptr_t sbcset;
3550 #ifdef RE_ENABLE_I18N
3551 re_charset_t *mbcset;
3552 int alloc = 0;
3553 #endif /* not RE_ENABLE_I18N */
3554 reg_errcode_t ret;
3555 re_token_t br_token;
3556 bin_tree_t *tree;
3557
3558 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3559 #ifdef RE_ENABLE_I18N
3560 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3561 #endif /* RE_ENABLE_I18N */
3562
3563 #ifdef RE_ENABLE_I18N
3564 if (BE (sbcset == NULL || mbcset == NULL, 0))
3565 #else /* not RE_ENABLE_I18N */
3566 if (BE (sbcset == NULL, 0))
3567 #endif /* not RE_ENABLE_I18N */
3568 {
3569 *err = REG_ESPACE;
3570 return NULL;
3571 }
3572
3573 if (non_match)
3574 {
3575 #ifdef RE_ENABLE_I18N
3576 mbcset->non_match = 1;
3577 #endif /* not RE_ENABLE_I18N */
3578 }
3579
3580 /* We don't care the syntax in this case. */
3581 ret = build_charclass (trans, sbcset,
3582 #ifdef RE_ENABLE_I18N
3583 mbcset, &alloc,
3584 #endif /* RE_ENABLE_I18N */
3585 class_name, 0);
3586
3587 if (BE (ret != REG_NOERROR, 0))
3588 {
3589 re_free (sbcset);
3590 #ifdef RE_ENABLE_I18N
3591 free_charset (mbcset);
3592 #endif /* RE_ENABLE_I18N */
3593 *err = ret;
3594 return NULL;
3595 }
3596 /* \w match '_' also. */
3597 for (; *extra; extra++)
3598 bitset_set (sbcset, *extra);
3599
3600 /* If it is non-matching list. */
3601 if (non_match)
3602 bitset_not (sbcset);
3603
3604 #ifdef RE_ENABLE_I18N
3605 /* Ensure only single byte characters are set. */
3606 if (dfa->mb_cur_max > 1)
3607 bitset_mask (sbcset, dfa->sb_char);
3608 #endif
3609
3610 /* Build a tree for simple bracket. */
3611 br_token.type = SIMPLE_BRACKET;
3612 br_token.opr.sbcset = sbcset;
3613 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3614 if (BE (tree == NULL, 0))
3615 goto build_word_op_espace;
3616
3617 #ifdef RE_ENABLE_I18N
3618 if (dfa->mb_cur_max > 1)
3619 {
3620 bin_tree_t *mbc_tree;
3621 /* Build a tree for complex bracket. */
3622 br_token.type = COMPLEX_BRACKET;
3623 br_token.opr.mbcset = mbcset;
3624 dfa->has_mb_node = 1;
3625 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3626 if (BE (mbc_tree == NULL, 0))
3627 goto build_word_op_espace;
3628 /* Then join them by ALT node. */
3629 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3630 if (BE (mbc_tree != NULL, 1))
3631 return tree;
3632 }
3633 else
3634 {
3635 free_charset (mbcset);
3636 return tree;
3637 }
3638 #else /* not RE_ENABLE_I18N */
3639 return tree;
3640 #endif /* not RE_ENABLE_I18N */
3641
3642 build_word_op_espace:
3643 re_free (sbcset);
3644 #ifdef RE_ENABLE_I18N
3645 free_charset (mbcset);
3646 #endif /* RE_ENABLE_I18N */
3647 *err = REG_ESPACE;
3648 return NULL;
3649 }
3650
3651 /* This is intended for the expressions like "a{1,3}".
3652 Fetch a number from `input', and return the number.
3653 Return -1, if the number field is empty like "{,1}".
3654 Return -2, If an error is occured. */
3655
3656 static int
3657 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3658 {
3659 int num = -1;
3660 unsigned char c;
3661 while (1)
3662 {
3663 fetch_token (token, input, syntax);
3664 c = token->opr.c;
3665 if (BE (token->type == END_OF_RE, 0))
3666 return -2;
3667 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3668 break;
3669 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3670 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3671 num = (num > RE_DUP_MAX) ? -2 : num;
3672 }
3673 return num;
3674 }
3675 \f
3676 #ifdef RE_ENABLE_I18N
3677 static void
3678 free_charset (re_charset_t *cset)
3679 {
3680 re_free (cset->mbchars);
3681 # ifdef _LIBC
3682 re_free (cset->coll_syms);
3683 re_free (cset->equiv_classes);
3684 re_free (cset->range_starts);
3685 re_free (cset->range_ends);
3686 # endif
3687 re_free (cset->char_classes);
3688 re_free (cset);
3689 }
3690 #endif /* RE_ENABLE_I18N */
3691 \f
3692 /* Functions for binary tree operation. */
3693
3694 /* Create a tree node. */
3695
3696 static bin_tree_t *
3697 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3698 re_token_type_t type)
3699 {
3700 re_token_t t;
3701 t.type = type;
3702 return create_token_tree (dfa, left, right, &t);
3703 }
3704
3705 static bin_tree_t *
3706 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3707 const re_token_t *token)
3708 {
3709 bin_tree_t *tree;
3710 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3711 {
3712 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3713
3714 if (storage == NULL)
3715 return NULL;
3716 storage->next = dfa->str_tree_storage;
3717 dfa->str_tree_storage = storage;
3718 dfa->str_tree_storage_idx = 0;
3719 }
3720 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3721
3722 tree->parent = NULL;
3723 tree->left = left;
3724 tree->right = right;
3725 tree->token = *token;
3726 tree->token.duplicated = 0;
3727 tree->token.opt_subexp = 0;
3728 tree->first = NULL;
3729 tree->next = NULL;
3730 tree->node_idx = -1;
3731
3732 if (left != NULL)
3733 left->parent = tree;
3734 if (right != NULL)
3735 right->parent = tree;
3736 return tree;
3737 }
3738
3739 /* Mark the tree SRC as an optional subexpression.
3740 To be called from preorder or postorder. */
3741
3742 static reg_errcode_t
3743 mark_opt_subexp (void *extra, bin_tree_t *node)
3744 {
3745 int idx = (int) (long) extra;
3746 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3747 node->token.opt_subexp = 1;
3748
3749 return REG_NOERROR;
3750 }
3751
3752 /* Free the allocated memory inside NODE. */
3753
3754 static void
3755 free_token (re_token_t *node)
3756 {
3757 #ifdef RE_ENABLE_I18N
3758 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3759 free_charset (node->opr.mbcset);
3760 else
3761 #endif /* RE_ENABLE_I18N */
3762 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3763 re_free (node->opr.sbcset);
3764 }
3765
3766 /* Worker function for tree walking. Free the allocated memory inside NODE
3767 and its children. */
3768
3769 static reg_errcode_t
3770 free_tree (void *extra, bin_tree_t *node)
3771 {
3772 free_token (&node->token);
3773 return REG_NOERROR;
3774 }
3775
3776
3777 /* Duplicate the node SRC, and return new node. This is a preorder
3778 visit similar to the one implemented by the generic visitor, but
3779 we need more infrastructure to maintain two parallel trees --- so,
3780 it's easier to duplicate. */
3781
3782 static bin_tree_t *
3783 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3784 {
3785 const bin_tree_t *node;
3786 bin_tree_t *dup_root;
3787 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3788
3789 for (node = root; ; )
3790 {
3791 /* Create a new tree and link it back to the current parent. */
3792 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3793 if (*p_new == NULL)
3794 return NULL;
3795 (*p_new)->parent = dup_node;
3796 (*p_new)->token.duplicated = 1;
3797 dup_node = *p_new;
3798
3799 /* Go to the left node, or up and to the right. */
3800 if (node->left)
3801 {
3802 node = node->left;
3803 p_new = &dup_node->left;
3804 }
3805 else
3806 {
3807 const bin_tree_t *prev = NULL;
3808 while (node->right == prev || node->right == NULL)
3809 {
3810 prev = node;
3811 node = node->parent;
3812 dup_node = dup_node->parent;
3813 if (!node)
3814 return dup_root;
3815 }
3816 node = node->right;
3817 p_new = &dup_node->right;
3818 }
3819 }
3820 }