]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
[BZ #281]
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
39 void *extra);
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
42 void *extra);
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 bin_tree_t *node);
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
51 int top_clone_node, int root_node,
52 unsigned int constraint);
53 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
54 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
55 unsigned int constraint);
56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
58 int node, int root);
59 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
60 static int fetch_number (re_string_t *input, re_token_t *token,
61 reg_syntax_t syntax);
62 static void fetch_token (re_token_t *result, re_string_t *input,
63 reg_syntax_t syntax);
64 static int peek_token (re_token_t *token, re_string_t *input,
65 reg_syntax_t syntax);
66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
69 reg_syntax_t syntax, reg_errcode_t *err);
70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
80 re_token_t *token, reg_syntax_t syntax,
81 int nest, reg_errcode_t *err);
82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
83 re_dfa_t *dfa, re_token_t *token,
84 reg_syntax_t syntax, reg_errcode_t *err);
85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
86 re_token_t *token, reg_syntax_t syntax,
87 reg_errcode_t *err);
88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
89 re_string_t *regexp,
90 re_token_t *token, int token_len,
91 re_dfa_t *dfa,
92 reg_syntax_t syntax,
93 int accept_hyphen);
94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
95 re_string_t *regexp,
96 re_token_t *token);
97 #ifndef _LIBC
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
100 re_charset_t *mbcset, int *range_alloc,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 re_charset_t *mbcset,
105 int *coll_sym_alloc,
106 const unsigned char *name);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
109 bracket_elem_t *start_elem,
110 bracket_elem_t *end_elem);
111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
112 const unsigned char *name);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
117 re_charset_t *mbcset,
118 int *equiv_class_alloc,
119 const unsigned char *name);
120 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
121 re_bitset_ptr_t sbcset,
122 re_charset_t *mbcset,
123 int *char_class_alloc,
124 const unsigned char *class_name,
125 reg_syntax_t syntax);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
128 const unsigned char *name);
129 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
130 re_bitset_ptr_t sbcset,
131 const unsigned char *class_name,
132 reg_syntax_t syntax);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
135 RE_TRANSLATE_TYPE trans,
136 const unsigned char *class_name,
137 const unsigned char *extra,
138 int non_match, reg_errcode_t *err);
139 static bin_tree_t *create_tree (re_dfa_t *dfa,
140 bin_tree_t *left, bin_tree_t *right,
141 re_token_type_t type);
142 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
143 bin_tree_t *left, bin_tree_t *right,
144 const re_token_t *token);
145 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
146 static void free_token (re_token_t *node);
147 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
148 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
149 \f
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
154
155 const char __re_error_msgid[] attribute_hidden =
156 {
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
159 "\0"
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
162 "\0"
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
165 "\0"
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
168 "\0"
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
171 "\0"
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
174 "\0"
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
177 "\0"
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
180 "\0"
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
183 "\0"
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
186 "\0"
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
189 "\0"
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
192 "\0"
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
195 "\0"
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
198 "\0"
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
201 "\0"
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
204 "\0"
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
207 };
208
209 const size_t __re_error_msgid_idx[] attribute_hidden =
210 {
211 REG_NOERROR_IDX,
212 REG_NOMATCH_IDX,
213 REG_BADPAT_IDX,
214 REG_ECOLLATE_IDX,
215 REG_ECTYPE_IDX,
216 REG_EESCAPE_IDX,
217 REG_ESUBREG_IDX,
218 REG_EBRACK_IDX,
219 REG_EPAREN_IDX,
220 REG_EBRACE_IDX,
221 REG_BADBR_IDX,
222 REG_ERANGE_IDX,
223 REG_ESPACE_IDX,
224 REG_BADRPT_IDX,
225 REG_EEND_IDX,
226 REG_ESIZE_IDX,
227 REG_ERPAREN_IDX
228 };
229 \f
230 /* Entry points for GNU code. */
231
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
235
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
238
239 const char *
240 re_compile_pattern (pattern, length, bufp)
241 const char *pattern;
242 size_t length;
243 struct re_pattern_buffer *bufp;
244 {
245 reg_errcode_t ret;
246
247 /* And GNU code determines whether or not to get register information
248 by passing null for the REGS argument to re_match, etc., not by
249 setting no_sub, unless RE_NO_SUB is set. */
250 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
251
252 /* Match anchors at newline. */
253 bufp->newline_anchor = 1;
254
255 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
256
257 if (!ret)
258 return NULL;
259 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
260 }
261 #ifdef _LIBC
262 weak_alias (__re_compile_pattern, re_compile_pattern)
263 #endif
264
265 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
266 also be assigned to arbitrarily: each pattern buffer stores its own
267 syntax, so it can be changed between regex compilations. */
268 /* This has no initializer because initialized variables in Emacs
269 become read-only after dumping. */
270 reg_syntax_t re_syntax_options;
271
272
273 /* Specify the precise syntax of regexps for compilation. This provides
274 for compatibility for various utilities which historically have
275 different, incompatible syntaxes.
276
277 The argument SYNTAX is a bit mask comprised of the various bits
278 defined in regex.h. We return the old syntax. */
279
280 reg_syntax_t
281 re_set_syntax (syntax)
282 reg_syntax_t syntax;
283 {
284 reg_syntax_t ret = re_syntax_options;
285
286 re_syntax_options = syntax;
287 return ret;
288 }
289 #ifdef _LIBC
290 weak_alias (__re_set_syntax, re_set_syntax)
291 #endif
292
293 int
294 re_compile_fastmap (bufp)
295 struct re_pattern_buffer *bufp;
296 {
297 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
298 char *fastmap = bufp->fastmap;
299
300 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
301 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
302 if (dfa->init_state != dfa->init_state_word)
303 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
304 if (dfa->init_state != dfa->init_state_nl)
305 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
306 if (dfa->init_state != dfa->init_state_begbuf)
307 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
308 bufp->fastmap_accurate = 1;
309 return 0;
310 }
311 #ifdef _LIBC
312 weak_alias (__re_compile_fastmap, re_compile_fastmap)
313 #endif
314
315 static inline void
316 __attribute ((always_inline))
317 re_set_fastmap (char *fastmap, int icase, int ch)
318 {
319 fastmap[ch] = 1;
320 if (icase)
321 fastmap[tolower (ch)] = 1;
322 }
323
324 /* Helper function for re_compile_fastmap.
325 Compile fastmap for the initial_state INIT_STATE. */
326
327 static void
328 re_compile_fastmap_iter (bufp, init_state, fastmap)
329 regex_t *bufp;
330 const re_dfastate_t *init_state;
331 char *fastmap;
332 {
333 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
334 int node_cnt;
335 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
336 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
337 {
338 int node = init_state->nodes.elems[node_cnt];
339 re_token_type_t type = dfa->nodes[node].type;
340
341 if (type == CHARACTER)
342 {
343 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
344 #ifdef RE_ENABLE_I18N
345 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
346 {
347 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
348 wchar_t wc;
349 mbstate_t state;
350
351 p = buf;
352 *p++ = dfa->nodes[node].opr.c;
353 while (++node < dfa->nodes_len
354 && dfa->nodes[node].type == CHARACTER
355 && dfa->nodes[node].mb_partial)
356 *p++ = dfa->nodes[node].opr.c;
357 memset (&state, 0, sizeof (state));
358 if (mbrtowc (&wc, (const char *) buf, p - buf,
359 &state) == p - buf
360 && (__wcrtomb ((char *) buf, towlower (wc), &state)
361 != (size_t) -1))
362 re_set_fastmap (fastmap, 0, buf[0]);
363 }
364 #endif
365 }
366 else if (type == SIMPLE_BRACKET)
367 {
368 int i, j, ch;
369 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
370 for (j = 0; j < UINT_BITS; ++j, ++ch)
371 if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
372 re_set_fastmap (fastmap, icase, ch);
373 }
374 #ifdef RE_ENABLE_I18N
375 else if (type == COMPLEX_BRACKET)
376 {
377 int i;
378 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
379 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
380 || cset->nranges || cset->nchar_classes)
381 {
382 # ifdef _LIBC
383 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
384 {
385 /* In this case we want to catch the bytes which are
386 the first byte of any collation elements.
387 e.g. In da_DK, we want to catch 'a' since "aa"
388 is a valid collation element, and don't catch
389 'b' since 'b' is the only collation element
390 which starts from 'b'. */
391 int j, ch;
392 const int32_t *table = (const int32_t *)
393 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
394 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
395 for (j = 0; j < UINT_BITS; ++j, ++ch)
396 if (table[ch] < 0)
397 re_set_fastmap (fastmap, icase, ch);
398 }
399 # else
400 if (dfa->mb_cur_max > 1)
401 for (i = 0; i < SBC_MAX; ++i)
402 if (__btowc (i) == WEOF)
403 re_set_fastmap (fastmap, icase, i);
404 # endif /* not _LIBC */
405 }
406 for (i = 0; i < cset->nmbchars; ++i)
407 {
408 char buf[256];
409 mbstate_t state;
410 memset (&state, '\0', sizeof (state));
411 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
414 {
415 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416 != (size_t) -1)
417 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
418 }
419 }
420 }
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
427 {
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
431 return;
432 }
433 }
434 }
435 \f
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
438
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
441
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
451
452 PATTERN is the address of the pattern string.
453
454 CFLAGS is a series of bits which affect compilation.
455
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
458
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
461
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
464
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
467 registers.
468
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
471
472 int
473 regcomp (preg, pattern, cflags)
474 regex_t *__restrict preg;
475 const char *__restrict pattern;
476 int cflags;
477 {
478 reg_errcode_t ret;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
481
482 preg->buffer = NULL;
483 preg->allocated = 0;
484 preg->used = 0;
485
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
489 return REG_ESPACE;
490
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
500 }
501 else
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
505
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
511 ret = REG_EPAREN;
512
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
518 else
519 {
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
523 }
524
525 return (int) ret;
526 }
527 #ifdef _LIBC
528 weak_alias (__regcomp, regcomp)
529 #endif
530
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
533
534 size_t
535 regerror (errcode, preg, errbuf, errbuf_size)
536 int errcode;
537 const regex_t *__restrict preg;
538 char *__restrict errbuf;
539 size_t errbuf_size;
540 {
541 const char *msg;
542 size_t msg_size;
543
544 if (BE (errcode < 0
545 || errcode >= (int) (sizeof (__re_error_msgid_idx)
546 / sizeof (__re_error_msgid_idx[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
551 abort ();
552
553 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
554
555 msg_size = strlen (msg) + 1; /* Includes the null. */
556
557 if (BE (errbuf_size != 0, 1))
558 {
559 if (BE (msg_size > errbuf_size, 0))
560 {
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
563 #else
564 memcpy (errbuf, msg, errbuf_size - 1);
565 errbuf[errbuf_size - 1] = 0;
566 #endif
567 }
568 else
569 memcpy (errbuf, msg, msg_size);
570 }
571
572 return msg_size;
573 }
574 #ifdef _LIBC
575 weak_alias (__regerror, regerror)
576 #endif
577
578
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset utf8_sb_map =
585 {
586 /* Set the first 128 bits. */
587 # if UINT_MAX == 0xffffffff
588 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
589 # else
590 # error "Add case for new unsigned int size"
591 # endif
592 };
593 #endif
594
595
596 static void
597 free_dfa_content (re_dfa_t *dfa)
598 {
599 int i, j;
600
601 if (dfa->nodes)
602 for (i = 0; i < dfa->nodes_len; ++i)
603 free_token (dfa->nodes + i);
604 re_free (dfa->nexts);
605 for (i = 0; i < dfa->nodes_len; ++i)
606 {
607 if (dfa->eclosures != NULL)
608 re_node_set_free (dfa->eclosures + i);
609 if (dfa->inveclosures != NULL)
610 re_node_set_free (dfa->inveclosures + i);
611 if (dfa->edests != NULL)
612 re_node_set_free (dfa->edests + i);
613 }
614 re_free (dfa->edests);
615 re_free (dfa->eclosures);
616 re_free (dfa->inveclosures);
617 re_free (dfa->nodes);
618
619 if (dfa->state_table)
620 for (i = 0; i <= dfa->state_hash_mask; ++i)
621 {
622 struct re_state_table_entry *entry = dfa->state_table + i;
623 for (j = 0; j < entry->num; ++j)
624 {
625 re_dfastate_t *state = entry->array[j];
626 free_state (state);
627 }
628 re_free (entry->array);
629 }
630 re_free (dfa->state_table);
631 #ifdef RE_ENABLE_I18N
632 if (dfa->sb_char != utf8_sb_map)
633 re_free (dfa->sb_char);
634 #endif
635 re_free (dfa->subexp_map);
636 #ifdef DEBUG
637 re_free (dfa->re_str);
638 #endif
639
640 re_free (dfa);
641 }
642
643
644 /* Free dynamically allocated space used by PREG. */
645
646 void
647 regfree (preg)
648 regex_t *preg;
649 {
650 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
651 if (BE (dfa != NULL, 1))
652 free_dfa_content (dfa);
653 preg->buffer = NULL;
654 preg->allocated = 0;
655
656 re_free (preg->fastmap);
657 preg->fastmap = NULL;
658
659 re_free (preg->translate);
660 preg->translate = NULL;
661 }
662 #ifdef _LIBC
663 weak_alias (__regfree, regfree)
664 #endif
665 \f
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
668
669 #if defined _REGEX_RE_COMP || defined _LIBC
670
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
673
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679 weak_function
680 # endif
681 re_comp (s)
682 const char *s;
683 {
684 reg_errcode_t ret;
685 char *fastmap;
686
687 if (!s)
688 {
689 if (!re_comp_buf.buffer)
690 return gettext ("No previous regular expression");
691 return 0;
692 }
693
694 if (re_comp_buf.buffer)
695 {
696 fastmap = re_comp_buf.fastmap;
697 re_comp_buf.fastmap = NULL;
698 __regfree (&re_comp_buf);
699 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
700 re_comp_buf.fastmap = fastmap;
701 }
702
703 if (re_comp_buf.fastmap == NULL)
704 {
705 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
706 if (re_comp_buf.fastmap == NULL)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx[(int) REG_ESPACE]);
709 }
710
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
713
714 /* Match anchors at newlines. */
715 re_comp_buf.newline_anchor = 1;
716
717 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
718
719 if (!ret)
720 return NULL;
721
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
724 }
725
726 #ifdef _LIBC
727 libc_freeres_fn (free_mem)
728 {
729 __regfree (&re_comp_buf);
730 }
731 #endif
732
733 #endif /* _REGEX_RE_COMP */
734 \f
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
738
739 static reg_errcode_t
740 re_compile_internal (preg, pattern, length, syntax)
741 regex_t *preg;
742 const char * pattern;
743 size_t length;
744 reg_syntax_t syntax;
745 {
746 reg_errcode_t err = REG_NOERROR;
747 re_dfa_t *dfa;
748 re_string_t regexp;
749
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
754 preg->used = 0;
755 preg->re_nsub = 0;
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
758
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
762 {
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 if (dfa == NULL)
769 return REG_ESPACE;
770 preg->allocated = sizeof (re_dfa_t);
771 preg->buffer = (unsigned char *) dfa;
772 }
773 preg->used = sizeof (re_dfa_t);
774
775 err = init_dfa (dfa, length);
776 if (BE (err != REG_NOERROR, 0))
777 {
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
782 }
783 #ifdef DEBUG
784 /* Note: length+1 will not overflow since it is checked in init_dfa. */
785 dfa->re_str = re_malloc (char, length + 1);
786 strncpy (dfa->re_str, pattern, length + 1);
787 #endif
788
789 __libc_lock_init (dfa->lock);
790
791 err = re_string_construct (&regexp, pattern, length, preg->translate,
792 syntax & RE_ICASE, dfa);
793 if (BE (err != REG_NOERROR, 0))
794 {
795 re_compile_internal_free_return:
796 free_workarea_compile (preg);
797 re_string_destruct (&regexp);
798 free_dfa_content (dfa);
799 preg->buffer = NULL;
800 preg->allocated = 0;
801 return err;
802 }
803
804 /* Parse the regular expression, and build a structure tree. */
805 preg->re_nsub = 0;
806 dfa->str_tree = parse (&regexp, preg, syntax, &err);
807 if (BE (dfa->str_tree == NULL, 0))
808 goto re_compile_internal_free_return;
809
810 /* Analyze the tree and create the nfa. */
811 err = analyze (preg);
812 if (BE (err != REG_NOERROR, 0))
813 goto re_compile_internal_free_return;
814
815 #ifdef RE_ENABLE_I18N
816 /* If possible, do searching in single byte encoding to speed things up. */
817 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
818 optimize_utf8 (dfa);
819 #endif
820
821 /* Then create the initial state of the dfa. */
822 err = create_initial_state (dfa);
823
824 /* Release work areas. */
825 free_workarea_compile (preg);
826 re_string_destruct (&regexp);
827
828 if (BE (err != REG_NOERROR, 0))
829 {
830 free_dfa_content (dfa);
831 preg->buffer = NULL;
832 preg->allocated = 0;
833 }
834
835 return err;
836 }
837
838 /* Initialize DFA. We use the length of the regular expression PAT_LEN
839 as the initial length of some arrays. */
840
841 static reg_errcode_t
842 init_dfa (dfa, pat_len)
843 re_dfa_t *dfa;
844 size_t pat_len;
845 {
846 unsigned int table_size;
847 #ifndef _LIBC
848 char *codeset_name;
849 #endif
850
851 memset (dfa, '\0', sizeof (re_dfa_t));
852
853 /* Force allocation of str_tree_storage the first time. */
854 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
855
856 /* Avoid overflows. */
857 if (pat_len == SIZE_MAX)
858 return REG_ESPACE;
859
860 dfa->nodes_alloc = pat_len + 1;
861 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
862
863 /* table_size = 2 ^ ceil(log pat_len) */
864 for (table_size = 1; ; table_size <<= 1)
865 if (table_size > pat_len)
866 break;
867
868 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
869 dfa->state_hash_mask = table_size - 1;
870
871 dfa->mb_cur_max = MB_CUR_MAX;
872 #ifdef _LIBC
873 if (dfa->mb_cur_max == 6
874 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
875 dfa->is_utf8 = 1;
876 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
877 != 0);
878 #else
879 # ifdef HAVE_LANGINFO_CODESET
880 codeset_name = nl_langinfo (CODESET);
881 # else
882 codeset_name = getenv ("LC_ALL");
883 if (codeset_name == NULL || codeset_name[0] == '\0')
884 codeset_name = getenv ("LC_CTYPE");
885 if (codeset_name == NULL || codeset_name[0] == '\0')
886 codeset_name = getenv ("LANG");
887 if (codeset_name == NULL)
888 codeset_name = "";
889 else if (strchr (codeset_name, '.') != NULL)
890 codeset_name = strchr (codeset_name, '.') + 1;
891 # endif
892
893 if (strcasecmp (codeset_name, "UTF-8") == 0
894 || strcasecmp (codeset_name, "UTF8") == 0)
895 dfa->is_utf8 = 1;
896
897 /* We check exhaustively in the loop below if this charset is a
898 superset of ASCII. */
899 dfa->map_notascii = 0;
900 #endif
901
902 #ifdef RE_ENABLE_I18N
903 if (dfa->mb_cur_max > 1)
904 {
905 if (dfa->is_utf8)
906 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
907 else
908 {
909 int i, j, ch;
910
911 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
912 if (BE (dfa->sb_char == NULL, 0))
913 return REG_ESPACE;
914
915 /* Clear all bits by, then set those corresponding to single
916 byte chars. */
917 bitset_empty (dfa->sb_char);
918
919 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
920 for (j = 0; j < UINT_BITS; ++j, ++ch)
921 {
922 wint_t wch = __btowc (ch);
923 if (wch != WEOF)
924 dfa->sb_char[i] |= 1u << j;
925 # ifndef _LIBC
926 if (isascii (ch) && wch != ch)
927 dfa->map_notascii = 1;
928 # endif
929 }
930 }
931 }
932 #endif
933
934 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
935 return REG_ESPACE;
936 return REG_NOERROR;
937 }
938
939 /* Initialize WORD_CHAR table, which indicate which character is
940 "word". In this case "word" means that it is the word construction
941 character used by some operators like "\<", "\>", etc. */
942
943 static void
944 init_word_char (dfa)
945 re_dfa_t *dfa;
946 {
947 int i, j, ch;
948 dfa->word_ops_used = 1;
949 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
950 for (j = 0; j < UINT_BITS; ++j, ++ch)
951 if (isalnum (ch) || ch == '_')
952 dfa->word_char[i] |= 1u << j;
953 }
954
955 /* Free the work area which are only used while compiling. */
956
957 static void
958 free_workarea_compile (preg)
959 regex_t *preg;
960 {
961 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
962 bin_tree_storage_t *storage, *next;
963 for (storage = dfa->str_tree_storage; storage; storage = next)
964 {
965 next = storage->next;
966 re_free (storage);
967 }
968 dfa->str_tree_storage = NULL;
969 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
970 dfa->str_tree = NULL;
971 re_free (dfa->org_indices);
972 dfa->org_indices = NULL;
973 }
974
975 /* Create initial states for all contexts. */
976
977 static reg_errcode_t
978 create_initial_state (dfa)
979 re_dfa_t *dfa;
980 {
981 int first, i;
982 reg_errcode_t err;
983 re_node_set init_nodes;
984
985 /* Initial states have the epsilon closure of the node which is
986 the first node of the regular expression. */
987 first = dfa->str_tree->first->node_idx;
988 dfa->init_node = first;
989 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
990 if (BE (err != REG_NOERROR, 0))
991 return err;
992
993 /* The back-references which are in initial states can epsilon transit,
994 since in this case all of the subexpressions can be null.
995 Then we add epsilon closures of the nodes which are the next nodes of
996 the back-references. */
997 if (dfa->nbackref > 0)
998 for (i = 0; i < init_nodes.nelem; ++i)
999 {
1000 int node_idx = init_nodes.elems[i];
1001 re_token_type_t type = dfa->nodes[node_idx].type;
1002
1003 int clexp_idx;
1004 if (type != OP_BACK_REF)
1005 continue;
1006 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1007 {
1008 re_token_t *clexp_node;
1009 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1010 if (clexp_node->type == OP_CLOSE_SUBEXP
1011 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1012 break;
1013 }
1014 if (clexp_idx == init_nodes.nelem)
1015 continue;
1016
1017 if (type == OP_BACK_REF)
1018 {
1019 int dest_idx = dfa->edests[node_idx].elems[0];
1020 if (!re_node_set_contains (&init_nodes, dest_idx))
1021 {
1022 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1023 i = 0;
1024 }
1025 }
1026 }
1027
1028 /* It must be the first time to invoke acquire_state. */
1029 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1030 /* We don't check ERR here, since the initial state must not be NULL. */
1031 if (BE (dfa->init_state == NULL, 0))
1032 return err;
1033 if (dfa->init_state->has_constraint)
1034 {
1035 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1036 CONTEXT_WORD);
1037 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1038 CONTEXT_NEWLINE);
1039 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1040 &init_nodes,
1041 CONTEXT_NEWLINE
1042 | CONTEXT_BEGBUF);
1043 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1044 || dfa->init_state_begbuf == NULL, 0))
1045 return err;
1046 }
1047 else
1048 dfa->init_state_word = dfa->init_state_nl
1049 = dfa->init_state_begbuf = dfa->init_state;
1050
1051 re_node_set_free (&init_nodes);
1052 return REG_NOERROR;
1053 }
1054 \f
1055 #ifdef RE_ENABLE_I18N
1056 /* If it is possible to do searching in single byte encoding instead of UTF-8
1057 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058 DFA nodes where needed. */
1059
1060 static void
1061 optimize_utf8 (dfa)
1062 re_dfa_t *dfa;
1063 {
1064 int node, i, mb_chars = 0, has_period = 0;
1065
1066 for (node = 0; node < dfa->nodes_len; ++node)
1067 switch (dfa->nodes[node].type)
1068 {
1069 case CHARACTER:
1070 if (dfa->nodes[node].opr.c >= 0x80)
1071 mb_chars = 1;
1072 break;
1073 case ANCHOR:
1074 switch (dfa->nodes[node].opr.idx)
1075 {
1076 case LINE_FIRST:
1077 case LINE_LAST:
1078 case BUF_FIRST:
1079 case BUF_LAST:
1080 break;
1081 default:
1082 /* Word anchors etc. cannot be handled. */
1083 return;
1084 }
1085 break;
1086 case OP_PERIOD:
1087 has_period = 1;
1088 break;
1089 case OP_BACK_REF:
1090 case OP_ALT:
1091 case END_OF_RE:
1092 case OP_DUP_ASTERISK:
1093 case OP_OPEN_SUBEXP:
1094 case OP_CLOSE_SUBEXP:
1095 break;
1096 case COMPLEX_BRACKET:
1097 return;
1098 case SIMPLE_BRACKET:
1099 /* Just double check. */
1100 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1101 if (dfa->nodes[node].opr.sbcset[i])
1102 return;
1103 break;
1104 default:
1105 abort ();
1106 }
1107
1108 if (mb_chars || has_period)
1109 for (node = 0; node < dfa->nodes_len; ++node)
1110 {
1111 if (dfa->nodes[node].type == CHARACTER
1112 && dfa->nodes[node].opr.c >= 0x80)
1113 dfa->nodes[node].mb_partial = 0;
1114 else if (dfa->nodes[node].type == OP_PERIOD)
1115 dfa->nodes[node].type = OP_UTF8_PERIOD;
1116 }
1117
1118 /* The search can be in single byte locale. */
1119 dfa->mb_cur_max = 1;
1120 dfa->is_utf8 = 0;
1121 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1122 }
1123 #endif
1124 \f
1125 /* Analyze the structure tree, and calculate "first", "next", "edest",
1126 "eclosure", and "inveclosure". */
1127
1128 static reg_errcode_t
1129 analyze (preg)
1130 regex_t *preg;
1131 {
1132 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1133 reg_errcode_t ret;
1134
1135 /* Allocate arrays. */
1136 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1137 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1138 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1139 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1140 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1141 || dfa->eclosures == NULL, 0))
1142 return REG_ESPACE;
1143
1144 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1145 if (dfa->subexp_map != NULL)
1146 {
1147 int i;
1148 for (i = 0; i < preg->re_nsub; i++)
1149 dfa->subexp_map[i] = i;
1150 preorder (dfa->str_tree, optimize_subexps, dfa);
1151 for (i = 0; i < preg->re_nsub; i++)
1152 if (dfa->subexp_map[i] != i)
1153 break;
1154 if (i == preg->re_nsub)
1155 {
1156 free (dfa->subexp_map);
1157 dfa->subexp_map = NULL;
1158 }
1159 }
1160
1161 ret = postorder (dfa->str_tree, lower_subexps, preg);
1162 if (BE (ret != REG_NOERROR, 0))
1163 return ret;
1164 ret = postorder (dfa->str_tree, calc_first, dfa);
1165 if (BE (ret != REG_NOERROR, 0))
1166 return ret;
1167 preorder (dfa->str_tree, calc_next, dfa);
1168 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1169 if (BE (ret != REG_NOERROR, 0))
1170 return ret;
1171 ret = calc_eclosure (dfa);
1172 if (BE (ret != REG_NOERROR, 0))
1173 return ret;
1174
1175 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1176 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1177 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1178 || dfa->nbackref)
1179 {
1180 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1181 if (BE (dfa->inveclosures == NULL, 0))
1182 return REG_ESPACE;
1183 ret = calc_inveclosure (dfa);
1184 }
1185
1186 return ret;
1187 }
1188
1189 /* Our parse trees are very unbalanced, so we cannot use a stack to
1190 implement parse tree visits. Instead, we use parent pointers and
1191 some hairy code in these two functions. */
1192 static reg_errcode_t
1193 postorder (root, fn, extra)
1194 bin_tree_t *root;
1195 reg_errcode_t (fn (void *, bin_tree_t *));
1196 void *extra;
1197 {
1198 bin_tree_t *node, *prev;
1199
1200 for (node = root; ; )
1201 {
1202 /* Descend down the tree, preferably to the left (or to the right
1203 if that's the only child). */
1204 while (node->left || node->right)
1205 if (node->left)
1206 node = node->left;
1207 else
1208 node = node->right;
1209
1210 do
1211 {
1212 reg_errcode_t err = fn (extra, node);
1213 if (BE (err != REG_NOERROR, 0))
1214 return err;
1215 if (node->parent == NULL)
1216 return REG_NOERROR;
1217 prev = node;
1218 node = node->parent;
1219 }
1220 /* Go up while we have a node that is reached from the right. */
1221 while (node->right == prev || node->right == NULL);
1222 node = node->right;
1223 }
1224 }
1225
1226 static reg_errcode_t
1227 preorder (root, fn, extra)
1228 bin_tree_t *root;
1229 reg_errcode_t (fn (void *, bin_tree_t *));
1230 void *extra;
1231 {
1232 bin_tree_t *node;
1233
1234 for (node = root; ; )
1235 {
1236 reg_errcode_t err = fn (extra, node);
1237 if (BE (err != REG_NOERROR, 0))
1238 return err;
1239
1240 /* Go to the left node, or up and to the right. */
1241 if (node->left)
1242 node = node->left;
1243 else
1244 {
1245 bin_tree_t *prev = NULL;
1246 while (node->right == prev || node->right == NULL)
1247 {
1248 prev = node;
1249 node = node->parent;
1250 if (!node)
1251 return REG_NOERROR;
1252 }
1253 node = node->right;
1254 }
1255 }
1256 }
1257
1258 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1259 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1260 backreferences as well. Requires a preorder visit. */
1261 static reg_errcode_t
1262 optimize_subexps (extra, node)
1263 void *extra;
1264 bin_tree_t *node;
1265 {
1266 re_dfa_t *dfa = (re_dfa_t *) extra;
1267
1268 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1269 {
1270 int idx = node->token.opr.idx;
1271 node->token.opr.idx = dfa->subexp_map[idx];
1272 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1273 }
1274
1275 else if (node->token.type == SUBEXP
1276 && node->left && node->left->token.type == SUBEXP)
1277 {
1278 int other_idx = node->left->token.opr.idx;
1279
1280 node->left = node->left->left;
1281 if (node->left)
1282 node->left->parent = node;
1283
1284 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1285 if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1286 dfa->used_bkref_map &= ~(1u << other_idx);
1287 }
1288
1289 return REG_NOERROR;
1290 }
1291
1292 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1293 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1294 static reg_errcode_t
1295 lower_subexps (extra, node)
1296 void *extra;
1297 bin_tree_t *node;
1298 {
1299 regex_t *preg = (regex_t *) extra;
1300 reg_errcode_t err = REG_NOERROR;
1301
1302 if (node->left && node->left->token.type == SUBEXP)
1303 {
1304 node->left = lower_subexp (&err, preg, node->left);
1305 if (node->left)
1306 node->left->parent = node;
1307 }
1308 if (node->right && node->right->token.type == SUBEXP)
1309 {
1310 node->right = lower_subexp (&err, preg, node->right);
1311 if (node->right)
1312 node->right->parent = node;
1313 }
1314
1315 return err;
1316 }
1317
1318 static bin_tree_t *
1319 lower_subexp (err, preg, node)
1320 reg_errcode_t *err;
1321 regex_t *preg;
1322 bin_tree_t *node;
1323 {
1324 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1325 bin_tree_t *body = node->left;
1326 bin_tree_t *op, *cls, *tree1, *tree;
1327
1328 if (preg->no_sub
1329 /* We do not optimize empty subexpressions, because otherwise we may
1330 have bad CONCAT nodes with NULL children. This is obviously not
1331 very common, so we do not lose much. An example that triggers
1332 this case is the sed "script" /\(\)/x. */
1333 && node->left != NULL
1334 && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1335 || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1336 return node->left;
1337
1338 /* Convert the SUBEXP node to the concatenation of an
1339 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1340 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1341 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1342 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1343 tree = create_tree (dfa, op, tree1, CONCAT);
1344 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1345 {
1346 *err = REG_ESPACE;
1347 return NULL;
1348 }
1349
1350 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1351 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1352 return tree;
1353 }
1354
1355 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1356 nodes. Requires a postorder visit. */
1357 static reg_errcode_t
1358 calc_first (extra, node)
1359 void *extra;
1360 bin_tree_t *node;
1361 {
1362 re_dfa_t *dfa = (re_dfa_t *) extra;
1363 if (node->token.type == CONCAT)
1364 {
1365 node->first = node->left->first;
1366 node->node_idx = node->left->node_idx;
1367 }
1368 else
1369 {
1370 node->first = node;
1371 node->node_idx = re_dfa_add_node (dfa, node->token);
1372 if (BE (node->node_idx == -1, 0))
1373 return REG_ESPACE;
1374 }
1375 return REG_NOERROR;
1376 }
1377
1378 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1379 static reg_errcode_t
1380 calc_next (extra, node)
1381 void *extra;
1382 bin_tree_t *node;
1383 {
1384 switch (node->token.type)
1385 {
1386 case OP_DUP_ASTERISK:
1387 node->left->next = node;
1388 break;
1389 case CONCAT:
1390 node->left->next = node->right->first;
1391 node->right->next = node->next;
1392 break;
1393 default:
1394 if (node->left)
1395 node->left->next = node->next;
1396 if (node->right)
1397 node->right->next = node->next;
1398 break;
1399 }
1400 return REG_NOERROR;
1401 }
1402
1403 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1404 static reg_errcode_t
1405 link_nfa_nodes (extra, node)
1406 void *extra;
1407 bin_tree_t *node;
1408 {
1409 re_dfa_t *dfa = (re_dfa_t *) extra;
1410 int idx = node->node_idx;
1411 reg_errcode_t err = REG_NOERROR;
1412
1413 switch (node->token.type)
1414 {
1415 case CONCAT:
1416 break;
1417
1418 case END_OF_RE:
1419 assert (node->next == NULL);
1420 break;
1421
1422 case OP_DUP_ASTERISK:
1423 case OP_ALT:
1424 {
1425 int left, right;
1426 dfa->has_plural_match = 1;
1427 if (node->left != NULL)
1428 left = node->left->first->node_idx;
1429 else
1430 left = node->next->node_idx;
1431 if (node->right != NULL)
1432 right = node->right->first->node_idx;
1433 else
1434 right = node->next->node_idx;
1435 assert (left > -1);
1436 assert (right > -1);
1437 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1438 }
1439 break;
1440
1441 case ANCHOR:
1442 case OP_OPEN_SUBEXP:
1443 case OP_CLOSE_SUBEXP:
1444 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1445 break;
1446
1447 case OP_BACK_REF:
1448 dfa->nexts[idx] = node->next->node_idx;
1449 if (node->token.type == OP_BACK_REF)
1450 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1451 break;
1452
1453 default:
1454 assert (!IS_EPSILON_NODE (node->token.type));
1455 dfa->nexts[idx] = node->next->node_idx;
1456 break;
1457 }
1458
1459 return err;
1460 }
1461
1462 /* Duplicate the epsilon closure of the node ROOT_NODE.
1463 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1464 to their own constraint. */
1465
1466 static reg_errcode_t
1467 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1468 init_constraint)
1469 re_dfa_t *dfa;
1470 int top_org_node, top_clone_node, root_node;
1471 unsigned int init_constraint;
1472 {
1473 int org_node, clone_node, ret;
1474 unsigned int constraint = init_constraint;
1475 for (org_node = top_org_node, clone_node = top_clone_node;;)
1476 {
1477 int org_dest, clone_dest;
1478 if (dfa->nodes[org_node].type == OP_BACK_REF)
1479 {
1480 /* If the back reference epsilon-transit, its destination must
1481 also have the constraint. Then duplicate the epsilon closure
1482 of the destination of the back reference, and store it in
1483 edests of the back reference. */
1484 org_dest = dfa->nexts[org_node];
1485 re_node_set_empty (dfa->edests + clone_node);
1486 clone_dest = duplicate_node (dfa, org_dest, constraint);
1487 if (BE (clone_dest == -1, 0))
1488 return REG_ESPACE;
1489 dfa->nexts[clone_node] = dfa->nexts[org_node];
1490 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491 if (BE (ret < 0, 0))
1492 return REG_ESPACE;
1493 }
1494 else if (dfa->edests[org_node].nelem == 0)
1495 {
1496 /* In case of the node can't epsilon-transit, don't duplicate the
1497 destination and store the original destination as the
1498 destination of the node. */
1499 dfa->nexts[clone_node] = dfa->nexts[org_node];
1500 break;
1501 }
1502 else if (dfa->edests[org_node].nelem == 1)
1503 {
1504 /* In case of the node can epsilon-transit, and it has only one
1505 destination. */
1506 org_dest = dfa->edests[org_node].elems[0];
1507 re_node_set_empty (dfa->edests + clone_node);
1508 if (dfa->nodes[org_node].type == ANCHOR)
1509 {
1510 /* In case of the node has another constraint, append it. */
1511 if (org_node == root_node && clone_node != org_node)
1512 {
1513 /* ...but if the node is root_node itself, it means the
1514 epsilon closure have a loop, then tie it to the
1515 destination of the root_node. */
1516 ret = re_node_set_insert (dfa->edests + clone_node,
1517 org_dest);
1518 if (BE (ret < 0, 0))
1519 return REG_ESPACE;
1520 break;
1521 }
1522 constraint |= dfa->nodes[org_node].opr.ctx_type;
1523 }
1524 clone_dest = duplicate_node (dfa, org_dest, constraint);
1525 if (BE (clone_dest == -1, 0))
1526 return REG_ESPACE;
1527 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 if (BE (ret < 0, 0))
1529 return REG_ESPACE;
1530 }
1531 else /* dfa->edests[org_node].nelem == 2 */
1532 {
1533 /* In case of the node can epsilon-transit, and it has two
1534 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1535 org_dest = dfa->edests[org_node].elems[0];
1536 re_node_set_empty (dfa->edests + clone_node);
1537 /* Search for a duplicated node which satisfies the constraint. */
1538 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539 if (clone_dest == -1)
1540 {
1541 /* There are no such a duplicated node, create a new one. */
1542 reg_errcode_t err;
1543 clone_dest = duplicate_node (dfa, org_dest, constraint);
1544 if (BE (clone_dest == -1, 0))
1545 return REG_ESPACE;
1546 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547 if (BE (ret < 0, 0))
1548 return REG_ESPACE;
1549 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1550 root_node, constraint);
1551 if (BE (err != REG_NOERROR, 0))
1552 return err;
1553 }
1554 else
1555 {
1556 /* There are a duplicated node which satisfy the constraint,
1557 use it to avoid infinite loop. */
1558 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 if (BE (ret < 0, 0))
1560 return REG_ESPACE;
1561 }
1562
1563 org_dest = dfa->edests[org_node].elems[1];
1564 clone_dest = duplicate_node (dfa, org_dest, constraint);
1565 if (BE (clone_dest == -1, 0))
1566 return REG_ESPACE;
1567 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 if (BE (ret < 0, 0))
1569 return REG_ESPACE;
1570 }
1571 org_node = org_dest;
1572 clone_node = clone_dest;
1573 }
1574 return REG_NOERROR;
1575 }
1576
1577 /* Search for a node which is duplicated from the node ORG_NODE, and
1578 satisfies the constraint CONSTRAINT. */
1579
1580 static int
1581 search_duplicated_node (dfa, org_node, constraint)
1582 const re_dfa_t *dfa;
1583 int org_node;
1584 unsigned int constraint;
1585 {
1586 int idx;
1587 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1588 {
1589 if (org_node == dfa->org_indices[idx]
1590 && constraint == dfa->nodes[idx].constraint)
1591 return idx; /* Found. */
1592 }
1593 return -1; /* Not found. */
1594 }
1595
1596 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1597 Return the index of the new node, or -1 if insufficient storage is
1598 available. */
1599
1600 static int
1601 duplicate_node (dfa, org_idx, constraint)
1602 re_dfa_t *dfa;
1603 int org_idx;
1604 unsigned int constraint;
1605 {
1606 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1607 if (BE (dup_idx != -1, 1))
1608 {
1609 dfa->nodes[dup_idx].constraint = constraint;
1610 if (dfa->nodes[org_idx].type == ANCHOR)
1611 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1612 dfa->nodes[dup_idx].duplicated = 1;
1613
1614 /* Store the index of the original node. */
1615 dfa->org_indices[dup_idx] = org_idx;
1616 }
1617 return dup_idx;
1618 }
1619
1620 static reg_errcode_t
1621 calc_inveclosure (dfa)
1622 re_dfa_t *dfa;
1623 {
1624 int src, idx, ret;
1625 for (idx = 0; idx < dfa->nodes_len; ++idx)
1626 re_node_set_init_empty (dfa->inveclosures + idx);
1627
1628 for (src = 0; src < dfa->nodes_len; ++src)
1629 {
1630 int *elems = dfa->eclosures[src].elems;
1631 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1632 {
1633 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1634 if (BE (ret == -1, 0))
1635 return REG_ESPACE;
1636 }
1637 }
1638
1639 return REG_NOERROR;
1640 }
1641
1642 /* Calculate "eclosure" for all the node in DFA. */
1643
1644 static reg_errcode_t
1645 calc_eclosure (dfa)
1646 re_dfa_t *dfa;
1647 {
1648 int node_idx, incomplete;
1649 #ifdef DEBUG
1650 assert (dfa->nodes_len > 0);
1651 #endif
1652 incomplete = 0;
1653 /* For each nodes, calculate epsilon closure. */
1654 for (node_idx = 0; ; ++node_idx)
1655 {
1656 reg_errcode_t err;
1657 re_node_set eclosure_elem;
1658 if (node_idx == dfa->nodes_len)
1659 {
1660 if (!incomplete)
1661 break;
1662 incomplete = 0;
1663 node_idx = 0;
1664 }
1665
1666 #ifdef DEBUG
1667 assert (dfa->eclosures[node_idx].nelem != -1);
1668 #endif
1669
1670 /* If we have already calculated, skip it. */
1671 if (dfa->eclosures[node_idx].nelem != 0)
1672 continue;
1673 /* Calculate epsilon closure of `node_idx'. */
1674 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1675 if (BE (err != REG_NOERROR, 0))
1676 return err;
1677
1678 if (dfa->eclosures[node_idx].nelem == 0)
1679 {
1680 incomplete = 1;
1681 re_node_set_free (&eclosure_elem);
1682 }
1683 }
1684 return REG_NOERROR;
1685 }
1686
1687 /* Calculate epsilon closure of NODE. */
1688
1689 static reg_errcode_t
1690 calc_eclosure_iter (new_set, dfa, node, root)
1691 re_node_set *new_set;
1692 re_dfa_t *dfa;
1693 int node, root;
1694 {
1695 reg_errcode_t err;
1696 unsigned int constraint;
1697 int i, incomplete;
1698 re_node_set eclosure;
1699 incomplete = 0;
1700 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1701 if (BE (err != REG_NOERROR, 0))
1702 return err;
1703
1704 /* This indicates that we are calculating this node now.
1705 We reference this value to avoid infinite loop. */
1706 dfa->eclosures[node].nelem = -1;
1707
1708 constraint = ((dfa->nodes[node].type == ANCHOR)
1709 ? dfa->nodes[node].opr.ctx_type : 0);
1710 /* If the current node has constraints, duplicate all nodes.
1711 Since they must inherit the constraints. */
1712 if (constraint
1713 && dfa->edests[node].nelem
1714 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1715 {
1716 int org_node, cur_node;
1717 org_node = cur_node = node;
1718 err = duplicate_node_closure (dfa, node, node, node, constraint);
1719 if (BE (err != REG_NOERROR, 0))
1720 return err;
1721 }
1722
1723 /* Expand each epsilon destination nodes. */
1724 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1725 for (i = 0; i < dfa->edests[node].nelem; ++i)
1726 {
1727 re_node_set eclosure_elem;
1728 int edest = dfa->edests[node].elems[i];
1729 /* If calculating the epsilon closure of `edest' is in progress,
1730 return intermediate result. */
1731 if (dfa->eclosures[edest].nelem == -1)
1732 {
1733 incomplete = 1;
1734 continue;
1735 }
1736 /* If we haven't calculated the epsilon closure of `edest' yet,
1737 calculate now. Otherwise use calculated epsilon closure. */
1738 if (dfa->eclosures[edest].nelem == 0)
1739 {
1740 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1741 if (BE (err != REG_NOERROR, 0))
1742 return err;
1743 }
1744 else
1745 eclosure_elem = dfa->eclosures[edest];
1746 /* Merge the epsilon closure of `edest'. */
1747 re_node_set_merge (&eclosure, &eclosure_elem);
1748 /* If the epsilon closure of `edest' is incomplete,
1749 the epsilon closure of this node is also incomplete. */
1750 if (dfa->eclosures[edest].nelem == 0)
1751 {
1752 incomplete = 1;
1753 re_node_set_free (&eclosure_elem);
1754 }
1755 }
1756
1757 /* Epsilon closures include itself. */
1758 re_node_set_insert (&eclosure, node);
1759 if (incomplete && !root)
1760 dfa->eclosures[node].nelem = 0;
1761 else
1762 dfa->eclosures[node] = eclosure;
1763 *new_set = eclosure;
1764 return REG_NOERROR;
1765 }
1766 \f
1767 /* Functions for token which are used in the parser. */
1768
1769 /* Fetch a token from INPUT.
1770 We must not use this function inside bracket expressions. */
1771
1772 static void
1773 fetch_token (result, input, syntax)
1774 re_token_t *result;
1775 re_string_t *input;
1776 reg_syntax_t syntax;
1777 {
1778 re_string_skip_bytes (input, peek_token (result, input, syntax));
1779 }
1780
1781 /* Peek a token from INPUT, and return the length of the token.
1782 We must not use this function inside bracket expressions. */
1783
1784 static int
1785 peek_token (token, input, syntax)
1786 re_token_t *token;
1787 re_string_t *input;
1788 reg_syntax_t syntax;
1789 {
1790 unsigned char c;
1791
1792 if (re_string_eoi (input))
1793 {
1794 token->type = END_OF_RE;
1795 return 0;
1796 }
1797
1798 c = re_string_peek_byte (input, 0);
1799 token->opr.c = c;
1800
1801 token->word_char = 0;
1802 #ifdef RE_ENABLE_I18N
1803 token->mb_partial = 0;
1804 if (input->mb_cur_max > 1 &&
1805 !re_string_first_byte (input, re_string_cur_idx (input)))
1806 {
1807 token->type = CHARACTER;
1808 token->mb_partial = 1;
1809 return 1;
1810 }
1811 #endif
1812 if (c == '\\')
1813 {
1814 unsigned char c2;
1815 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1816 {
1817 token->type = BACK_SLASH;
1818 return 1;
1819 }
1820
1821 c2 = re_string_peek_byte_case (input, 1);
1822 token->opr.c = c2;
1823 token->type = CHARACTER;
1824 #ifdef RE_ENABLE_I18N
1825 if (input->mb_cur_max > 1)
1826 {
1827 wint_t wc = re_string_wchar_at (input,
1828 re_string_cur_idx (input) + 1);
1829 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1830 }
1831 else
1832 #endif
1833 token->word_char = IS_WORD_CHAR (c2) != 0;
1834
1835 switch (c2)
1836 {
1837 case '|':
1838 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1839 token->type = OP_ALT;
1840 break;
1841 case '1': case '2': case '3': case '4': case '5':
1842 case '6': case '7': case '8': case '9':
1843 if (!(syntax & RE_NO_BK_REFS))
1844 {
1845 token->type = OP_BACK_REF;
1846 token->opr.idx = c2 - '1';
1847 }
1848 break;
1849 case '<':
1850 if (!(syntax & RE_NO_GNU_OPS))
1851 {
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_FIRST;
1854 }
1855 break;
1856 case '>':
1857 if (!(syntax & RE_NO_GNU_OPS))
1858 {
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = WORD_LAST;
1861 }
1862 break;
1863 case 'b':
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 {
1866 token->type = ANCHOR;
1867 token->opr.ctx_type = WORD_DELIM;
1868 }
1869 break;
1870 case 'B':
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 {
1873 token->type = ANCHOR;
1874 token->opr.ctx_type = NOT_WORD_DELIM;
1875 }
1876 break;
1877 case 'w':
1878 if (!(syntax & RE_NO_GNU_OPS))
1879 token->type = OP_WORD;
1880 break;
1881 case 'W':
1882 if (!(syntax & RE_NO_GNU_OPS))
1883 token->type = OP_NOTWORD;
1884 break;
1885 case 's':
1886 if (!(syntax & RE_NO_GNU_OPS))
1887 token->type = OP_SPACE;
1888 break;
1889 case 'S':
1890 if (!(syntax & RE_NO_GNU_OPS))
1891 token->type = OP_NOTSPACE;
1892 break;
1893 case '`':
1894 if (!(syntax & RE_NO_GNU_OPS))
1895 {
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = BUF_FIRST;
1898 }
1899 break;
1900 case '\'':
1901 if (!(syntax & RE_NO_GNU_OPS))
1902 {
1903 token->type = ANCHOR;
1904 token->opr.ctx_type = BUF_LAST;
1905 }
1906 break;
1907 case '(':
1908 if (!(syntax & RE_NO_BK_PARENS))
1909 token->type = OP_OPEN_SUBEXP;
1910 break;
1911 case ')':
1912 if (!(syntax & RE_NO_BK_PARENS))
1913 token->type = OP_CLOSE_SUBEXP;
1914 break;
1915 case '+':
1916 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1917 token->type = OP_DUP_PLUS;
1918 break;
1919 case '?':
1920 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1921 token->type = OP_DUP_QUESTION;
1922 break;
1923 case '{':
1924 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1925 token->type = OP_OPEN_DUP_NUM;
1926 break;
1927 case '}':
1928 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1929 token->type = OP_CLOSE_DUP_NUM;
1930 break;
1931 default:
1932 break;
1933 }
1934 return 2;
1935 }
1936
1937 token->type = CHARACTER;
1938 #ifdef RE_ENABLE_I18N
1939 if (input->mb_cur_max > 1)
1940 {
1941 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1942 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1943 }
1944 else
1945 #endif
1946 token->word_char = IS_WORD_CHAR (token->opr.c);
1947
1948 switch (c)
1949 {
1950 case '\n':
1951 if (syntax & RE_NEWLINE_ALT)
1952 token->type = OP_ALT;
1953 break;
1954 case '|':
1955 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1956 token->type = OP_ALT;
1957 break;
1958 case '*':
1959 token->type = OP_DUP_ASTERISK;
1960 break;
1961 case '+':
1962 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1963 token->type = OP_DUP_PLUS;
1964 break;
1965 case '?':
1966 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1967 token->type = OP_DUP_QUESTION;
1968 break;
1969 case '{':
1970 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1971 token->type = OP_OPEN_DUP_NUM;
1972 break;
1973 case '}':
1974 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1975 token->type = OP_CLOSE_DUP_NUM;
1976 break;
1977 case '(':
1978 if (syntax & RE_NO_BK_PARENS)
1979 token->type = OP_OPEN_SUBEXP;
1980 break;
1981 case ')':
1982 if (syntax & RE_NO_BK_PARENS)
1983 token->type = OP_CLOSE_SUBEXP;
1984 break;
1985 case '[':
1986 token->type = OP_OPEN_BRACKET;
1987 break;
1988 case '.':
1989 token->type = OP_PERIOD;
1990 break;
1991 case '^':
1992 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1993 re_string_cur_idx (input) != 0)
1994 {
1995 char prev = re_string_peek_byte (input, -1);
1996 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1997 break;
1998 }
1999 token->type = ANCHOR;
2000 token->opr.ctx_type = LINE_FIRST;
2001 break;
2002 case '$':
2003 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2004 re_string_cur_idx (input) + 1 != re_string_length (input))
2005 {
2006 re_token_t next;
2007 re_string_skip_bytes (input, 1);
2008 peek_token (&next, input, syntax);
2009 re_string_skip_bytes (input, -1);
2010 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2011 break;
2012 }
2013 token->type = ANCHOR;
2014 token->opr.ctx_type = LINE_LAST;
2015 break;
2016 default:
2017 break;
2018 }
2019 return 1;
2020 }
2021
2022 /* Peek a token from INPUT, and return the length of the token.
2023 We must not use this function out of bracket expressions. */
2024
2025 static int
2026 peek_token_bracket (token, input, syntax)
2027 re_token_t *token;
2028 re_string_t *input;
2029 reg_syntax_t syntax;
2030 {
2031 unsigned char c;
2032 if (re_string_eoi (input))
2033 {
2034 token->type = END_OF_RE;
2035 return 0;
2036 }
2037 c = re_string_peek_byte (input, 0);
2038 token->opr.c = c;
2039
2040 #ifdef RE_ENABLE_I18N
2041 if (input->mb_cur_max > 1 &&
2042 !re_string_first_byte (input, re_string_cur_idx (input)))
2043 {
2044 token->type = CHARACTER;
2045 return 1;
2046 }
2047 #endif /* RE_ENABLE_I18N */
2048
2049 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2050 && re_string_cur_idx (input) + 1 < re_string_length (input))
2051 {
2052 /* In this case, '\' escape a character. */
2053 unsigned char c2;
2054 re_string_skip_bytes (input, 1);
2055 c2 = re_string_peek_byte (input, 0);
2056 token->opr.c = c2;
2057 token->type = CHARACTER;
2058 return 1;
2059 }
2060 if (c == '[') /* '[' is a special char in a bracket exps. */
2061 {
2062 unsigned char c2;
2063 int token_len;
2064 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2065 c2 = re_string_peek_byte (input, 1);
2066 else
2067 c2 = 0;
2068 token->opr.c = c2;
2069 token_len = 2;
2070 switch (c2)
2071 {
2072 case '.':
2073 token->type = OP_OPEN_COLL_ELEM;
2074 break;
2075 case '=':
2076 token->type = OP_OPEN_EQUIV_CLASS;
2077 break;
2078 case ':':
2079 if (syntax & RE_CHAR_CLASSES)
2080 {
2081 token->type = OP_OPEN_CHAR_CLASS;
2082 break;
2083 }
2084 /* else fall through. */
2085 default:
2086 token->type = CHARACTER;
2087 token->opr.c = c;
2088 token_len = 1;
2089 break;
2090 }
2091 return token_len;
2092 }
2093 switch (c)
2094 {
2095 case '-':
2096 token->type = OP_CHARSET_RANGE;
2097 break;
2098 case ']':
2099 token->type = OP_CLOSE_BRACKET;
2100 break;
2101 case '^':
2102 token->type = OP_NON_MATCH_LIST;
2103 break;
2104 default:
2105 token->type = CHARACTER;
2106 }
2107 return 1;
2108 }
2109 \f
2110 /* Functions for parser. */
2111
2112 /* Entry point of the parser.
2113 Parse the regular expression REGEXP and return the structure tree.
2114 If an error is occured, ERR is set by error code, and return NULL.
2115 This function build the following tree, from regular expression <reg_exp>:
2116 CAT
2117 / \
2118 / \
2119 <reg_exp> EOR
2120
2121 CAT means concatenation.
2122 EOR means end of regular expression. */
2123
2124 static bin_tree_t *
2125 parse (regexp, preg, syntax, err)
2126 re_string_t *regexp;
2127 regex_t *preg;
2128 reg_syntax_t syntax;
2129 reg_errcode_t *err;
2130 {
2131 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2132 bin_tree_t *tree, *eor, *root;
2133 re_token_t current_token;
2134 dfa->syntax = syntax;
2135 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2136 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2137 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2138 return NULL;
2139 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2140 if (tree != NULL)
2141 root = create_tree (dfa, tree, eor, CONCAT);
2142 else
2143 root = eor;
2144 if (BE (eor == NULL || root == NULL, 0))
2145 {
2146 *err = REG_ESPACE;
2147 return NULL;
2148 }
2149 return root;
2150 }
2151
2152 /* This function build the following tree, from regular expression
2153 <branch1>|<branch2>:
2154 ALT
2155 / \
2156 / \
2157 <branch1> <branch2>
2158
2159 ALT means alternative, which represents the operator `|'. */
2160
2161 static bin_tree_t *
2162 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2163 re_string_t *regexp;
2164 regex_t *preg;
2165 re_token_t *token;
2166 reg_syntax_t syntax;
2167 int nest;
2168 reg_errcode_t *err;
2169 {
2170 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2171 bin_tree_t *tree, *branch = NULL;
2172 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2173 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2174 return NULL;
2175
2176 while (token->type == OP_ALT)
2177 {
2178 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2179 if (token->type != OP_ALT && token->type != END_OF_RE
2180 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2181 {
2182 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2183 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2184 return NULL;
2185 }
2186 else
2187 branch = NULL;
2188 tree = create_tree (dfa, tree, branch, OP_ALT);
2189 if (BE (tree == NULL, 0))
2190 {
2191 *err = REG_ESPACE;
2192 return NULL;
2193 }
2194 }
2195 return tree;
2196 }
2197
2198 /* This function build the following tree, from regular expression
2199 <exp1><exp2>:
2200 CAT
2201 / \
2202 / \
2203 <exp1> <exp2>
2204
2205 CAT means concatenation. */
2206
2207 static bin_tree_t *
2208 parse_branch (regexp, preg, token, syntax, nest, err)
2209 re_string_t *regexp;
2210 regex_t *preg;
2211 re_token_t *token;
2212 reg_syntax_t syntax;
2213 int nest;
2214 reg_errcode_t *err;
2215 {
2216 bin_tree_t *tree, *exp;
2217 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2218 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2219 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2220 return NULL;
2221
2222 while (token->type != OP_ALT && token->type != END_OF_RE
2223 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2224 {
2225 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2226 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2227 {
2228 return NULL;
2229 }
2230 if (tree != NULL && exp != NULL)
2231 {
2232 tree = create_tree (dfa, tree, exp, CONCAT);
2233 if (tree == NULL)
2234 {
2235 *err = REG_ESPACE;
2236 return NULL;
2237 }
2238 }
2239 else if (tree == NULL)
2240 tree = exp;
2241 /* Otherwise exp == NULL, we don't need to create new tree. */
2242 }
2243 return tree;
2244 }
2245
2246 /* This function build the following tree, from regular expression a*:
2247 *
2248 |
2249 a
2250 */
2251
2252 static bin_tree_t *
2253 parse_expression (regexp, preg, token, syntax, nest, err)
2254 re_string_t *regexp;
2255 regex_t *preg;
2256 re_token_t *token;
2257 reg_syntax_t syntax;
2258 int nest;
2259 reg_errcode_t *err;
2260 {
2261 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2262 bin_tree_t *tree;
2263 switch (token->type)
2264 {
2265 case CHARACTER:
2266 tree = create_token_tree (dfa, NULL, NULL, token);
2267 if (BE (tree == NULL, 0))
2268 {
2269 *err = REG_ESPACE;
2270 return NULL;
2271 }
2272 #ifdef RE_ENABLE_I18N
2273 if (dfa->mb_cur_max > 1)
2274 {
2275 while (!re_string_eoi (regexp)
2276 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2277 {
2278 bin_tree_t *mbc_remain;
2279 fetch_token (token, regexp, syntax);
2280 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2281 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2282 if (BE (mbc_remain == NULL || tree == NULL, 0))
2283 {
2284 *err = REG_ESPACE;
2285 return NULL;
2286 }
2287 }
2288 }
2289 #endif
2290 break;
2291 case OP_OPEN_SUBEXP:
2292 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2293 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2294 return NULL;
2295 break;
2296 case OP_OPEN_BRACKET:
2297 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2298 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2299 return NULL;
2300 break;
2301 case OP_BACK_REF:
2302 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2303 {
2304 *err = REG_ESUBREG;
2305 return NULL;
2306 }
2307 dfa->used_bkref_map |= 1 << token->opr.idx;
2308 tree = create_token_tree (dfa, NULL, NULL, token);
2309 if (BE (tree == NULL, 0))
2310 {
2311 *err = REG_ESPACE;
2312 return NULL;
2313 }
2314 ++dfa->nbackref;
2315 dfa->has_mb_node = 1;
2316 break;
2317 case OP_OPEN_DUP_NUM:
2318 if (syntax & RE_CONTEXT_INVALID_DUP)
2319 {
2320 *err = REG_BADRPT;
2321 return NULL;
2322 }
2323 /* FALLTHROUGH */
2324 case OP_DUP_ASTERISK:
2325 case OP_DUP_PLUS:
2326 case OP_DUP_QUESTION:
2327 if (syntax & RE_CONTEXT_INVALID_OPS)
2328 {
2329 *err = REG_BADRPT;
2330 return NULL;
2331 }
2332 else if (syntax & RE_CONTEXT_INDEP_OPS)
2333 {
2334 fetch_token (token, regexp, syntax);
2335 return parse_expression (regexp, preg, token, syntax, nest, err);
2336 }
2337 /* else fall through */
2338 case OP_CLOSE_SUBEXP:
2339 if ((token->type == OP_CLOSE_SUBEXP) &&
2340 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2341 {
2342 *err = REG_ERPAREN;
2343 return NULL;
2344 }
2345 /* else fall through */
2346 case OP_CLOSE_DUP_NUM:
2347 /* We treat it as a normal character. */
2348
2349 /* Then we can these characters as normal characters. */
2350 token->type = CHARACTER;
2351 /* mb_partial and word_char bits should be initialized already
2352 by peek_token. */
2353 tree = create_token_tree (dfa, NULL, NULL, token);
2354 if (BE (tree == NULL, 0))
2355 {
2356 *err = REG_ESPACE;
2357 return NULL;
2358 }
2359 break;
2360 case ANCHOR:
2361 if ((token->opr.ctx_type
2362 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2363 && dfa->word_ops_used == 0)
2364 init_word_char (dfa);
2365 if (token->opr.ctx_type == WORD_DELIM
2366 || token->opr.ctx_type == NOT_WORD_DELIM)
2367 {
2368 bin_tree_t *tree_first, *tree_last;
2369 if (token->opr.ctx_type == WORD_DELIM)
2370 {
2371 token->opr.ctx_type = WORD_FIRST;
2372 tree_first = create_token_tree (dfa, NULL, NULL, token);
2373 token->opr.ctx_type = WORD_LAST;
2374 }
2375 else
2376 {
2377 token->opr.ctx_type = INSIDE_WORD;
2378 tree_first = create_token_tree (dfa, NULL, NULL, token);
2379 token->opr.ctx_type = INSIDE_NOTWORD;
2380 }
2381 tree_last = create_token_tree (dfa, NULL, NULL, token);
2382 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2383 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2384 {
2385 *err = REG_ESPACE;
2386 return NULL;
2387 }
2388 }
2389 else
2390 {
2391 tree = create_token_tree (dfa, NULL, NULL, token);
2392 if (BE (tree == NULL, 0))
2393 {
2394 *err = REG_ESPACE;
2395 return NULL;
2396 }
2397 }
2398 /* We must return here, since ANCHORs can't be followed
2399 by repetition operators.
2400 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2401 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2402 fetch_token (token, regexp, syntax);
2403 return tree;
2404 case OP_PERIOD:
2405 tree = create_token_tree (dfa, NULL, NULL, token);
2406 if (BE (tree == NULL, 0))
2407 {
2408 *err = REG_ESPACE;
2409 return NULL;
2410 }
2411 if (dfa->mb_cur_max > 1)
2412 dfa->has_mb_node = 1;
2413 break;
2414 case OP_WORD:
2415 case OP_NOTWORD:
2416 tree = build_charclass_op (dfa, regexp->trans,
2417 (const unsigned char *) "alnum",
2418 (const unsigned char *) "_",
2419 token->type == OP_NOTWORD, err);
2420 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2421 return NULL;
2422 break;
2423 case OP_SPACE:
2424 case OP_NOTSPACE:
2425 tree = build_charclass_op (dfa, regexp->trans,
2426 (const unsigned char *) "space",
2427 (const unsigned char *) "",
2428 token->type == OP_NOTSPACE, err);
2429 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2430 return NULL;
2431 break;
2432 case OP_ALT:
2433 case END_OF_RE:
2434 return NULL;
2435 case BACK_SLASH:
2436 *err = REG_EESCAPE;
2437 return NULL;
2438 default:
2439 /* Must not happen? */
2440 #ifdef DEBUG
2441 assert (0);
2442 #endif
2443 return NULL;
2444 }
2445 fetch_token (token, regexp, syntax);
2446
2447 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2448 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2449 {
2450 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2451 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2452 return NULL;
2453 /* In BRE consecutive duplications are not allowed. */
2454 if ((syntax & RE_CONTEXT_INVALID_DUP)
2455 && (token->type == OP_DUP_ASTERISK
2456 || token->type == OP_OPEN_DUP_NUM))
2457 {
2458 *err = REG_BADRPT;
2459 return NULL;
2460 }
2461 }
2462
2463 return tree;
2464 }
2465
2466 /* This function build the following tree, from regular expression
2467 (<reg_exp>):
2468 SUBEXP
2469 |
2470 <reg_exp>
2471 */
2472
2473 static bin_tree_t *
2474 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2475 re_string_t *regexp;
2476 regex_t *preg;
2477 re_token_t *token;
2478 reg_syntax_t syntax;
2479 int nest;
2480 reg_errcode_t *err;
2481 {
2482 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2483 bin_tree_t *tree;
2484 size_t cur_nsub;
2485 cur_nsub = preg->re_nsub++;
2486
2487 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2488
2489 /* The subexpression may be a null string. */
2490 if (token->type == OP_CLOSE_SUBEXP)
2491 tree = NULL;
2492 else
2493 {
2494 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2496 *err = REG_EPAREN;
2497 if (BE (*err != REG_NOERROR, 0))
2498 return NULL;
2499 }
2500
2501 if (cur_nsub <= '9' - '1')
2502 dfa->completed_bkref_map |= 1 << cur_nsub;
2503
2504 tree = create_tree (dfa, tree, NULL, SUBEXP);
2505 if (BE (tree == NULL, 0))
2506 {
2507 *err = REG_ESPACE;
2508 return NULL;
2509 }
2510 tree->token.opr.idx = cur_nsub;
2511 return tree;
2512 }
2513
2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2515
2516 static bin_tree_t *
2517 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2518 bin_tree_t *elem;
2519 re_string_t *regexp;
2520 re_dfa_t *dfa;
2521 re_token_t *token;
2522 reg_syntax_t syntax;
2523 reg_errcode_t *err;
2524 {
2525 bin_tree_t *tree = NULL, *old_tree = NULL;
2526 int i, start, end, start_idx = re_string_cur_idx (regexp);
2527 re_token_t start_token = *token;
2528
2529 if (token->type == OP_OPEN_DUP_NUM)
2530 {
2531 end = 0;
2532 start = fetch_number (regexp, token, syntax);
2533 if (start == -1)
2534 {
2535 if (token->type == CHARACTER && token->opr.c == ',')
2536 start = 0; /* We treat "{,m}" as "{0,m}". */
2537 else
2538 {
2539 *err = REG_BADBR; /* <re>{} is invalid. */
2540 return NULL;
2541 }
2542 }
2543 if (BE (start != -2, 1))
2544 {
2545 /* We treat "{n}" as "{n,n}". */
2546 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2547 : ((token->type == CHARACTER && token->opr.c == ',')
2548 ? fetch_number (regexp, token, syntax) : -2));
2549 }
2550 if (BE (start == -2 || end == -2, 0))
2551 {
2552 /* Invalid sequence. */
2553 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2554 {
2555 if (token->type == END_OF_RE)
2556 *err = REG_EBRACE;
2557 else
2558 *err = REG_BADBR;
2559
2560 return NULL;
2561 }
2562
2563 /* If the syntax bit is set, rollback. */
2564 re_string_set_index (regexp, start_idx);
2565 *token = start_token;
2566 token->type = CHARACTER;
2567 /* mb_partial and word_char bits should be already initialized by
2568 peek_token. */
2569 return elem;
2570 }
2571
2572 if (BE (end != -1 && start > end, 0))
2573 {
2574 /* First number greater than second. */
2575 *err = REG_BADBR;
2576 return NULL;
2577 }
2578 }
2579 else
2580 {
2581 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2582 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2583 }
2584
2585 fetch_token (token, regexp, syntax);
2586
2587 if (BE (elem == NULL, 0))
2588 return NULL;
2589 if (BE (start == 0 && end == 0, 0))
2590 {
2591 postorder (elem, free_tree, NULL);
2592 return NULL;
2593 }
2594
2595 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2596 if (BE (start > 0, 0))
2597 {
2598 tree = elem;
2599 for (i = 2; i <= start; ++i)
2600 {
2601 elem = duplicate_tree (elem, dfa);
2602 tree = create_tree (dfa, tree, elem, CONCAT);
2603 if (BE (elem == NULL || tree == NULL, 0))
2604 goto parse_dup_op_espace;
2605 }
2606
2607 if (start == end)
2608 return tree;
2609
2610 /* Duplicate ELEM before it is marked optional. */
2611 elem = duplicate_tree (elem, dfa);
2612 old_tree = tree;
2613 }
2614 else
2615 old_tree = NULL;
2616
2617 if (elem->token.type == SUBEXP)
2618 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2619
2620 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2621 if (BE (tree == NULL, 0))
2622 goto parse_dup_op_espace;
2623
2624 /* This loop is actually executed only when end != -1,
2625 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2626 already created the start+1-th copy. */
2627 for (i = start + 2; i <= end; ++i)
2628 {
2629 elem = duplicate_tree (elem, dfa);
2630 tree = create_tree (dfa, tree, elem, CONCAT);
2631 if (BE (elem == NULL || tree == NULL, 0))
2632 goto parse_dup_op_espace;
2633
2634 tree = create_tree (dfa, tree, NULL, OP_ALT);
2635 if (BE (tree == NULL, 0))
2636 goto parse_dup_op_espace;
2637 }
2638
2639 if (old_tree)
2640 tree = create_tree (dfa, old_tree, tree, CONCAT);
2641
2642 return tree;
2643
2644 parse_dup_op_espace:
2645 *err = REG_ESPACE;
2646 return NULL;
2647 }
2648
2649 /* Size of the names for collating symbol/equivalence_class/character_class.
2650 I'm not sure, but maybe enough. */
2651 #define BRACKET_NAME_BUF_SIZE 32
2652
2653 #ifndef _LIBC
2654 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2655 Build the range expression which starts from START_ELEM, and ends
2656 at END_ELEM. The result are written to MBCSET and SBCSET.
2657 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2658 mbcset->range_ends, is a pointer argument sinse we may
2659 update it. */
2660
2661 static reg_errcode_t
2662 # ifdef RE_ENABLE_I18N
2663 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2664 re_charset_t *mbcset;
2665 int *range_alloc;
2666 # else /* not RE_ENABLE_I18N */
2667 build_range_exp (sbcset, start_elem, end_elem)
2668 # endif /* not RE_ENABLE_I18N */
2669 re_bitset_ptr_t sbcset;
2670 bracket_elem_t *start_elem, *end_elem;
2671 {
2672 unsigned int start_ch, end_ch;
2673 /* Equivalence Classes and Character Classes can't be a range start/end. */
2674 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2675 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2676 0))
2677 return REG_ERANGE;
2678
2679 /* We can handle no multi character collating elements without libc
2680 support. */
2681 if (BE ((start_elem->type == COLL_SYM
2682 && strlen ((char *) start_elem->opr.name) > 1)
2683 || (end_elem->type == COLL_SYM
2684 && strlen ((char *) end_elem->opr.name) > 1), 0))
2685 return REG_ECOLLATE;
2686
2687 # ifdef RE_ENABLE_I18N
2688 {
2689 wchar_t wc;
2690 wint_t start_wc;
2691 wint_t end_wc;
2692 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2693
2694 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2695 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2696 : 0));
2697 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2698 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2699 : 0));
2700 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2701 ? __btowc (start_ch) : start_elem->opr.wch);
2702 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2703 ? __btowc (end_ch) : end_elem->opr.wch);
2704 if (start_wc == WEOF || end_wc == WEOF)
2705 return REG_ECOLLATE;
2706 cmp_buf[0] = start_wc;
2707 cmp_buf[4] = end_wc;
2708 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2709 return REG_ERANGE;
2710
2711 /* Got valid collation sequence values, add them as a new entry.
2712 However, for !_LIBC we have no collation elements: if the
2713 character set is single byte, the single byte character set
2714 that we build below suffices. parse_bracket_exp passes
2715 no MBCSET if dfa->mb_cur_max == 1. */
2716 if (mbcset)
2717 {
2718 /* Check the space of the arrays. */
2719 if (BE (*range_alloc == mbcset->nranges, 0))
2720 {
2721 /* There is not enough space, need realloc. */
2722 wchar_t *new_array_start, *new_array_end;
2723 int new_nranges;
2724
2725 /* +1 in case of mbcset->nranges is 0. */
2726 new_nranges = 2 * mbcset->nranges + 1;
2727 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2728 are NULL if *range_alloc == 0. */
2729 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2730 new_nranges);
2731 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2732 new_nranges);
2733
2734 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2735 return REG_ESPACE;
2736
2737 mbcset->range_starts = new_array_start;
2738 mbcset->range_ends = new_array_end;
2739 *range_alloc = new_nranges;
2740 }
2741
2742 mbcset->range_starts[mbcset->nranges] = start_wc;
2743 mbcset->range_ends[mbcset->nranges++] = end_wc;
2744 }
2745
2746 /* Build the table for single byte characters. */
2747 for (wc = 0; wc < SBC_MAX; ++wc)
2748 {
2749 cmp_buf[2] = wc;
2750 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2751 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2752 bitset_set (sbcset, wc);
2753 }
2754 }
2755 # else /* not RE_ENABLE_I18N */
2756 {
2757 unsigned int ch;
2758 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2759 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2760 : 0));
2761 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2762 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2763 : 0));
2764 if (start_ch > end_ch)
2765 return REG_ERANGE;
2766 /* Build the table for single byte characters. */
2767 for (ch = 0; ch < SBC_MAX; ++ch)
2768 if (start_ch <= ch && ch <= end_ch)
2769 bitset_set (sbcset, ch);
2770 }
2771 # endif /* not RE_ENABLE_I18N */
2772 return REG_NOERROR;
2773 }
2774 #endif /* not _LIBC */
2775
2776 #ifndef _LIBC
2777 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2778 Build the collating element which is represented by NAME.
2779 The result are written to MBCSET and SBCSET.
2780 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2781 pointer argument since we may update it. */
2782
2783 static reg_errcode_t
2784 # ifdef RE_ENABLE_I18N
2785 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2786 re_charset_t *mbcset;
2787 int *coll_sym_alloc;
2788 # else /* not RE_ENABLE_I18N */
2789 build_collating_symbol (sbcset, name)
2790 # endif /* not RE_ENABLE_I18N */
2791 re_bitset_ptr_t sbcset;
2792 const unsigned char *name;
2793 {
2794 size_t name_len = strlen ((const char *) name);
2795 if (BE (name_len != 1, 0))
2796 return REG_ECOLLATE;
2797 else
2798 {
2799 bitset_set (sbcset, name[0]);
2800 return REG_NOERROR;
2801 }
2802 }
2803 #endif /* not _LIBC */
2804
2805 /* This function parse bracket expression like "[abc]", "[a-c]",
2806 "[[.a-a.]]" etc. */
2807
2808 static bin_tree_t *
2809 parse_bracket_exp (regexp, dfa, token, syntax, err)
2810 re_string_t *regexp;
2811 re_dfa_t *dfa;
2812 re_token_t *token;
2813 reg_syntax_t syntax;
2814 reg_errcode_t *err;
2815 {
2816 #ifdef _LIBC
2817 const unsigned char *collseqmb;
2818 const char *collseqwc;
2819 uint32_t nrules;
2820 int32_t table_size;
2821 const int32_t *symb_table;
2822 const unsigned char *extra;
2823
2824 /* Local function for parse_bracket_exp used in _LIBC environement.
2825 Seek the collating symbol entry correspondings to NAME.
2826 Return the index of the symbol in the SYMB_TABLE. */
2827
2828 auto inline int32_t
2829 __attribute ((always_inline))
2830 seek_collating_symbol_entry (name, name_len)
2831 const unsigned char *name;
2832 size_t name_len;
2833 {
2834 int32_t hash = elem_hash ((const char *) name, name_len);
2835 int32_t elem = hash % table_size;
2836 int32_t second = hash % (table_size - 2);
2837 while (symb_table[2 * elem] != 0)
2838 {
2839 /* First compare the hashing value. */
2840 if (symb_table[2 * elem] == hash
2841 /* Compare the length of the name. */
2842 && name_len == extra[symb_table[2 * elem + 1]]
2843 /* Compare the name. */
2844 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2845 name_len) == 0)
2846 {
2847 /* Yep, this is the entry. */
2848 break;
2849 }
2850
2851 /* Next entry. */
2852 elem += second;
2853 }
2854 return elem;
2855 }
2856
2857 /* Local function for parse_bracket_exp used in _LIBC environement.
2858 Look up the collation sequence value of BR_ELEM.
2859 Return the value if succeeded, UINT_MAX otherwise. */
2860
2861 auto inline unsigned int
2862 __attribute ((always_inline))
2863 lookup_collation_sequence_value (br_elem)
2864 bracket_elem_t *br_elem;
2865 {
2866 if (br_elem->type == SB_CHAR)
2867 {
2868 /*
2869 if (MB_CUR_MAX == 1)
2870 */
2871 if (nrules == 0)
2872 return collseqmb[br_elem->opr.ch];
2873 else
2874 {
2875 wint_t wc = __btowc (br_elem->opr.ch);
2876 return __collseq_table_lookup (collseqwc, wc);
2877 }
2878 }
2879 else if (br_elem->type == MB_CHAR)
2880 {
2881 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2882 }
2883 else if (br_elem->type == COLL_SYM)
2884 {
2885 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2886 if (nrules != 0)
2887 {
2888 int32_t elem, idx;
2889 elem = seek_collating_symbol_entry (br_elem->opr.name,
2890 sym_name_len);
2891 if (symb_table[2 * elem] != 0)
2892 {
2893 /* We found the entry. */
2894 idx = symb_table[2 * elem + 1];
2895 /* Skip the name of collating element name. */
2896 idx += 1 + extra[idx];
2897 /* Skip the byte sequence of the collating element. */
2898 idx += 1 + extra[idx];
2899 /* Adjust for the alignment. */
2900 idx = (idx + 3) & ~3;
2901 /* Skip the multibyte collation sequence value. */
2902 idx += sizeof (unsigned int);
2903 /* Skip the wide char sequence of the collating element. */
2904 idx += sizeof (unsigned int) *
2905 (1 + *(unsigned int *) (extra + idx));
2906 /* Return the collation sequence value. */
2907 return *(unsigned int *) (extra + idx);
2908 }
2909 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2910 {
2911 /* No valid character. Match it as a single byte
2912 character. */
2913 return collseqmb[br_elem->opr.name[0]];
2914 }
2915 }
2916 else if (sym_name_len == 1)
2917 return collseqmb[br_elem->opr.name[0]];
2918 }
2919 return UINT_MAX;
2920 }
2921
2922 /* Local function for parse_bracket_exp used in _LIBC environement.
2923 Build the range expression which starts from START_ELEM, and ends
2924 at END_ELEM. The result are written to MBCSET and SBCSET.
2925 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2926 mbcset->range_ends, is a pointer argument sinse we may
2927 update it. */
2928
2929 auto inline reg_errcode_t
2930 __attribute ((always_inline))
2931 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2932 re_charset_t *mbcset;
2933 int *range_alloc;
2934 re_bitset_ptr_t sbcset;
2935 bracket_elem_t *start_elem, *end_elem;
2936 {
2937 unsigned int ch;
2938 uint32_t start_collseq;
2939 uint32_t end_collseq;
2940
2941 /* Equivalence Classes and Character Classes can't be a range
2942 start/end. */
2943 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2944 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2945 0))
2946 return REG_ERANGE;
2947
2948 start_collseq = lookup_collation_sequence_value (start_elem);
2949 end_collseq = lookup_collation_sequence_value (end_elem);
2950 /* Check start/end collation sequence values. */
2951 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2952 return REG_ECOLLATE;
2953 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2954 return REG_ERANGE;
2955
2956 /* Got valid collation sequence values, add them as a new entry.
2957 However, if we have no collation elements, and the character set
2958 is single byte, the single byte character set that we
2959 build below suffices. */
2960 if (nrules > 0 || dfa->mb_cur_max > 1)
2961 {
2962 /* Check the space of the arrays. */
2963 if (BE (*range_alloc == mbcset->nranges, 0))
2964 {
2965 /* There is not enough space, need realloc. */
2966 uint32_t *new_array_start;
2967 uint32_t *new_array_end;
2968 int new_nranges;
2969
2970 /* +1 in case of mbcset->nranges is 0. */
2971 new_nranges = 2 * mbcset->nranges + 1;
2972 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2973 new_nranges);
2974 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2975 new_nranges);
2976
2977 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2978 return REG_ESPACE;
2979
2980 mbcset->range_starts = new_array_start;
2981 mbcset->range_ends = new_array_end;
2982 *range_alloc = new_nranges;
2983 }
2984
2985 mbcset->range_starts[mbcset->nranges] = start_collseq;
2986 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2987 }
2988
2989 /* Build the table for single byte characters. */
2990 for (ch = 0; ch < SBC_MAX; ch++)
2991 {
2992 uint32_t ch_collseq;
2993 /*
2994 if (MB_CUR_MAX == 1)
2995 */
2996 if (nrules == 0)
2997 ch_collseq = collseqmb[ch];
2998 else
2999 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3000 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3001 bitset_set (sbcset, ch);
3002 }
3003 return REG_NOERROR;
3004 }
3005
3006 /* Local function for parse_bracket_exp used in _LIBC environement.
3007 Build the collating element which is represented by NAME.
3008 The result are written to MBCSET and SBCSET.
3009 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3010 pointer argument sinse we may update it. */
3011
3012 auto inline reg_errcode_t
3013 __attribute ((always_inline))
3014 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3015 re_charset_t *mbcset;
3016 int *coll_sym_alloc;
3017 re_bitset_ptr_t sbcset;
3018 const unsigned char *name;
3019 {
3020 int32_t elem, idx;
3021 size_t name_len = strlen ((const char *) name);
3022 if (nrules != 0)
3023 {
3024 elem = seek_collating_symbol_entry (name, name_len);
3025 if (symb_table[2 * elem] != 0)
3026 {
3027 /* We found the entry. */
3028 idx = symb_table[2 * elem + 1];
3029 /* Skip the name of collating element name. */
3030 idx += 1 + extra[idx];
3031 }
3032 else if (symb_table[2 * elem] == 0 && name_len == 1)
3033 {
3034 /* No valid character, treat it as a normal
3035 character. */
3036 bitset_set (sbcset, name[0]);
3037 return REG_NOERROR;
3038 }
3039 else
3040 return REG_ECOLLATE;
3041
3042 /* Got valid collation sequence, add it as a new entry. */
3043 /* Check the space of the arrays. */
3044 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3045 {
3046 /* Not enough, realloc it. */
3047 /* +1 in case of mbcset->ncoll_syms is 0. */
3048 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3049 /* Use realloc since mbcset->coll_syms is NULL
3050 if *alloc == 0. */
3051 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3052 new_coll_sym_alloc);
3053 if (BE (new_coll_syms == NULL, 0))
3054 return REG_ESPACE;
3055 mbcset->coll_syms = new_coll_syms;
3056 *coll_sym_alloc = new_coll_sym_alloc;
3057 }
3058 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3059 return REG_NOERROR;
3060 }
3061 else
3062 {
3063 if (BE (name_len != 1, 0))
3064 return REG_ECOLLATE;
3065 else
3066 {
3067 bitset_set (sbcset, name[0]);
3068 return REG_NOERROR;
3069 }
3070 }
3071 }
3072 #endif
3073
3074 re_token_t br_token;
3075 re_bitset_ptr_t sbcset;
3076 #ifdef RE_ENABLE_I18N
3077 re_charset_t *mbcset;
3078 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3079 int equiv_class_alloc = 0, char_class_alloc = 0;
3080 #endif /* not RE_ENABLE_I18N */
3081 int non_match = 0;
3082 bin_tree_t *work_tree;
3083 int token_len;
3084 int first_round = 1;
3085 #ifdef _LIBC
3086 collseqmb = (const unsigned char *)
3087 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3088 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3089 if (nrules)
3090 {
3091 /*
3092 if (MB_CUR_MAX > 1)
3093 */
3094 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3095 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3096 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3097 _NL_COLLATE_SYMB_TABLEMB);
3098 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3099 _NL_COLLATE_SYMB_EXTRAMB);
3100 }
3101 #endif
3102 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3103 #ifdef RE_ENABLE_I18N
3104 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3105 #endif /* RE_ENABLE_I18N */
3106 #ifdef RE_ENABLE_I18N
3107 if (BE (sbcset == NULL || mbcset == NULL, 0))
3108 #else
3109 if (BE (sbcset == NULL, 0))
3110 #endif /* RE_ENABLE_I18N */
3111 {
3112 *err = REG_ESPACE;
3113 return NULL;
3114 }
3115
3116 token_len = peek_token_bracket (token, regexp, syntax);
3117 if (BE (token->type == END_OF_RE, 0))
3118 {
3119 *err = REG_BADPAT;
3120 goto parse_bracket_exp_free_return;
3121 }
3122 if (token->type == OP_NON_MATCH_LIST)
3123 {
3124 #ifdef RE_ENABLE_I18N
3125 mbcset->non_match = 1;
3126 #endif /* not RE_ENABLE_I18N */
3127 non_match = 1;
3128 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3129 bitset_set (sbcset, '\0');
3130 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3131 token_len = peek_token_bracket (token, regexp, syntax);
3132 if (BE (token->type == END_OF_RE, 0))
3133 {
3134 *err = REG_BADPAT;
3135 goto parse_bracket_exp_free_return;
3136 }
3137 }
3138
3139 /* We treat the first ']' as a normal character. */
3140 if (token->type == OP_CLOSE_BRACKET)
3141 token->type = CHARACTER;
3142
3143 while (1)
3144 {
3145 bracket_elem_t start_elem, end_elem;
3146 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3147 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3148 reg_errcode_t ret;
3149 int token_len2 = 0, is_range_exp = 0;
3150 re_token_t token2;
3151
3152 start_elem.opr.name = start_name_buf;
3153 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3154 syntax, first_round);
3155 if (BE (ret != REG_NOERROR, 0))
3156 {
3157 *err = ret;
3158 goto parse_bracket_exp_free_return;
3159 }
3160 first_round = 0;
3161
3162 /* Get information about the next token. We need it in any case. */
3163 token_len = peek_token_bracket (token, regexp, syntax);
3164
3165 /* Do not check for ranges if we know they are not allowed. */
3166 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3167 {
3168 if (BE (token->type == END_OF_RE, 0))
3169 {
3170 *err = REG_EBRACK;
3171 goto parse_bracket_exp_free_return;
3172 }
3173 if (token->type == OP_CHARSET_RANGE)
3174 {
3175 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3176 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3177 if (BE (token2.type == END_OF_RE, 0))
3178 {
3179 *err = REG_EBRACK;
3180 goto parse_bracket_exp_free_return;
3181 }
3182 if (token2.type == OP_CLOSE_BRACKET)
3183 {
3184 /* We treat the last '-' as a normal character. */
3185 re_string_skip_bytes (regexp, -token_len);
3186 token->type = CHARACTER;
3187 }
3188 else
3189 is_range_exp = 1;
3190 }
3191 }
3192
3193 if (is_range_exp == 1)
3194 {
3195 end_elem.opr.name = end_name_buf;
3196 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3197 dfa, syntax, 1);
3198 if (BE (ret != REG_NOERROR, 0))
3199 {
3200 *err = ret;
3201 goto parse_bracket_exp_free_return;
3202 }
3203
3204 token_len = peek_token_bracket (token, regexp, syntax);
3205
3206 #ifdef _LIBC
3207 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3208 &start_elem, &end_elem);
3209 #else
3210 # ifdef RE_ENABLE_I18N
3211 *err = build_range_exp (sbcset,
3212 dfa->mb_cur_max > 1 ? mbcset : NULL,
3213 &range_alloc, &start_elem, &end_elem);
3214 # else
3215 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3216 # endif
3217 #endif /* RE_ENABLE_I18N */
3218 if (BE (*err != REG_NOERROR, 0))
3219 goto parse_bracket_exp_free_return;
3220 }
3221 else
3222 {
3223 switch (start_elem.type)
3224 {
3225 case SB_CHAR:
3226 bitset_set (sbcset, start_elem.opr.ch);
3227 break;
3228 #ifdef RE_ENABLE_I18N
3229 case MB_CHAR:
3230 /* Check whether the array has enough space. */
3231 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3232 {
3233 wchar_t *new_mbchars;
3234 /* Not enough, realloc it. */
3235 /* +1 in case of mbcset->nmbchars is 0. */
3236 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3237 /* Use realloc since array is NULL if *alloc == 0. */
3238 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3239 mbchar_alloc);
3240 if (BE (new_mbchars == NULL, 0))
3241 goto parse_bracket_exp_espace;
3242 mbcset->mbchars = new_mbchars;
3243 }
3244 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3245 break;
3246 #endif /* RE_ENABLE_I18N */
3247 case EQUIV_CLASS:
3248 *err = build_equiv_class (sbcset,
3249 #ifdef RE_ENABLE_I18N
3250 mbcset, &equiv_class_alloc,
3251 #endif /* RE_ENABLE_I18N */
3252 start_elem.opr.name);
3253 if (BE (*err != REG_NOERROR, 0))
3254 goto parse_bracket_exp_free_return;
3255 break;
3256 case COLL_SYM:
3257 *err = build_collating_symbol (sbcset,
3258 #ifdef RE_ENABLE_I18N
3259 mbcset, &coll_sym_alloc,
3260 #endif /* RE_ENABLE_I18N */
3261 start_elem.opr.name);
3262 if (BE (*err != REG_NOERROR, 0))
3263 goto parse_bracket_exp_free_return;
3264 break;
3265 case CHAR_CLASS:
3266 *err = build_charclass (regexp->trans, sbcset,
3267 #ifdef RE_ENABLE_I18N
3268 mbcset, &char_class_alloc,
3269 #endif /* RE_ENABLE_I18N */
3270 start_elem.opr.name, syntax);
3271 if (BE (*err != REG_NOERROR, 0))
3272 goto parse_bracket_exp_free_return;
3273 break;
3274 default:
3275 assert (0);
3276 break;
3277 }
3278 }
3279 if (BE (token->type == END_OF_RE, 0))
3280 {
3281 *err = REG_EBRACK;
3282 goto parse_bracket_exp_free_return;
3283 }
3284 if (token->type == OP_CLOSE_BRACKET)
3285 break;
3286 }
3287
3288 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3289
3290 /* If it is non-matching list. */
3291 if (non_match)
3292 bitset_not (sbcset);
3293
3294 #ifdef RE_ENABLE_I18N
3295 /* Ensure only single byte characters are set. */
3296 if (dfa->mb_cur_max > 1)
3297 bitset_mask (sbcset, dfa->sb_char);
3298
3299 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3300 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3301 || mbcset->non_match)))
3302 {
3303 bin_tree_t *mbc_tree;
3304 int sbc_idx;
3305 /* Build a tree for complex bracket. */
3306 dfa->has_mb_node = 1;
3307 br_token.type = COMPLEX_BRACKET;
3308 br_token.opr.mbcset = mbcset;
3309 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3310 if (BE (mbc_tree == NULL, 0))
3311 goto parse_bracket_exp_espace;
3312 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3313 if (sbcset[sbc_idx])
3314 break;
3315 /* If there are no bits set in sbcset, there is no point
3316 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3317 if (sbc_idx < BITSET_UINTS)
3318 {
3319 /* Build a tree for simple bracket. */
3320 br_token.type = SIMPLE_BRACKET;
3321 br_token.opr.sbcset = sbcset;
3322 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3323 if (BE (work_tree == NULL, 0))
3324 goto parse_bracket_exp_espace;
3325
3326 /* Then join them by ALT node. */
3327 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3328 if (BE (work_tree == NULL, 0))
3329 goto parse_bracket_exp_espace;
3330 }
3331 else
3332 {
3333 re_free (sbcset);
3334 work_tree = mbc_tree;
3335 }
3336 }
3337 else
3338 #endif /* not RE_ENABLE_I18N */
3339 {
3340 #ifdef RE_ENABLE_I18N
3341 free_charset (mbcset);
3342 #endif
3343 /* Build a tree for simple bracket. */
3344 br_token.type = SIMPLE_BRACKET;
3345 br_token.opr.sbcset = sbcset;
3346 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3347 if (BE (work_tree == NULL, 0))
3348 goto parse_bracket_exp_espace;
3349 }
3350 return work_tree;
3351
3352 parse_bracket_exp_espace:
3353 *err = REG_ESPACE;
3354 parse_bracket_exp_free_return:
3355 re_free (sbcset);
3356 #ifdef RE_ENABLE_I18N
3357 free_charset (mbcset);
3358 #endif /* RE_ENABLE_I18N */
3359 return NULL;
3360 }
3361
3362 /* Parse an element in the bracket expression. */
3363
3364 static reg_errcode_t
3365 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3366 accept_hyphen)
3367 bracket_elem_t *elem;
3368 re_string_t *regexp;
3369 re_token_t *token;
3370 int token_len;
3371 re_dfa_t *dfa;
3372 reg_syntax_t syntax;
3373 int accept_hyphen;
3374 {
3375 #ifdef RE_ENABLE_I18N
3376 int cur_char_size;
3377 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3378 if (cur_char_size > 1)
3379 {
3380 elem->type = MB_CHAR;
3381 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3382 re_string_skip_bytes (regexp, cur_char_size);
3383 return REG_NOERROR;
3384 }
3385 #endif /* RE_ENABLE_I18N */
3386 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3387 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3388 || token->type == OP_OPEN_EQUIV_CLASS)
3389 return parse_bracket_symbol (elem, regexp, token);
3390 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3391 {
3392 /* A '-' must only appear as anything but a range indicator before
3393 the closing bracket. Everything else is an error. */
3394 re_token_t token2;
3395 (void) peek_token_bracket (&token2, regexp, syntax);
3396 if (token2.type != OP_CLOSE_BRACKET)
3397 /* The actual error value is not standardized since this whole
3398 case is undefined. But ERANGE makes good sense. */
3399 return REG_ERANGE;
3400 }
3401 elem->type = SB_CHAR;
3402 elem->opr.ch = token->opr.c;
3403 return REG_NOERROR;
3404 }
3405
3406 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3407 such as [:<character_class>:], [.<collating_element>.], and
3408 [=<equivalent_class>=]. */
3409
3410 static reg_errcode_t
3411 parse_bracket_symbol (elem, regexp, token)
3412 bracket_elem_t *elem;
3413 re_string_t *regexp;
3414 re_token_t *token;
3415 {
3416 unsigned char ch, delim = token->opr.c;
3417 int i = 0;
3418 if (re_string_eoi(regexp))
3419 return REG_EBRACK;
3420 for (;; ++i)
3421 {
3422 if (i >= BRACKET_NAME_BUF_SIZE)
3423 return REG_EBRACK;
3424 if (token->type == OP_OPEN_CHAR_CLASS)
3425 ch = re_string_fetch_byte_case (regexp);
3426 else
3427 ch = re_string_fetch_byte (regexp);
3428 if (re_string_eoi(regexp))
3429 return REG_EBRACK;
3430 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3431 break;
3432 elem->opr.name[i] = ch;
3433 }
3434 re_string_skip_bytes (regexp, 1);
3435 elem->opr.name[i] = '\0';
3436 switch (token->type)
3437 {
3438 case OP_OPEN_COLL_ELEM:
3439 elem->type = COLL_SYM;
3440 break;
3441 case OP_OPEN_EQUIV_CLASS:
3442 elem->type = EQUIV_CLASS;
3443 break;
3444 case OP_OPEN_CHAR_CLASS:
3445 elem->type = CHAR_CLASS;
3446 break;
3447 default:
3448 break;
3449 }
3450 return REG_NOERROR;
3451 }
3452
3453 /* Helper function for parse_bracket_exp.
3454 Build the equivalence class which is represented by NAME.
3455 The result are written to MBCSET and SBCSET.
3456 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3457 is a pointer argument sinse we may update it. */
3458
3459 static reg_errcode_t
3460 #ifdef RE_ENABLE_I18N
3461 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3462 re_charset_t *mbcset;
3463 int *equiv_class_alloc;
3464 #else /* not RE_ENABLE_I18N */
3465 build_equiv_class (sbcset, name)
3466 #endif /* not RE_ENABLE_I18N */
3467 re_bitset_ptr_t sbcset;
3468 const unsigned char *name;
3469 {
3470 #if defined _LIBC
3471 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3472 if (nrules != 0)
3473 {
3474 const int32_t *table, *indirect;
3475 const unsigned char *weights, *extra, *cp;
3476 unsigned char char_buf[2];
3477 int32_t idx1, idx2;
3478 unsigned int ch;
3479 size_t len;
3480 /* This #include defines a local function! */
3481 # include <locale/weight.h>
3482 /* Calculate the index for equivalence class. */
3483 cp = name;
3484 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3485 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3486 _NL_COLLATE_WEIGHTMB);
3487 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3488 _NL_COLLATE_EXTRAMB);
3489 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3490 _NL_COLLATE_INDIRECTMB);
3491 idx1 = findidx (&cp);
3492 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3493 /* This isn't a valid character. */
3494 return REG_ECOLLATE;
3495
3496 /* Build single byte matcing table for this equivalence class. */
3497 char_buf[1] = (unsigned char) '\0';
3498 len = weights[idx1];
3499 for (ch = 0; ch < SBC_MAX; ++ch)
3500 {
3501 char_buf[0] = ch;
3502 cp = char_buf;
3503 idx2 = findidx (&cp);
3504 /*
3505 idx2 = table[ch];
3506 */
3507 if (idx2 == 0)
3508 /* This isn't a valid character. */
3509 continue;
3510 if (len == weights[idx2])
3511 {
3512 int cnt = 0;
3513 while (cnt <= len &&
3514 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3515 ++cnt;
3516
3517 if (cnt > len)
3518 bitset_set (sbcset, ch);
3519 }
3520 }
3521 /* Check whether the array has enough space. */
3522 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3523 {
3524 /* Not enough, realloc it. */
3525 /* +1 in case of mbcset->nequiv_classes is 0. */
3526 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3527 /* Use realloc since the array is NULL if *alloc == 0. */
3528 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3529 int32_t,
3530 new_equiv_class_alloc);
3531 if (BE (new_equiv_classes == NULL, 0))
3532 return REG_ESPACE;
3533 mbcset->equiv_classes = new_equiv_classes;
3534 *equiv_class_alloc = new_equiv_class_alloc;
3535 }
3536 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3537 }
3538 else
3539 #endif /* _LIBC */
3540 {
3541 if (BE (strlen ((const char *) name) != 1, 0))
3542 return REG_ECOLLATE;
3543 bitset_set (sbcset, *name);
3544 }
3545 return REG_NOERROR;
3546 }
3547
3548 /* Helper function for parse_bracket_exp.
3549 Build the character class which is represented by NAME.
3550 The result are written to MBCSET and SBCSET.
3551 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3552 is a pointer argument sinse we may update it. */
3553
3554 static reg_errcode_t
3555 #ifdef RE_ENABLE_I18N
3556 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3557 re_charset_t *mbcset;
3558 int *char_class_alloc;
3559 #else /* not RE_ENABLE_I18N */
3560 build_charclass (trans, sbcset, class_name, syntax)
3561 #endif /* not RE_ENABLE_I18N */
3562 RE_TRANSLATE_TYPE trans;
3563 re_bitset_ptr_t sbcset;
3564 const unsigned char *class_name;
3565 reg_syntax_t syntax;
3566 {
3567 int i;
3568 const char *name = (const char *) class_name;
3569
3570 /* In case of REG_ICASE "upper" and "lower" match the both of
3571 upper and lower cases. */
3572 if ((syntax & RE_ICASE)
3573 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3574 name = "alpha";
3575
3576 #ifdef RE_ENABLE_I18N
3577 /* Check the space of the arrays. */
3578 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3579 {
3580 /* Not enough, realloc it. */
3581 /* +1 in case of mbcset->nchar_classes is 0. */
3582 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3583 /* Use realloc since array is NULL if *alloc == 0. */
3584 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3585 new_char_class_alloc);
3586 if (BE (new_char_classes == NULL, 0))
3587 return REG_ESPACE;
3588 mbcset->char_classes = new_char_classes;
3589 *char_class_alloc = new_char_class_alloc;
3590 }
3591 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3592 #endif /* RE_ENABLE_I18N */
3593
3594 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3595 for (i = 0; i < SBC_MAX; ++i) \
3596 { \
3597 if (ctype_func (i)) \
3598 { \
3599 int ch = trans ? trans[i] : i; \
3600 bitset_set (sbcset, ch); \
3601 } \
3602 }
3603
3604 if (strcmp (name, "alnum") == 0)
3605 BUILD_CHARCLASS_LOOP (isalnum)
3606 else if (strcmp (name, "cntrl") == 0)
3607 BUILD_CHARCLASS_LOOP (iscntrl)
3608 else if (strcmp (name, "lower") == 0)
3609 BUILD_CHARCLASS_LOOP (islower)
3610 else if (strcmp (name, "space") == 0)
3611 BUILD_CHARCLASS_LOOP (isspace)
3612 else if (strcmp (name, "alpha") == 0)
3613 BUILD_CHARCLASS_LOOP (isalpha)
3614 else if (strcmp (name, "digit") == 0)
3615 BUILD_CHARCLASS_LOOP (isdigit)
3616 else if (strcmp (name, "print") == 0)
3617 BUILD_CHARCLASS_LOOP (isprint)
3618 else if (strcmp (name, "upper") == 0)
3619 BUILD_CHARCLASS_LOOP (isupper)
3620 else if (strcmp (name, "blank") == 0)
3621 BUILD_CHARCLASS_LOOP (isblank)
3622 else if (strcmp (name, "graph") == 0)
3623 BUILD_CHARCLASS_LOOP (isgraph)
3624 else if (strcmp (name, "punct") == 0)
3625 BUILD_CHARCLASS_LOOP (ispunct)
3626 else if (strcmp (name, "xdigit") == 0)
3627 BUILD_CHARCLASS_LOOP (isxdigit)
3628 else
3629 return REG_ECTYPE;
3630
3631 return REG_NOERROR;
3632 }
3633
3634 static bin_tree_t *
3635 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3636 re_dfa_t *dfa;
3637 RE_TRANSLATE_TYPE trans;
3638 const unsigned char *class_name;
3639 const unsigned char *extra;
3640 int non_match;
3641 reg_errcode_t *err;
3642 {
3643 re_bitset_ptr_t sbcset;
3644 #ifdef RE_ENABLE_I18N
3645 re_charset_t *mbcset;
3646 int alloc = 0;
3647 #endif /* not RE_ENABLE_I18N */
3648 reg_errcode_t ret;
3649 re_token_t br_token;
3650 bin_tree_t *tree;
3651
3652 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3653 #ifdef RE_ENABLE_I18N
3654 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3655 #endif /* RE_ENABLE_I18N */
3656
3657 #ifdef RE_ENABLE_I18N
3658 if (BE (sbcset == NULL || mbcset == NULL, 0))
3659 #else /* not RE_ENABLE_I18N */
3660 if (BE (sbcset == NULL, 0))
3661 #endif /* not RE_ENABLE_I18N */
3662 {
3663 *err = REG_ESPACE;
3664 return NULL;
3665 }
3666
3667 if (non_match)
3668 {
3669 #ifdef RE_ENABLE_I18N
3670 /*
3671 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3672 bitset_set(cset->sbcset, '\0');
3673 */
3674 mbcset->non_match = 1;
3675 #endif /* not RE_ENABLE_I18N */
3676 }
3677
3678 /* We don't care the syntax in this case. */
3679 ret = build_charclass (trans, sbcset,
3680 #ifdef RE_ENABLE_I18N
3681 mbcset, &alloc,
3682 #endif /* RE_ENABLE_I18N */
3683 class_name, 0);
3684
3685 if (BE (ret != REG_NOERROR, 0))
3686 {
3687 re_free (sbcset);
3688 #ifdef RE_ENABLE_I18N
3689 free_charset (mbcset);
3690 #endif /* RE_ENABLE_I18N */
3691 *err = ret;
3692 return NULL;
3693 }
3694 /* \w match '_' also. */
3695 for (; *extra; extra++)
3696 bitset_set (sbcset, *extra);
3697
3698 /* If it is non-matching list. */
3699 if (non_match)
3700 bitset_not (sbcset);
3701
3702 #ifdef RE_ENABLE_I18N
3703 /* Ensure only single byte characters are set. */
3704 if (dfa->mb_cur_max > 1)
3705 bitset_mask (sbcset, dfa->sb_char);
3706 #endif
3707
3708 /* Build a tree for simple bracket. */
3709 br_token.type = SIMPLE_BRACKET;
3710 br_token.opr.sbcset = sbcset;
3711 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3712 if (BE (tree == NULL, 0))
3713 goto build_word_op_espace;
3714
3715 #ifdef RE_ENABLE_I18N
3716 if (dfa->mb_cur_max > 1)
3717 {
3718 bin_tree_t *mbc_tree;
3719 /* Build a tree for complex bracket. */
3720 br_token.type = COMPLEX_BRACKET;
3721 br_token.opr.mbcset = mbcset;
3722 dfa->has_mb_node = 1;
3723 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724 if (BE (mbc_tree == NULL, 0))
3725 goto build_word_op_espace;
3726 /* Then join them by ALT node. */
3727 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3728 if (BE (mbc_tree != NULL, 1))
3729 return tree;
3730 }
3731 else
3732 {
3733 free_charset (mbcset);
3734 return tree;
3735 }
3736 #else /* not RE_ENABLE_I18N */
3737 return tree;
3738 #endif /* not RE_ENABLE_I18N */
3739
3740 build_word_op_espace:
3741 re_free (sbcset);
3742 #ifdef RE_ENABLE_I18N
3743 free_charset (mbcset);
3744 #endif /* RE_ENABLE_I18N */
3745 *err = REG_ESPACE;
3746 return NULL;
3747 }
3748
3749 /* This is intended for the expressions like "a{1,3}".
3750 Fetch a number from `input', and return the number.
3751 Return -1, if the number field is empty like "{,1}".
3752 Return -2, If an error is occured. */
3753
3754 static int
3755 fetch_number (input, token, syntax)
3756 re_string_t *input;
3757 re_token_t *token;
3758 reg_syntax_t syntax;
3759 {
3760 int num = -1;
3761 unsigned char c;
3762 while (1)
3763 {
3764 fetch_token (token, input, syntax);
3765 c = token->opr.c;
3766 if (BE (token->type == END_OF_RE, 0))
3767 return -2;
3768 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3769 break;
3770 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3771 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3772 num = (num > RE_DUP_MAX) ? -2 : num;
3773 }
3774 return num;
3775 }
3776 \f
3777 #ifdef RE_ENABLE_I18N
3778 static void
3779 free_charset (re_charset_t *cset)
3780 {
3781 re_free (cset->mbchars);
3782 # ifdef _LIBC
3783 re_free (cset->coll_syms);
3784 re_free (cset->equiv_classes);
3785 re_free (cset->range_starts);
3786 re_free (cset->range_ends);
3787 # endif
3788 re_free (cset->char_classes);
3789 re_free (cset);
3790 }
3791 #endif /* RE_ENABLE_I18N */
3792 \f
3793 /* Functions for binary tree operation. */
3794
3795 /* Create a tree node. */
3796
3797 static bin_tree_t *
3798 create_tree (dfa, left, right, type)
3799 re_dfa_t *dfa;
3800 bin_tree_t *left;
3801 bin_tree_t *right;
3802 re_token_type_t type;
3803 {
3804 re_token_t t;
3805 t.type = type;
3806 return create_token_tree (dfa, left, right, &t);
3807 }
3808
3809 static bin_tree_t *
3810 create_token_tree (dfa, left, right, token)
3811 re_dfa_t *dfa;
3812 bin_tree_t *left;
3813 bin_tree_t *right;
3814 const re_token_t *token;
3815 {
3816 bin_tree_t *tree;
3817 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3818 {
3819 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3820
3821 if (storage == NULL)
3822 return NULL;
3823 storage->next = dfa->str_tree_storage;
3824 dfa->str_tree_storage = storage;
3825 dfa->str_tree_storage_idx = 0;
3826 }
3827 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3828
3829 tree->parent = NULL;
3830 tree->left = left;
3831 tree->right = right;
3832 tree->token = *token;
3833 tree->token.duplicated = 0;
3834 tree->token.opt_subexp = 0;
3835 tree->first = NULL;
3836 tree->next = NULL;
3837 tree->node_idx = -1;
3838
3839 if (left != NULL)
3840 left->parent = tree;
3841 if (right != NULL)
3842 right->parent = tree;
3843 return tree;
3844 }
3845
3846 /* Mark the tree SRC as an optional subexpression.
3847 To be called from preorder or postorder. */
3848
3849 static reg_errcode_t
3850 mark_opt_subexp (extra, node)
3851 void *extra;
3852 bin_tree_t *node;
3853 {
3854 int idx = (int) (long) extra;
3855 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3856 node->token.opt_subexp = 1;
3857
3858 return REG_NOERROR;
3859 }
3860
3861 /* Free the allocated memory inside NODE. */
3862
3863 static void
3864 free_token (re_token_t *node)
3865 {
3866 #ifdef RE_ENABLE_I18N
3867 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3868 free_charset (node->opr.mbcset);
3869 else
3870 #endif /* RE_ENABLE_I18N */
3871 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3872 re_free (node->opr.sbcset);
3873 }
3874
3875 /* Worker function for tree walking. Free the allocated memory inside NODE
3876 and its children. */
3877
3878 static reg_errcode_t
3879 free_tree (void *extra, bin_tree_t *node)
3880 {
3881 free_token (&node->token);
3882 return REG_NOERROR;
3883 }
3884
3885
3886 /* Duplicate the node SRC, and return new node. This is a preorder
3887 visit similar to the one implemented by the generic visitor, but
3888 we need more infrastructure to maintain two parallel trees --- so,
3889 it's easier to duplicate. */
3890
3891 static bin_tree_t *
3892 duplicate_tree (root, dfa)
3893 const bin_tree_t *root;
3894 re_dfa_t *dfa;
3895 {
3896 const bin_tree_t *node;
3897 bin_tree_t *dup_root;
3898 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3899
3900 for (node = root; ; )
3901 {
3902 /* Create a new tree and link it back to the current parent. */
3903 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3904 if (*p_new == NULL)
3905 return NULL;
3906 (*p_new)->parent = dup_node;
3907 (*p_new)->token.duplicated = 1;
3908 dup_node = *p_new;
3909
3910 /* Go to the left node, or up and to the right. */
3911 if (node->left)
3912 {
3913 node = node->left;
3914 p_new = &dup_node->left;
3915 }
3916 else
3917 {
3918 const bin_tree_t *prev = NULL;
3919 while (node->right == prev || node->right == NULL)
3920 {
3921 prev = node;
3922 node = node->parent;
3923 dup_node = dup_node->parent;
3924 if (!node)
3925 return dup_root;
3926 }
3927 node = node->right;
3928 p_new = &dup_node->right;
3929 }
3930 }
3931 }