]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
[BZ #605, BZ #611]
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52 int top_clone_node, int root_node,
53 unsigned int constraint);
54 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
55 unsigned int constraint);
56 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57 unsigned int constraint);
58 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
60 int node, int root);
61 static void calc_inveclosure (re_dfa_t *dfa);
62 static int fetch_number (re_string_t *input, re_token_t *token,
63 reg_syntax_t syntax);
64 static void fetch_token (re_token_t *result, re_string_t *input,
65 reg_syntax_t syntax);
66 static int peek_token (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68 static int peek_token_bracket (re_token_t *token, re_string_t *input,
69 reg_syntax_t syntax);
70 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
71 reg_syntax_t syntax, reg_errcode_t *err);
72 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
82 re_token_t *token, reg_syntax_t syntax,
83 int nest, reg_errcode_t *err);
84 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
85 re_dfa_t *dfa, re_token_t *token,
86 reg_syntax_t syntax, reg_errcode_t *err);
87 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
88 re_token_t *token, reg_syntax_t syntax,
89 reg_errcode_t *err);
90 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91 re_string_t *regexp,
92 re_token_t *token, int token_len,
93 re_dfa_t *dfa,
94 reg_syntax_t syntax,
95 int accept_hyphen);
96 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97 re_string_t *regexp,
98 re_token_t *token);
99 #ifndef _LIBC
100 # ifdef RE_ENABLE_I18N
101 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102 re_charset_t *mbcset, int *range_alloc,
103 bracket_elem_t *start_elem,
104 bracket_elem_t *end_elem);
105 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106 re_charset_t *mbcset,
107 int *coll_sym_alloc,
108 const unsigned char *name);
109 # else /* not RE_ENABLE_I18N */
110 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111 bracket_elem_t *start_elem,
112 bracket_elem_t *end_elem);
113 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114 const unsigned char *name);
115 # endif /* not RE_ENABLE_I18N */
116 #endif /* not _LIBC */
117 #ifdef RE_ENABLE_I18N
118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119 re_charset_t *mbcset,
120 int *equiv_class_alloc,
121 const unsigned char *name);
122 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123 re_bitset_ptr_t sbcset,
124 re_charset_t *mbcset,
125 int *char_class_alloc,
126 const unsigned char *class_name,
127 reg_syntax_t syntax);
128 #else /* not RE_ENABLE_I18N */
129 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130 const unsigned char *name);
131 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132 re_bitset_ptr_t sbcset,
133 const unsigned char *class_name,
134 reg_syntax_t syntax);
135 #endif /* not RE_ENABLE_I18N */
136 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
137 unsigned RE_TRANSLATE_TYPE trans,
138 const unsigned char *class_name,
139 const unsigned char *extra,
140 int non_match, reg_errcode_t *err);
141 static bin_tree_t *create_tree (re_dfa_t *dfa,
142 bin_tree_t *left, bin_tree_t *right,
143 re_token_type_t type);
144 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
145 bin_tree_t *left, bin_tree_t *right,
146 const re_token_t *token);
147 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
148 static void free_token (re_token_t *node);
149 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
150 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
151 \f
152 /* This table gives an error message for each of the error codes listed
153 in regex.h. Obviously the order here has to be same as there.
154 POSIX doesn't require that we do anything for REG_NOERROR,
155 but why not be nice? */
156
157 const char __re_error_msgid[] attribute_hidden =
158 {
159 #define REG_NOERROR_IDX 0
160 gettext_noop ("Success") /* REG_NOERROR */
161 "\0"
162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163 gettext_noop ("No match") /* REG_NOMATCH */
164 "\0"
165 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
166 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
167 "\0"
168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
170 "\0"
171 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
173 "\0"
174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
175 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
176 "\0"
177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
178 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
179 "\0"
180 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
181 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
182 "\0"
183 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
185 "\0"
186 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
188 "\0"
189 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
190 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
191 "\0"
192 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193 gettext_noop ("Invalid range end") /* REG_ERANGE */
194 "\0"
195 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
196 gettext_noop ("Memory exhausted") /* REG_ESPACE */
197 "\0"
198 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
199 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
200 "\0"
201 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202 gettext_noop ("Premature end of regular expression") /* REG_EEND */
203 "\0"
204 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
205 gettext_noop ("Regular expression too big") /* REG_ESIZE */
206 "\0"
207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
208 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 };
210
211 const size_t __re_error_msgid_idx[] attribute_hidden =
212 {
213 REG_NOERROR_IDX,
214 REG_NOMATCH_IDX,
215 REG_BADPAT_IDX,
216 REG_ECOLLATE_IDX,
217 REG_ECTYPE_IDX,
218 REG_EESCAPE_IDX,
219 REG_ESUBREG_IDX,
220 REG_EBRACK_IDX,
221 REG_EPAREN_IDX,
222 REG_EBRACE_IDX,
223 REG_BADBR_IDX,
224 REG_ERANGE_IDX,
225 REG_ESPACE_IDX,
226 REG_BADRPT_IDX,
227 REG_EEND_IDX,
228 REG_ESIZE_IDX,
229 REG_ERPAREN_IDX
230 };
231 \f
232 /* Entry points for GNU code. */
233
234 /* re_compile_pattern is the GNU regular expression compiler: it
235 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236 Returns 0 if the pattern was valid, otherwise an error string.
237
238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239 are set in BUFP on entry. */
240
241 const char *
242 re_compile_pattern (pattern, length, bufp)
243 const char *pattern;
244 size_t length;
245 struct re_pattern_buffer *bufp;
246 {
247 reg_errcode_t ret;
248
249 /* And GNU code determines whether or not to get register information
250 by passing null for the REGS argument to re_match, etc., not by
251 setting no_sub, unless RE_NO_SUB is set. */
252 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
253
254 /* Match anchors at newline. */
255 bufp->newline_anchor = 1;
256
257 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
258
259 if (!ret)
260 return NULL;
261 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
262 }
263 #ifdef _LIBC
264 weak_alias (__re_compile_pattern, re_compile_pattern)
265 #endif
266
267 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
268 also be assigned to arbitrarily: each pattern buffer stores its own
269 syntax, so it can be changed between regex compilations. */
270 /* This has no initializer because initialized variables in Emacs
271 become read-only after dumping. */
272 reg_syntax_t re_syntax_options;
273
274
275 /* Specify the precise syntax of regexps for compilation. This provides
276 for compatibility for various utilities which historically have
277 different, incompatible syntaxes.
278
279 The argument SYNTAX is a bit mask comprised of the various bits
280 defined in regex.h. We return the old syntax. */
281
282 reg_syntax_t
283 re_set_syntax (syntax)
284 reg_syntax_t syntax;
285 {
286 reg_syntax_t ret = re_syntax_options;
287
288 re_syntax_options = syntax;
289 return ret;
290 }
291 #ifdef _LIBC
292 weak_alias (__re_set_syntax, re_set_syntax)
293 #endif
294
295 int
296 re_compile_fastmap (bufp)
297 struct re_pattern_buffer *bufp;
298 {
299 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
300 char *fastmap = bufp->fastmap;
301
302 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
303 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
304 if (dfa->init_state != dfa->init_state_word)
305 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
306 if (dfa->init_state != dfa->init_state_nl)
307 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
308 if (dfa->init_state != dfa->init_state_begbuf)
309 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
310 bufp->fastmap_accurate = 1;
311 return 0;
312 }
313 #ifdef _LIBC
314 weak_alias (__re_compile_fastmap, re_compile_fastmap)
315 #endif
316
317 static inline void
318 __attribute ((always_inline))
319 re_set_fastmap (char *fastmap, int icase, int ch)
320 {
321 fastmap[ch] = 1;
322 if (icase)
323 fastmap[tolower (ch)] = 1;
324 }
325
326 /* Helper function for re_compile_fastmap.
327 Compile fastmap for the initial_state INIT_STATE. */
328
329 static void
330 re_compile_fastmap_iter (bufp, init_state, fastmap)
331 regex_t *bufp;
332 const re_dfastate_t *init_state;
333 char *fastmap;
334 {
335 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
336 int node_cnt;
337 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
338 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
339 {
340 int node = init_state->nodes.elems[node_cnt];
341 re_token_type_t type = dfa->nodes[node].type;
342
343 if (type == CHARACTER)
344 {
345 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
346 #ifdef RE_ENABLE_I18N
347 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
348 {
349 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
350 wchar_t wc;
351 mbstate_t state;
352
353 p = buf;
354 *p++ = dfa->nodes[node].opr.c;
355 while (++node < dfa->nodes_len
356 && dfa->nodes[node].type == CHARACTER
357 && dfa->nodes[node].mb_partial)
358 *p++ = dfa->nodes[node].opr.c;
359 memset (&state, 0, sizeof (state));
360 if (mbrtowc (&wc, (const char *) buf, p - buf,
361 &state) == p - buf
362 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
363 re_set_fastmap (fastmap, 0, buf[0]);
364 }
365 #endif
366 }
367 else if (type == SIMPLE_BRACKET)
368 {
369 int i, j, ch;
370 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
371 for (j = 0; j < UINT_BITS; ++j, ++ch)
372 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
373 re_set_fastmap (fastmap, icase, ch);
374 }
375 #ifdef RE_ENABLE_I18N
376 else if (type == COMPLEX_BRACKET)
377 {
378 int i;
379 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
380 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
381 || cset->nranges || cset->nchar_classes)
382 {
383 # ifdef _LIBC
384 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
385 {
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
392 int j, ch;
393 const int32_t *table = (const int32_t *)
394 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
395 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
396 for (j = 0; j < UINT_BITS; ++j, ++ch)
397 if (table[ch] < 0)
398 re_set_fastmap (fastmap, icase, ch);
399 }
400 # else
401 if (dfa->mb_cur_max > 1)
402 for (i = 0; i < SBC_MAX; ++i)
403 if (__btowc (i) == WEOF)
404 re_set_fastmap (fastmap, icase, i);
405 # endif /* not _LIBC */
406 }
407 for (i = 0; i < cset->nmbchars; ++i)
408 {
409 char buf[256];
410 mbstate_t state;
411 memset (&state, '\0', sizeof (state));
412 __wcrtomb (buf, cset->mbchars[i], &state);
413 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
414 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415 {
416 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
417 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
418 }
419 }
420 }
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
427 {
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
431 return;
432 }
433 }
434 }
435 \f
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
438
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
441
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
451
452 PATTERN is the address of the pattern string.
453
454 CFLAGS is a series of bits which affect compilation.
455
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
458
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
461
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
464
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
467 registers.
468
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
471
472 int
473 regcomp (preg, pattern, cflags)
474 regex_t *__restrict preg;
475 const char *__restrict pattern;
476 int cflags;
477 {
478 reg_errcode_t ret;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
481
482 preg->buffer = NULL;
483 preg->allocated = 0;
484 preg->used = 0;
485
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
489 return REG_ESPACE;
490
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
500 }
501 else
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
505
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
511 ret = REG_EPAREN;
512
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
518 else
519 {
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
523 }
524
525 return (int) ret;
526 }
527 #ifdef _LIBC
528 weak_alias (__regcomp, regcomp)
529 #endif
530
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
533
534 size_t
535 regerror (errcode, preg, errbuf, errbuf_size)
536 int errcode;
537 const regex_t *preg;
538 char *errbuf;
539 size_t errbuf_size;
540 {
541 const char *msg;
542 size_t msg_size;
543
544 if (BE (errcode < 0
545 || errcode >= (int) (sizeof (__re_error_msgid_idx)
546 / sizeof (__re_error_msgid_idx[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
551 abort ();
552
553 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
554
555 msg_size = strlen (msg) + 1; /* Includes the null. */
556
557 if (BE (errbuf_size != 0, 1))
558 {
559 if (BE (msg_size > errbuf_size, 0))
560 {
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
563 #else
564 memcpy (errbuf, msg, errbuf_size - 1);
565 errbuf[errbuf_size - 1] = 0;
566 #endif
567 }
568 else
569 memcpy (errbuf, msg, msg_size);
570 }
571
572 return msg_size;
573 }
574 #ifdef _LIBC
575 weak_alias (__regerror, regerror)
576 #endif
577
578
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset utf8_sb_map =
585 {
586 /* Set the first 128 bits. */
587 # if UINT_MAX == 0xffffffff
588 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
589 # else
590 # error "Add case for new unsigned int size"
591 # endif
592 };
593 #endif
594
595
596 static void
597 free_dfa_content (re_dfa_t *dfa)
598 {
599 int i, j;
600
601 if (dfa->nodes)
602 for (i = 0; i < dfa->nodes_len; ++i)
603 free_token (dfa->nodes + i);
604 re_free (dfa->nexts);
605 for (i = 0; i < dfa->nodes_len; ++i)
606 {
607 if (dfa->eclosures != NULL)
608 re_node_set_free (dfa->eclosures + i);
609 if (dfa->inveclosures != NULL)
610 re_node_set_free (dfa->inveclosures + i);
611 if (dfa->edests != NULL)
612 re_node_set_free (dfa->edests + i);
613 }
614 re_free (dfa->edests);
615 re_free (dfa->eclosures);
616 re_free (dfa->inveclosures);
617 re_free (dfa->nodes);
618
619 if (dfa->state_table)
620 for (i = 0; i <= dfa->state_hash_mask; ++i)
621 {
622 struct re_state_table_entry *entry = dfa->state_table + i;
623 for (j = 0; j < entry->num; ++j)
624 {
625 re_dfastate_t *state = entry->array[j];
626 free_state (state);
627 }
628 re_free (entry->array);
629 }
630 re_free (dfa->state_table);
631 #ifdef RE_ENABLE_I18N
632 if (dfa->sb_char != utf8_sb_map)
633 re_free (dfa->sb_char);
634 #endif
635 re_free (dfa->subexp_map);
636 #ifdef DEBUG
637 re_free (dfa->re_str);
638 #endif
639
640 re_free (dfa);
641 }
642
643
644 /* Free dynamically allocated space used by PREG. */
645
646 void
647 regfree (preg)
648 regex_t *preg;
649 {
650 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
651 if (BE (dfa != NULL, 1))
652 free_dfa_content (dfa);
653 preg->buffer = NULL;
654 preg->allocated = 0;
655
656 re_free (preg->fastmap);
657 preg->fastmap = NULL;
658
659 re_free (preg->translate);
660 preg->translate = NULL;
661 }
662 #ifdef _LIBC
663 weak_alias (__regfree, regfree)
664 #endif
665 \f
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
668
669 #if defined _REGEX_RE_COMP || defined _LIBC
670
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
673
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679 weak_function
680 # endif
681 re_comp (s)
682 const char *s;
683 {
684 reg_errcode_t ret;
685 char *fastmap;
686
687 if (!s)
688 {
689 if (!re_comp_buf.buffer)
690 return gettext ("No previous regular expression");
691 return 0;
692 }
693
694 if (re_comp_buf.buffer)
695 {
696 fastmap = re_comp_buf.fastmap;
697 re_comp_buf.fastmap = NULL;
698 __regfree (&re_comp_buf);
699 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
700 re_comp_buf.fastmap = fastmap;
701 }
702
703 if (re_comp_buf.fastmap == NULL)
704 {
705 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
706 if (re_comp_buf.fastmap == NULL)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx[(int) REG_ESPACE]);
709 }
710
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
713
714 /* Match anchors at newlines. */
715 re_comp_buf.newline_anchor = 1;
716
717 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
718
719 if (!ret)
720 return NULL;
721
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
724 }
725
726 #ifdef _LIBC
727 libc_freeres_fn (free_mem)
728 {
729 __regfree (&re_comp_buf);
730 }
731 #endif
732
733 #endif /* _REGEX_RE_COMP */
734 \f
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
738
739 static reg_errcode_t
740 re_compile_internal (preg, pattern, length, syntax)
741 regex_t *preg;
742 const char * pattern;
743 int length;
744 reg_syntax_t syntax;
745 {
746 reg_errcode_t err = REG_NOERROR;
747 re_dfa_t *dfa;
748 re_string_t regexp;
749
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
754 preg->used = 0;
755 preg->re_nsub = 0;
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
758
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
762 {
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 if (dfa == NULL)
769 return REG_ESPACE;
770 preg->allocated = sizeof (re_dfa_t);
771 preg->buffer = (unsigned char *) dfa;
772 }
773 preg->used = sizeof (re_dfa_t);
774
775 err = init_dfa (dfa, length);
776 if (BE (err != REG_NOERROR, 0))
777 {
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
782 }
783 #ifdef DEBUG
784 dfa->re_str = re_malloc (char, length + 1);
785 strncpy (dfa->re_str, pattern, length + 1);
786 #endif
787
788 err = re_string_construct (&regexp, pattern, length, preg->translate,
789 syntax & RE_ICASE, dfa);
790 if (BE (err != REG_NOERROR, 0))
791 {
792 re_compile_internal_free_return:
793 free_workarea_compile (preg);
794 re_string_destruct (&regexp);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
799 }
800
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
806
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
811
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815 optimize_utf8 (dfa);
816 #endif
817
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
820
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (&regexp);
824
825 if (BE (err != REG_NOERROR, 0))
826 {
827 free_dfa_content (dfa);
828 preg->buffer = NULL;
829 preg->allocated = 0;
830 }
831
832 return err;
833 }
834
835 /* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
837
838 static reg_errcode_t
839 init_dfa (dfa, pat_len)
840 re_dfa_t *dfa;
841 int pat_len;
842 {
843 int table_size;
844 #ifndef _LIBC
845 char *codeset_name;
846 #endif
847
848 memset (dfa, '\0', sizeof (re_dfa_t));
849
850 /* Force allocation of str_tree_storage the first time. */
851 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
852
853 dfa->nodes_alloc = pat_len + 1;
854 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
855
856 dfa->states_alloc = pat_len + 1;
857
858 /* table_size = 2 ^ ceil(log pat_len) */
859 for (table_size = 1; table_size > 0; table_size <<= 1)
860 if (table_size > pat_len)
861 break;
862
863 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
864 dfa->state_hash_mask = table_size - 1;
865
866 dfa->mb_cur_max = MB_CUR_MAX;
867 #ifdef _LIBC
868 if (dfa->mb_cur_max == 6
869 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
870 dfa->is_utf8 = 1;
871 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
872 != 0);
873 #else
874 # ifdef HAVE_LANGINFO_CODESET
875 codeset_name = nl_langinfo (CODESET);
876 # else
877 codeset_name = getenv ("LC_ALL");
878 if (codeset_name == NULL || codeset_name[0] == '\0')
879 codeset_name = getenv ("LC_CTYPE");
880 if (codeset_name == NULL || codeset_name[0] == '\0')
881 codeset_name = getenv ("LANG");
882 if (codeset_name == NULL)
883 codeset_name = "";
884 else if (strchr (codeset_name, '.') != NULL)
885 codeset_name = strchr (codeset_name, '.') + 1;
886 # endif
887
888 if (strcasecmp (codeset_name, "UTF-8") == 0
889 || strcasecmp (codeset_name, "UTF8") == 0)
890 dfa->is_utf8 = 1;
891
892 /* We check exhaustively in the loop below if this charset is a
893 superset of ASCII. */
894 dfa->map_notascii = 0;
895 #endif
896
897 #ifdef RE_ENABLE_I18N
898 if (dfa->mb_cur_max > 1)
899 {
900 if (dfa->is_utf8)
901 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
902 else
903 {
904 int i, j, ch;
905
906 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
907 if (BE (dfa->sb_char == NULL, 0))
908 return REG_ESPACE;
909
910 /* Clear all bits by, then set those corresponding to single
911 byte chars. */
912 bitset_empty (dfa->sb_char);
913
914 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
915 for (j = 0; j < UINT_BITS; ++j, ++ch)
916 {
917 wchar_t wch = __btowc (ch);
918 if (wch != WEOF)
919 dfa->sb_char[i] |= 1 << j;
920 # ifndef _LIBC
921 if (isascii (ch) && wch != (wchar_t) ch)
922 dfa->map_notascii = 1;
923 # endif
924 }
925 }
926 }
927 #endif
928
929 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
930 return REG_ESPACE;
931 return REG_NOERROR;
932 }
933
934 /* Initialize WORD_CHAR table, which indicate which character is
935 "word". In this case "word" means that it is the word construction
936 character used by some operators like "\<", "\>", etc. */
937
938 static void
939 init_word_char (dfa)
940 re_dfa_t *dfa;
941 {
942 int i, j, ch;
943 dfa->word_ops_used = 1;
944 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
945 for (j = 0; j < UINT_BITS; ++j, ++ch)
946 if (isalnum (ch) || ch == '_')
947 dfa->word_char[i] |= 1 << j;
948 }
949
950 /* Free the work area which are only used while compiling. */
951
952 static void
953 free_workarea_compile (preg)
954 regex_t *preg;
955 {
956 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
957 bin_tree_storage_t *storage, *next;
958 for (storage = dfa->str_tree_storage; storage; storage = next)
959 {
960 next = storage->next;
961 re_free (storage);
962 }
963 dfa->str_tree_storage = NULL;
964 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
965 dfa->str_tree = NULL;
966 re_free (dfa->org_indices);
967 dfa->org_indices = NULL;
968 }
969
970 /* Create initial states for all contexts. */
971
972 static reg_errcode_t
973 create_initial_state (dfa)
974 re_dfa_t *dfa;
975 {
976 int first, i;
977 reg_errcode_t err;
978 re_node_set init_nodes;
979
980 /* Initial states have the epsilon closure of the node which is
981 the first node of the regular expression. */
982 first = dfa->str_tree->first->node_idx;
983 dfa->init_node = first;
984 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
985 if (BE (err != REG_NOERROR, 0))
986 return err;
987
988 /* The back-references which are in initial states can epsilon transit,
989 since in this case all of the subexpressions can be null.
990 Then we add epsilon closures of the nodes which are the next nodes of
991 the back-references. */
992 if (dfa->nbackref > 0)
993 for (i = 0; i < init_nodes.nelem; ++i)
994 {
995 int node_idx = init_nodes.elems[i];
996 re_token_type_t type = dfa->nodes[node_idx].type;
997
998 int clexp_idx;
999 if (type != OP_BACK_REF)
1000 continue;
1001 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1002 {
1003 re_token_t *clexp_node;
1004 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1005 if (clexp_node->type == OP_CLOSE_SUBEXP
1006 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1007 break;
1008 }
1009 if (clexp_idx == init_nodes.nelem)
1010 continue;
1011
1012 if (type == OP_BACK_REF)
1013 {
1014 int dest_idx = dfa->edests[node_idx].elems[0];
1015 if (!re_node_set_contains (&init_nodes, dest_idx))
1016 {
1017 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018 i = 0;
1019 }
1020 }
1021 }
1022
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (BE (dfa->init_state == NULL, 0))
1027 return err;
1028 if (dfa->init_state->has_constraint)
1029 {
1030 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031 CONTEXT_WORD);
1032 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033 CONTEXT_NEWLINE);
1034 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035 &init_nodes,
1036 CONTEXT_NEWLINE
1037 | CONTEXT_BEGBUF);
1038 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1039 || dfa->init_state_begbuf == NULL, 0))
1040 return err;
1041 }
1042 else
1043 dfa->init_state_word = dfa->init_state_nl
1044 = dfa->init_state_begbuf = dfa->init_state;
1045
1046 re_node_set_free (&init_nodes);
1047 return REG_NOERROR;
1048 }
1049 \f
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053 DFA nodes where needed. */
1054
1055 static void
1056 optimize_utf8 (dfa)
1057 re_dfa_t *dfa;
1058 {
1059 int node, i, mb_chars = 0, has_period = 0;
1060
1061 for (node = 0; node < dfa->nodes_len; ++node)
1062 switch (dfa->nodes[node].type)
1063 {
1064 case CHARACTER:
1065 if (dfa->nodes[node].opr.c >= 0x80)
1066 mb_chars = 1;
1067 break;
1068 case ANCHOR:
1069 switch (dfa->nodes[node].opr.idx)
1070 {
1071 case LINE_FIRST:
1072 case LINE_LAST:
1073 case BUF_FIRST:
1074 case BUF_LAST:
1075 break;
1076 default:
1077 /* Word anchors etc. cannot be handled. */
1078 return;
1079 }
1080 break;
1081 case OP_PERIOD:
1082 has_period = 1;
1083 break;
1084 case OP_BACK_REF:
1085 case OP_ALT:
1086 case END_OF_RE:
1087 case OP_DUP_ASTERISK:
1088 case OP_OPEN_SUBEXP:
1089 case OP_CLOSE_SUBEXP:
1090 break;
1091 case COMPLEX_BRACKET:
1092 return;
1093 case SIMPLE_BRACKET:
1094 /* Just double check. */
1095 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1096 if (dfa->nodes[node].opr.sbcset[i])
1097 return;
1098 break;
1099 default:
1100 abort ();
1101 }
1102
1103 if (mb_chars || has_period)
1104 for (node = 0; node < dfa->nodes_len; ++node)
1105 {
1106 if (dfa->nodes[node].type == CHARACTER
1107 && dfa->nodes[node].opr.c >= 0x80)
1108 dfa->nodes[node].mb_partial = 0;
1109 else if (dfa->nodes[node].type == OP_PERIOD)
1110 dfa->nodes[node].type = OP_UTF8_PERIOD;
1111 }
1112
1113 /* The search can be in single byte locale. */
1114 dfa->mb_cur_max = 1;
1115 dfa->is_utf8 = 0;
1116 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1117 }
1118 #endif
1119 \f
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121 "eclosure", and "inveclosure". */
1122
1123 static reg_errcode_t
1124 analyze (preg)
1125 regex_t *preg;
1126 {
1127 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1128 reg_errcode_t ret;
1129
1130 /* Allocate arrays. */
1131 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1132 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1133 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1136 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1137 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1138 return REG_ESPACE;
1139
1140 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1141 if (dfa->subexp_map != NULL)
1142 {
1143 int i;
1144 for (i = 0; i < preg->re_nsub; i++)
1145 dfa->subexp_map[i] = i;
1146 preorder (dfa->str_tree, optimize_subexps, dfa);
1147 for (i = 0; i < preg->re_nsub; i++)
1148 if (dfa->subexp_map[i] != i)
1149 break;
1150 if (i == preg->re_nsub)
1151 {
1152 free (dfa->subexp_map);
1153 dfa->subexp_map = NULL;
1154 }
1155 }
1156
1157 ret = postorder (dfa->str_tree, lower_subexps, preg);
1158 if (BE (ret != REG_NOERROR, 0))
1159 return ret;
1160 ret = postorder (dfa->str_tree, calc_first, dfa);
1161 if (BE (ret != REG_NOERROR, 0))
1162 return ret;
1163 preorder (dfa->str_tree, calc_next, dfa);
1164 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1165 if (BE (ret != REG_NOERROR, 0))
1166 return ret;
1167 ret = calc_eclosure (dfa);
1168 if (BE (ret != REG_NOERROR, 0))
1169 return ret;
1170 calc_inveclosure (dfa);
1171 return ret;
1172 }
1173
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175 implement parse tree visits. Instead, we use parent pointers and
1176 some hairy code in these two functions. */
1177 static reg_errcode_t
1178 postorder (root, fn, extra)
1179 bin_tree_t *root;
1180 reg_errcode_t (fn (void *, bin_tree_t *));
1181 void *extra;
1182 {
1183 bin_tree_t *node, *prev;
1184
1185 for (node = root; ; )
1186 {
1187 /* Descend down the tree, preferably to the left (or to the right
1188 if that's the only child). */
1189 while (node->left || node->right)
1190 if (node->left)
1191 node = node->left;
1192 else
1193 node = node->right;
1194
1195 do
1196 {
1197 reg_errcode_t err = fn (extra, node);
1198 if (BE (err != REG_NOERROR, 0))
1199 return err;
1200 if (node->parent == NULL)
1201 return REG_NOERROR;
1202 prev = node;
1203 node = node->parent;
1204 }
1205 /* Go up while we have a node that is reached from the right. */
1206 while (node->right == prev || node->right == NULL);
1207 node = node->right;
1208 }
1209 }
1210
1211 static reg_errcode_t
1212 preorder (root, fn, extra)
1213 bin_tree_t *root;
1214 reg_errcode_t (fn (void *, bin_tree_t *));
1215 void *extra;
1216 {
1217 bin_tree_t *node;
1218
1219 for (node = root; ; )
1220 {
1221 reg_errcode_t err = fn (extra, node);
1222 if (BE (err != REG_NOERROR, 0))
1223 return err;
1224
1225 /* Go to the left node, or up and to the right. */
1226 if (node->left)
1227 node = node->left;
1228 else
1229 {
1230 bin_tree_t *prev = NULL;
1231 while (node->right == prev || node->right == NULL)
1232 {
1233 prev = node;
1234 node = node->parent;
1235 if (!node)
1236 return REG_NOERROR;
1237 }
1238 node = node->right;
1239 }
1240 }
1241 }
1242
1243 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1244 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1245 backreferences as well. Requires a preorder visit. */
1246 static reg_errcode_t
1247 optimize_subexps (extra, node)
1248 void *extra;
1249 bin_tree_t *node;
1250 {
1251 re_dfa_t *dfa = (re_dfa_t *) extra;
1252
1253 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1254 {
1255 int idx = node->token.opr.idx;
1256 node->token.opr.idx = dfa->subexp_map[idx];
1257 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1258 }
1259
1260 else if (node->token.type == SUBEXP
1261 && node->left && node->left->token.type == SUBEXP)
1262 {
1263 int other_idx = node->left->token.opr.idx;
1264
1265 node->left = node->left->left;
1266 if (node->left)
1267 node->left->parent = node;
1268
1269 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1270 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1271 dfa->used_bkref_map &= ~(1 << other_idx);
1272 }
1273
1274 return REG_NOERROR;
1275 }
1276
1277 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1278 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1279 static reg_errcode_t
1280 lower_subexps (extra, node)
1281 void *extra;
1282 bin_tree_t *node;
1283 {
1284 regex_t *preg = (regex_t *) extra;
1285 reg_errcode_t err = REG_NOERROR;
1286
1287 if (node->left && node->left->token.type == SUBEXP)
1288 {
1289 node->left = lower_subexp (&err, preg, node->left);
1290 if (node->left)
1291 node->left->parent = node;
1292 }
1293 if (node->right && node->right->token.type == SUBEXP)
1294 {
1295 node->right = lower_subexp (&err, preg, node->right);
1296 if (node->right)
1297 node->right->parent = node;
1298 }
1299
1300 return err;
1301 }
1302
1303 static bin_tree_t *
1304 lower_subexp (err, preg, node)
1305 reg_errcode_t *err;
1306 regex_t *preg;
1307 bin_tree_t *node;
1308 {
1309 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1310 bin_tree_t *body = node->left;
1311 bin_tree_t *op, *cls, *tree1, *tree;
1312
1313 if (preg->no_sub
1314 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1315 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1316 return node->left;
1317
1318 /* Convert the SUBEXP node to the concatenation of an
1319 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1320 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1321 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1322 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1323 tree = create_tree (dfa, op, tree1, CONCAT);
1324 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1325 {
1326 *err = REG_ESPACE;
1327 return NULL;
1328 }
1329
1330 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1331 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1332 return tree;
1333 }
1334
1335 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1336 nodes. Requires a postorder visit. */
1337 static reg_errcode_t
1338 calc_first (extra, node)
1339 void *extra;
1340 bin_tree_t *node;
1341 {
1342 re_dfa_t *dfa = (re_dfa_t *) extra;
1343 if (node->token.type == CONCAT)
1344 {
1345 node->first = node->left->first;
1346 node->node_idx = node->left->node_idx;
1347 }
1348 else
1349 {
1350 node->first = node;
1351 node->node_idx = re_dfa_add_node (dfa, node->token);
1352 if (BE (node->node_idx == -1, 0))
1353 return REG_ESPACE;
1354 }
1355 return REG_NOERROR;
1356 }
1357
1358 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1359 static reg_errcode_t
1360 calc_next (extra, node)
1361 void *extra;
1362 bin_tree_t *node;
1363 {
1364 switch (node->token.type)
1365 {
1366 case OP_DUP_ASTERISK:
1367 node->left->next = node;
1368 break;
1369 case CONCAT:
1370 node->left->next = node->right->first;
1371 node->right->next = node->next;
1372 break;
1373 default:
1374 if (node->left)
1375 node->left->next = node->next;
1376 if (node->right)
1377 node->right->next = node->next;
1378 break;
1379 }
1380 return REG_NOERROR;
1381 }
1382
1383 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1384 static reg_errcode_t
1385 link_nfa_nodes (extra, node)
1386 void *extra;
1387 bin_tree_t *node;
1388 {
1389 re_dfa_t *dfa = (re_dfa_t *) extra;
1390 int idx = node->node_idx;
1391 reg_errcode_t err = REG_NOERROR;
1392
1393 switch (node->token.type)
1394 {
1395 case CONCAT:
1396 break;
1397
1398 case END_OF_RE:
1399 assert (node->next == NULL);
1400 break;
1401
1402 case OP_DUP_ASTERISK:
1403 case OP_ALT:
1404 {
1405 int left, right;
1406 dfa->has_plural_match = 1;
1407 if (node->left != NULL)
1408 left = node->left->first->node_idx;
1409 else
1410 left = node->next->node_idx;
1411 if (node->right != NULL)
1412 right = node->right->first->node_idx;
1413 else
1414 right = node->next->node_idx;
1415 assert (left > -1);
1416 assert (right > -1);
1417 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1418 }
1419 break;
1420
1421 case ANCHOR:
1422 case OP_OPEN_SUBEXP:
1423 case OP_CLOSE_SUBEXP:
1424 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1425 break;
1426
1427 case OP_BACK_REF:
1428 dfa->nexts[idx] = node->next->node_idx;
1429 if (node->token.type == OP_BACK_REF)
1430 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1431 break;
1432
1433 default:
1434 assert (!IS_EPSILON_NODE (node->token.type));
1435 dfa->nexts[idx] = node->next->node_idx;
1436 break;
1437 }
1438
1439 return err;
1440 }
1441
1442 /* Duplicate the epsilon closure of the node ROOT_NODE.
1443 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1444 to their own constraint. */
1445
1446 static reg_errcode_t
1447 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1448 init_constraint)
1449 re_dfa_t *dfa;
1450 int top_org_node, top_clone_node, root_node;
1451 unsigned int init_constraint;
1452 {
1453 reg_errcode_t err;
1454 int org_node, clone_node, ret;
1455 unsigned int constraint = init_constraint;
1456 for (org_node = top_org_node, clone_node = top_clone_node;;)
1457 {
1458 int org_dest, clone_dest;
1459 if (dfa->nodes[org_node].type == OP_BACK_REF)
1460 {
1461 /* If the back reference epsilon-transit, its destination must
1462 also have the constraint. Then duplicate the epsilon closure
1463 of the destination of the back reference, and store it in
1464 edests of the back reference. */
1465 org_dest = dfa->nexts[org_node];
1466 re_node_set_empty (dfa->edests + clone_node);
1467 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1468 if (BE (err != REG_NOERROR, 0))
1469 return err;
1470 dfa->nexts[clone_node] = dfa->nexts[org_node];
1471 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1472 if (BE (ret < 0, 0))
1473 return REG_ESPACE;
1474 }
1475 else if (dfa->edests[org_node].nelem == 0)
1476 {
1477 /* In case of the node can't epsilon-transit, don't duplicate the
1478 destination and store the original destination as the
1479 destination of the node. */
1480 dfa->nexts[clone_node] = dfa->nexts[org_node];
1481 break;
1482 }
1483 else if (dfa->edests[org_node].nelem == 1)
1484 {
1485 /* In case of the node can epsilon-transit, and it has only one
1486 destination. */
1487 org_dest = dfa->edests[org_node].elems[0];
1488 re_node_set_empty (dfa->edests + clone_node);
1489 if (dfa->nodes[org_node].type == ANCHOR)
1490 {
1491 /* In case of the node has another constraint, append it. */
1492 if (org_node == root_node && clone_node != org_node)
1493 {
1494 /* ...but if the node is root_node itself, it means the
1495 epsilon closure have a loop, then tie it to the
1496 destination of the root_node. */
1497 ret = re_node_set_insert (dfa->edests + clone_node,
1498 org_dest);
1499 if (BE (ret < 0, 0))
1500 return REG_ESPACE;
1501 break;
1502 }
1503 constraint |= dfa->nodes[org_node].opr.ctx_type;
1504 }
1505 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1506 if (BE (err != REG_NOERROR, 0))
1507 return err;
1508 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509 if (BE (ret < 0, 0))
1510 return REG_ESPACE;
1511 }
1512 else /* dfa->edests[org_node].nelem == 2 */
1513 {
1514 /* In case of the node can epsilon-transit, and it has two
1515 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1516 org_dest = dfa->edests[org_node].elems[0];
1517 re_node_set_empty (dfa->edests + clone_node);
1518 /* Search for a duplicated node which satisfies the constraint. */
1519 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1520 if (clone_dest == -1)
1521 {
1522 /* There are no such a duplicated node, create a new one. */
1523 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1524 if (BE (err != REG_NOERROR, 0))
1525 return err;
1526 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527 if (BE (ret < 0, 0))
1528 return REG_ESPACE;
1529 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1530 root_node, constraint);
1531 if (BE (err != REG_NOERROR, 0))
1532 return err;
1533 }
1534 else
1535 {
1536 /* There are a duplicated node which satisfy the constraint,
1537 use it to avoid infinite loop. */
1538 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1539 if (BE (ret < 0, 0))
1540 return REG_ESPACE;
1541 }
1542
1543 org_dest = dfa->edests[org_node].elems[1];
1544 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1545 if (BE (err != REG_NOERROR, 0))
1546 return err;
1547 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (ret < 0, 0))
1549 return REG_ESPACE;
1550 }
1551 org_node = org_dest;
1552 clone_node = clone_dest;
1553 }
1554 return REG_NOERROR;
1555 }
1556
1557 /* Search for a node which is duplicated from the node ORG_NODE, and
1558 satisfies the constraint CONSTRAINT. */
1559
1560 static int
1561 search_duplicated_node (dfa, org_node, constraint)
1562 re_dfa_t *dfa;
1563 int org_node;
1564 unsigned int constraint;
1565 {
1566 int idx;
1567 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1568 {
1569 if (org_node == dfa->org_indices[idx]
1570 && constraint == dfa->nodes[idx].constraint)
1571 return idx; /* Found. */
1572 }
1573 return -1; /* Not found. */
1574 }
1575
1576 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1577 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1578 otherwise return the error code. */
1579
1580 static reg_errcode_t
1581 duplicate_node (new_idx, dfa, org_idx, constraint)
1582 re_dfa_t *dfa;
1583 int *new_idx, org_idx;
1584 unsigned int constraint;
1585 {
1586 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1587 if (BE (dup_idx == -1, 0))
1588 return REG_ESPACE;
1589 dfa->nodes[dup_idx].constraint = constraint;
1590 if (dfa->nodes[org_idx].type == ANCHOR)
1591 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1592 dfa->nodes[dup_idx].duplicated = 1;
1593
1594 /* Store the index of the original node. */
1595 dfa->org_indices[dup_idx] = org_idx;
1596 *new_idx = dup_idx;
1597 return REG_NOERROR;
1598 }
1599
1600 static void
1601 calc_inveclosure (dfa)
1602 re_dfa_t *dfa;
1603 {
1604 int src, idx, dest;
1605 for (src = 0; src < dfa->nodes_len; ++src)
1606 {
1607 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1608 {
1609 dest = dfa->eclosures[src].elems[idx];
1610 re_node_set_insert_last (dfa->inveclosures + dest, src);
1611 }
1612 }
1613 }
1614
1615 /* Calculate "eclosure" for all the node in DFA. */
1616
1617 static reg_errcode_t
1618 calc_eclosure (dfa)
1619 re_dfa_t *dfa;
1620 {
1621 int node_idx, incomplete;
1622 #ifdef DEBUG
1623 assert (dfa->nodes_len > 0);
1624 #endif
1625 incomplete = 0;
1626 /* For each nodes, calculate epsilon closure. */
1627 for (node_idx = 0; ; ++node_idx)
1628 {
1629 reg_errcode_t err;
1630 re_node_set eclosure_elem;
1631 if (node_idx == dfa->nodes_len)
1632 {
1633 if (!incomplete)
1634 break;
1635 incomplete = 0;
1636 node_idx = 0;
1637 }
1638
1639 #ifdef DEBUG
1640 assert (dfa->eclosures[node_idx].nelem != -1);
1641 #endif
1642
1643 /* If we have already calculated, skip it. */
1644 if (dfa->eclosures[node_idx].nelem != 0)
1645 continue;
1646 /* Calculate epsilon closure of `node_idx'. */
1647 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1648 if (BE (err != REG_NOERROR, 0))
1649 return err;
1650
1651 if (dfa->eclosures[node_idx].nelem == 0)
1652 {
1653 incomplete = 1;
1654 re_node_set_free (&eclosure_elem);
1655 }
1656 }
1657 return REG_NOERROR;
1658 }
1659
1660 /* Calculate epsilon closure of NODE. */
1661
1662 static reg_errcode_t
1663 calc_eclosure_iter (new_set, dfa, node, root)
1664 re_node_set *new_set;
1665 re_dfa_t *dfa;
1666 int node, root;
1667 {
1668 reg_errcode_t err;
1669 unsigned int constraint;
1670 int i, incomplete;
1671 re_node_set eclosure;
1672 incomplete = 0;
1673 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1674 if (BE (err != REG_NOERROR, 0))
1675 return err;
1676
1677 /* This indicates that we are calculating this node now.
1678 We reference this value to avoid infinite loop. */
1679 dfa->eclosures[node].nelem = -1;
1680
1681 constraint = ((dfa->nodes[node].type == ANCHOR)
1682 ? dfa->nodes[node].opr.ctx_type : 0);
1683 /* If the current node has constraints, duplicate all nodes.
1684 Since they must inherit the constraints. */
1685 if (constraint
1686 && dfa->edests[node].nelem
1687 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1688 {
1689 int org_node, cur_node;
1690 org_node = cur_node = node;
1691 err = duplicate_node_closure (dfa, node, node, node, constraint);
1692 if (BE (err != REG_NOERROR, 0))
1693 return err;
1694 }
1695
1696 /* Expand each epsilon destination nodes. */
1697 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1698 for (i = 0; i < dfa->edests[node].nelem; ++i)
1699 {
1700 re_node_set eclosure_elem;
1701 int edest = dfa->edests[node].elems[i];
1702 /* If calculating the epsilon closure of `edest' is in progress,
1703 return intermediate result. */
1704 if (dfa->eclosures[edest].nelem == -1)
1705 {
1706 incomplete = 1;
1707 continue;
1708 }
1709 /* If we haven't calculated the epsilon closure of `edest' yet,
1710 calculate now. Otherwise use calculated epsilon closure. */
1711 if (dfa->eclosures[edest].nelem == 0)
1712 {
1713 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1714 if (BE (err != REG_NOERROR, 0))
1715 return err;
1716 }
1717 else
1718 eclosure_elem = dfa->eclosures[edest];
1719 /* Merge the epsilon closure of `edest'. */
1720 re_node_set_merge (&eclosure, &eclosure_elem);
1721 /* If the epsilon closure of `edest' is incomplete,
1722 the epsilon closure of this node is also incomplete. */
1723 if (dfa->eclosures[edest].nelem == 0)
1724 {
1725 incomplete = 1;
1726 re_node_set_free (&eclosure_elem);
1727 }
1728 }
1729
1730 /* Epsilon closures include itself. */
1731 re_node_set_insert (&eclosure, node);
1732 if (incomplete && !root)
1733 dfa->eclosures[node].nelem = 0;
1734 else
1735 dfa->eclosures[node] = eclosure;
1736 *new_set = eclosure;
1737 return REG_NOERROR;
1738 }
1739 \f
1740 /* Functions for token which are used in the parser. */
1741
1742 /* Fetch a token from INPUT.
1743 We must not use this function inside bracket expressions. */
1744
1745 static void
1746 fetch_token (result, input, syntax)
1747 re_token_t *result;
1748 re_string_t *input;
1749 reg_syntax_t syntax;
1750 {
1751 re_string_skip_bytes (input, peek_token (result, input, syntax));
1752 }
1753
1754 /* Peek a token from INPUT, and return the length of the token.
1755 We must not use this function inside bracket expressions. */
1756
1757 static int
1758 peek_token (token, input, syntax)
1759 re_token_t *token;
1760 re_string_t *input;
1761 reg_syntax_t syntax;
1762 {
1763 unsigned char c;
1764
1765 if (re_string_eoi (input))
1766 {
1767 token->type = END_OF_RE;
1768 return 0;
1769 }
1770
1771 c = re_string_peek_byte (input, 0);
1772 token->opr.c = c;
1773
1774 token->word_char = 0;
1775 #ifdef RE_ENABLE_I18N
1776 token->mb_partial = 0;
1777 if (input->mb_cur_max > 1 &&
1778 !re_string_first_byte (input, re_string_cur_idx (input)))
1779 {
1780 token->type = CHARACTER;
1781 token->mb_partial = 1;
1782 return 1;
1783 }
1784 #endif
1785 if (c == '\\')
1786 {
1787 unsigned char c2;
1788 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1789 {
1790 token->type = BACK_SLASH;
1791 return 1;
1792 }
1793
1794 c2 = re_string_peek_byte_case (input, 1);
1795 token->opr.c = c2;
1796 token->type = CHARACTER;
1797 #ifdef RE_ENABLE_I18N
1798 if (input->mb_cur_max > 1)
1799 {
1800 wint_t wc = re_string_wchar_at (input,
1801 re_string_cur_idx (input) + 1);
1802 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1803 }
1804 else
1805 #endif
1806 token->word_char = IS_WORD_CHAR (c2) != 0;
1807
1808 switch (c2)
1809 {
1810 case '|':
1811 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1812 token->type = OP_ALT;
1813 break;
1814 case '1': case '2': case '3': case '4': case '5':
1815 case '6': case '7': case '8': case '9':
1816 if (!(syntax & RE_NO_BK_REFS))
1817 {
1818 token->type = OP_BACK_REF;
1819 token->opr.idx = c2 - '1';
1820 }
1821 break;
1822 case '<':
1823 if (!(syntax & RE_NO_GNU_OPS))
1824 {
1825 token->type = ANCHOR;
1826 token->opr.ctx_type = WORD_FIRST;
1827 }
1828 break;
1829 case '>':
1830 if (!(syntax & RE_NO_GNU_OPS))
1831 {
1832 token->type = ANCHOR;
1833 token->opr.ctx_type = WORD_LAST;
1834 }
1835 break;
1836 case 'b':
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 {
1839 token->type = ANCHOR;
1840 token->opr.ctx_type = WORD_DELIM;
1841 }
1842 break;
1843 case 'B':
1844 if (!(syntax & RE_NO_GNU_OPS))
1845 {
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = NOT_WORD_DELIM;
1848 }
1849 break;
1850 case 'w':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = OP_WORD;
1853 break;
1854 case 'W':
1855 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = OP_NOTWORD;
1857 break;
1858 case 's':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = OP_SPACE;
1861 break;
1862 case 'S':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_NOTSPACE;
1865 break;
1866 case '`':
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 {
1869 token->type = ANCHOR;
1870 token->opr.ctx_type = BUF_FIRST;
1871 }
1872 break;
1873 case '\'':
1874 if (!(syntax & RE_NO_GNU_OPS))
1875 {
1876 token->type = ANCHOR;
1877 token->opr.ctx_type = BUF_LAST;
1878 }
1879 break;
1880 case '(':
1881 if (!(syntax & RE_NO_BK_PARENS))
1882 token->type = OP_OPEN_SUBEXP;
1883 break;
1884 case ')':
1885 if (!(syntax & RE_NO_BK_PARENS))
1886 token->type = OP_CLOSE_SUBEXP;
1887 break;
1888 case '+':
1889 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1890 token->type = OP_DUP_PLUS;
1891 break;
1892 case '?':
1893 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894 token->type = OP_DUP_QUESTION;
1895 break;
1896 case '{':
1897 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1898 token->type = OP_OPEN_DUP_NUM;
1899 break;
1900 case '}':
1901 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902 token->type = OP_CLOSE_DUP_NUM;
1903 break;
1904 default:
1905 break;
1906 }
1907 return 2;
1908 }
1909
1910 token->type = CHARACTER;
1911 #ifdef RE_ENABLE_I18N
1912 if (input->mb_cur_max > 1)
1913 {
1914 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1915 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1916 }
1917 else
1918 #endif
1919 token->word_char = IS_WORD_CHAR (token->opr.c);
1920
1921 switch (c)
1922 {
1923 case '\n':
1924 if (syntax & RE_NEWLINE_ALT)
1925 token->type = OP_ALT;
1926 break;
1927 case '|':
1928 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1929 token->type = OP_ALT;
1930 break;
1931 case '*':
1932 token->type = OP_DUP_ASTERISK;
1933 break;
1934 case '+':
1935 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1936 token->type = OP_DUP_PLUS;
1937 break;
1938 case '?':
1939 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940 token->type = OP_DUP_QUESTION;
1941 break;
1942 case '{':
1943 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1944 token->type = OP_OPEN_DUP_NUM;
1945 break;
1946 case '}':
1947 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948 token->type = OP_CLOSE_DUP_NUM;
1949 break;
1950 case '(':
1951 if (syntax & RE_NO_BK_PARENS)
1952 token->type = OP_OPEN_SUBEXP;
1953 break;
1954 case ')':
1955 if (syntax & RE_NO_BK_PARENS)
1956 token->type = OP_CLOSE_SUBEXP;
1957 break;
1958 case '[':
1959 token->type = OP_OPEN_BRACKET;
1960 break;
1961 case '.':
1962 token->type = OP_PERIOD;
1963 break;
1964 case '^':
1965 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1966 re_string_cur_idx (input) != 0)
1967 {
1968 char prev = re_string_peek_byte (input, -1);
1969 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1970 break;
1971 }
1972 token->type = ANCHOR;
1973 token->opr.ctx_type = LINE_FIRST;
1974 break;
1975 case '$':
1976 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1977 re_string_cur_idx (input) + 1 != re_string_length (input))
1978 {
1979 re_token_t next;
1980 re_string_skip_bytes (input, 1);
1981 peek_token (&next, input, syntax);
1982 re_string_skip_bytes (input, -1);
1983 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1984 break;
1985 }
1986 token->type = ANCHOR;
1987 token->opr.ctx_type = LINE_LAST;
1988 break;
1989 default:
1990 break;
1991 }
1992 return 1;
1993 }
1994
1995 /* Peek a token from INPUT, and return the length of the token.
1996 We must not use this function out of bracket expressions. */
1997
1998 static int
1999 peek_token_bracket (token, input, syntax)
2000 re_token_t *token;
2001 re_string_t *input;
2002 reg_syntax_t syntax;
2003 {
2004 unsigned char c;
2005 if (re_string_eoi (input))
2006 {
2007 token->type = END_OF_RE;
2008 return 0;
2009 }
2010 c = re_string_peek_byte (input, 0);
2011 token->opr.c = c;
2012
2013 #ifdef RE_ENABLE_I18N
2014 if (input->mb_cur_max > 1 &&
2015 !re_string_first_byte (input, re_string_cur_idx (input)))
2016 {
2017 token->type = CHARACTER;
2018 return 1;
2019 }
2020 #endif /* RE_ENABLE_I18N */
2021
2022 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2023 && re_string_cur_idx (input) + 1 < re_string_length (input))
2024 {
2025 /* In this case, '\' escape a character. */
2026 unsigned char c2;
2027 re_string_skip_bytes (input, 1);
2028 c2 = re_string_peek_byte (input, 0);
2029 token->opr.c = c2;
2030 token->type = CHARACTER;
2031 return 1;
2032 }
2033 if (c == '[') /* '[' is a special char in a bracket exps. */
2034 {
2035 unsigned char c2;
2036 int token_len;
2037 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2038 c2 = re_string_peek_byte (input, 1);
2039 else
2040 c2 = 0;
2041 token->opr.c = c2;
2042 token_len = 2;
2043 switch (c2)
2044 {
2045 case '.':
2046 token->type = OP_OPEN_COLL_ELEM;
2047 break;
2048 case '=':
2049 token->type = OP_OPEN_EQUIV_CLASS;
2050 break;
2051 case ':':
2052 if (syntax & RE_CHAR_CLASSES)
2053 {
2054 token->type = OP_OPEN_CHAR_CLASS;
2055 break;
2056 }
2057 /* else fall through. */
2058 default:
2059 token->type = CHARACTER;
2060 token->opr.c = c;
2061 token_len = 1;
2062 break;
2063 }
2064 return token_len;
2065 }
2066 switch (c)
2067 {
2068 case '-':
2069 token->type = OP_CHARSET_RANGE;
2070 break;
2071 case ']':
2072 token->type = OP_CLOSE_BRACKET;
2073 break;
2074 case '^':
2075 token->type = OP_NON_MATCH_LIST;
2076 break;
2077 default:
2078 token->type = CHARACTER;
2079 }
2080 return 1;
2081 }
2082 \f
2083 /* Functions for parser. */
2084
2085 /* Entry point of the parser.
2086 Parse the regular expression REGEXP and return the structure tree.
2087 If an error is occured, ERR is set by error code, and return NULL.
2088 This function build the following tree, from regular expression <reg_exp>:
2089 CAT
2090 / \
2091 / \
2092 <reg_exp> EOR
2093
2094 CAT means concatenation.
2095 EOR means end of regular expression. */
2096
2097 static bin_tree_t *
2098 parse (regexp, preg, syntax, err)
2099 re_string_t *regexp;
2100 regex_t *preg;
2101 reg_syntax_t syntax;
2102 reg_errcode_t *err;
2103 {
2104 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2105 bin_tree_t *tree, *eor, *root;
2106 re_token_t current_token;
2107 dfa->syntax = syntax;
2108 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2109 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2110 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2111 return NULL;
2112 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2113 if (tree != NULL)
2114 root = create_tree (dfa, tree, eor, CONCAT);
2115 else
2116 root = eor;
2117 if (BE (eor == NULL || root == NULL, 0))
2118 {
2119 *err = REG_ESPACE;
2120 return NULL;
2121 }
2122 return root;
2123 }
2124
2125 /* This function build the following tree, from regular expression
2126 <branch1>|<branch2>:
2127 ALT
2128 / \
2129 / \
2130 <branch1> <branch2>
2131
2132 ALT means alternative, which represents the operator `|'. */
2133
2134 static bin_tree_t *
2135 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2136 re_string_t *regexp;
2137 regex_t *preg;
2138 re_token_t *token;
2139 reg_syntax_t syntax;
2140 int nest;
2141 reg_errcode_t *err;
2142 {
2143 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2144 bin_tree_t *tree, *branch = NULL;
2145 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2146 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147 return NULL;
2148
2149 while (token->type == OP_ALT)
2150 {
2151 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2152 if (token->type != OP_ALT && token->type != END_OF_RE
2153 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2154 {
2155 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2156 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2157 return NULL;
2158 }
2159 else
2160 branch = NULL;
2161 tree = create_tree (dfa, tree, branch, OP_ALT);
2162 if (BE (tree == NULL, 0))
2163 {
2164 *err = REG_ESPACE;
2165 return NULL;
2166 }
2167 }
2168 return tree;
2169 }
2170
2171 /* This function build the following tree, from regular expression
2172 <exp1><exp2>:
2173 CAT
2174 / \
2175 / \
2176 <exp1> <exp2>
2177
2178 CAT means concatenation. */
2179
2180 static bin_tree_t *
2181 parse_branch (regexp, preg, token, syntax, nest, err)
2182 re_string_t *regexp;
2183 regex_t *preg;
2184 re_token_t *token;
2185 reg_syntax_t syntax;
2186 int nest;
2187 reg_errcode_t *err;
2188 {
2189 bin_tree_t *tree, *exp;
2190 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2191 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2192 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 return NULL;
2194
2195 while (token->type != OP_ALT && token->type != END_OF_RE
2196 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2197 {
2198 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2199 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2200 {
2201 return NULL;
2202 }
2203 if (tree != NULL && exp != NULL)
2204 {
2205 tree = create_tree (dfa, tree, exp, CONCAT);
2206 if (tree == NULL)
2207 {
2208 *err = REG_ESPACE;
2209 return NULL;
2210 }
2211 }
2212 else if (tree == NULL)
2213 tree = exp;
2214 /* Otherwise exp == NULL, we don't need to create new tree. */
2215 }
2216 return tree;
2217 }
2218
2219 /* This function build the following tree, from regular expression a*:
2220 *
2221 |
2222 a
2223 */
2224
2225 static bin_tree_t *
2226 parse_expression (regexp, preg, token, syntax, nest, err)
2227 re_string_t *regexp;
2228 regex_t *preg;
2229 re_token_t *token;
2230 reg_syntax_t syntax;
2231 int nest;
2232 reg_errcode_t *err;
2233 {
2234 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2235 bin_tree_t *tree;
2236 switch (token->type)
2237 {
2238 case CHARACTER:
2239 tree = create_token_tree (dfa, NULL, NULL, token);
2240 if (BE (tree == NULL, 0))
2241 {
2242 *err = REG_ESPACE;
2243 return NULL;
2244 }
2245 #ifdef RE_ENABLE_I18N
2246 if (dfa->mb_cur_max > 1)
2247 {
2248 while (!re_string_eoi (regexp)
2249 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2250 {
2251 bin_tree_t *mbc_remain;
2252 fetch_token (token, regexp, syntax);
2253 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2254 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2255 if (BE (mbc_remain == NULL || tree == NULL, 0))
2256 {
2257 *err = REG_ESPACE;
2258 return NULL;
2259 }
2260 }
2261 }
2262 #endif
2263 break;
2264 case OP_OPEN_SUBEXP:
2265 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2266 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 return NULL;
2268 break;
2269 case OP_OPEN_BRACKET:
2270 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2271 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2272 return NULL;
2273 break;
2274 case OP_BACK_REF:
2275 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2276 {
2277 *err = REG_ESUBREG;
2278 return NULL;
2279 }
2280 dfa->used_bkref_map |= 1 << token->opr.idx;
2281 tree = create_token_tree (dfa, NULL, NULL, token);
2282 if (BE (tree == NULL, 0))
2283 {
2284 *err = REG_ESPACE;
2285 return NULL;
2286 }
2287 ++dfa->nbackref;
2288 dfa->has_mb_node = 1;
2289 break;
2290 case OP_OPEN_DUP_NUM:
2291 if (syntax & RE_CONTEXT_INVALID_DUP)
2292 {
2293 *err = REG_BADRPT;
2294 return NULL;
2295 }
2296 /* FALLTHROUGH */
2297 case OP_DUP_ASTERISK:
2298 case OP_DUP_PLUS:
2299 case OP_DUP_QUESTION:
2300 if (syntax & RE_CONTEXT_INVALID_OPS)
2301 {
2302 *err = REG_BADRPT;
2303 return NULL;
2304 }
2305 else if (syntax & RE_CONTEXT_INDEP_OPS)
2306 {
2307 fetch_token (token, regexp, syntax);
2308 return parse_expression (regexp, preg, token, syntax, nest, err);
2309 }
2310 /* else fall through */
2311 case OP_CLOSE_SUBEXP:
2312 if ((token->type == OP_CLOSE_SUBEXP) &&
2313 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2314 {
2315 *err = REG_ERPAREN;
2316 return NULL;
2317 }
2318 /* else fall through */
2319 case OP_CLOSE_DUP_NUM:
2320 /* We treat it as a normal character. */
2321
2322 /* Then we can these characters as normal characters. */
2323 token->type = CHARACTER;
2324 /* mb_partial and word_char bits should be initialized already
2325 by peek_token. */
2326 tree = create_token_tree (dfa, NULL, NULL, token);
2327 if (BE (tree == NULL, 0))
2328 {
2329 *err = REG_ESPACE;
2330 return NULL;
2331 }
2332 break;
2333 case ANCHOR:
2334 if ((token->opr.ctx_type
2335 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2336 && dfa->word_ops_used == 0)
2337 init_word_char (dfa);
2338 if (token->opr.ctx_type == WORD_DELIM
2339 || token->opr.ctx_type == NOT_WORD_DELIM)
2340 {
2341 bin_tree_t *tree_first, *tree_last;
2342 if (token->opr.ctx_type == WORD_DELIM)
2343 {
2344 token->opr.ctx_type = WORD_FIRST;
2345 tree_first = create_token_tree (dfa, NULL, NULL, token);
2346 token->opr.ctx_type = WORD_LAST;
2347 }
2348 else
2349 {
2350 token->opr.ctx_type = INSIDE_WORD;
2351 tree_first = create_token_tree (dfa, NULL, NULL, token);
2352 token->opr.ctx_type = INSIDE_NOTWORD;
2353 }
2354 tree_last = create_token_tree (dfa, NULL, NULL, token);
2355 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2356 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2357 {
2358 *err = REG_ESPACE;
2359 return NULL;
2360 }
2361 }
2362 else
2363 {
2364 tree = create_token_tree (dfa, NULL, NULL, token);
2365 if (BE (tree == NULL, 0))
2366 {
2367 *err = REG_ESPACE;
2368 return NULL;
2369 }
2370 }
2371 /* We must return here, since ANCHORs can't be followed
2372 by repetition operators.
2373 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2374 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2375 fetch_token (token, regexp, syntax);
2376 return tree;
2377 case OP_PERIOD:
2378 tree = create_token_tree (dfa, NULL, NULL, token);
2379 if (BE (tree == NULL, 0))
2380 {
2381 *err = REG_ESPACE;
2382 return NULL;
2383 }
2384 if (dfa->mb_cur_max > 1)
2385 dfa->has_mb_node = 1;
2386 break;
2387 case OP_WORD:
2388 case OP_NOTWORD:
2389 tree = build_charclass_op (dfa, regexp->trans,
2390 (const unsigned char *) "alnum",
2391 (const unsigned char *) "_",
2392 token->type == OP_NOTWORD, err);
2393 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394 return NULL;
2395 break;
2396 case OP_SPACE:
2397 case OP_NOTSPACE:
2398 tree = build_charclass_op (dfa, regexp->trans,
2399 (const unsigned char *) "space",
2400 (const unsigned char *) "",
2401 token->type == OP_NOTSPACE, err);
2402 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2403 return NULL;
2404 break;
2405 case OP_ALT:
2406 case END_OF_RE:
2407 return NULL;
2408 case BACK_SLASH:
2409 *err = REG_EESCAPE;
2410 return NULL;
2411 default:
2412 /* Must not happen? */
2413 #ifdef DEBUG
2414 assert (0);
2415 #endif
2416 return NULL;
2417 }
2418 fetch_token (token, regexp, syntax);
2419
2420 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2421 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2422 {
2423 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2424 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425 return NULL;
2426 /* In BRE consecutive duplications are not allowed. */
2427 if ((syntax & RE_CONTEXT_INVALID_DUP)
2428 && (token->type == OP_DUP_ASTERISK
2429 || token->type == OP_OPEN_DUP_NUM))
2430 {
2431 *err = REG_BADRPT;
2432 return NULL;
2433 }
2434 }
2435
2436 return tree;
2437 }
2438
2439 /* This function build the following tree, from regular expression
2440 (<reg_exp>):
2441 SUBEXP
2442 |
2443 <reg_exp>
2444 */
2445
2446 static bin_tree_t *
2447 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2448 re_string_t *regexp;
2449 regex_t *preg;
2450 re_token_t *token;
2451 reg_syntax_t syntax;
2452 int nest;
2453 reg_errcode_t *err;
2454 {
2455 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2456 bin_tree_t *tree;
2457 size_t cur_nsub;
2458 cur_nsub = preg->re_nsub++;
2459
2460 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2461
2462 /* The subexpression may be a null string. */
2463 if (token->type == OP_CLOSE_SUBEXP)
2464 tree = NULL;
2465 else
2466 {
2467 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2468 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2469 *err = REG_EPAREN;
2470 if (BE (*err != REG_NOERROR, 0))
2471 return NULL;
2472 }
2473 dfa->completed_bkref_map |= 1 << cur_nsub;
2474
2475 tree = create_tree (dfa, tree, NULL, SUBEXP);
2476 if (BE (tree == NULL, 0))
2477 {
2478 *err = REG_ESPACE;
2479 return NULL;
2480 }
2481 tree->token.opr.idx = cur_nsub;
2482 return tree;
2483 }
2484
2485 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2486
2487 static bin_tree_t *
2488 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2489 bin_tree_t *elem;
2490 re_string_t *regexp;
2491 re_dfa_t *dfa;
2492 re_token_t *token;
2493 reg_syntax_t syntax;
2494 reg_errcode_t *err;
2495 {
2496 bin_tree_t *tree = NULL, *old_tree = NULL;
2497 int i, start, end, start_idx = re_string_cur_idx (regexp);
2498 re_token_t start_token = *token;
2499
2500 if (token->type == OP_OPEN_DUP_NUM)
2501 {
2502 end = 0;
2503 start = fetch_number (regexp, token, syntax);
2504 if (start == -1)
2505 {
2506 if (token->type == CHARACTER && token->opr.c == ',')
2507 start = 0; /* We treat "{,m}" as "{0,m}". */
2508 else
2509 {
2510 *err = REG_BADBR; /* <re>{} is invalid. */
2511 return NULL;
2512 }
2513 }
2514 if (BE (start != -2, 1))
2515 {
2516 /* We treat "{n}" as "{n,n}". */
2517 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2518 : ((token->type == CHARACTER && token->opr.c == ',')
2519 ? fetch_number (regexp, token, syntax) : -2));
2520 }
2521 if (BE (start == -2 || end == -2, 0))
2522 {
2523 /* Invalid sequence. */
2524 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2525 {
2526 if (token->type == END_OF_RE)
2527 *err = REG_EBRACE;
2528 else
2529 *err = REG_BADBR;
2530
2531 return NULL;
2532 }
2533
2534 /* If the syntax bit is set, rollback. */
2535 re_string_set_index (regexp, start_idx);
2536 *token = start_token;
2537 token->type = CHARACTER;
2538 /* mb_partial and word_char bits should be already initialized by
2539 peek_token. */
2540 return elem;
2541 }
2542
2543 if (BE (end != -1 && start > end, 0))
2544 {
2545 /* First number greater than second. */
2546 *err = REG_BADBR;
2547 return NULL;
2548 }
2549 }
2550 else
2551 {
2552 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2553 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2554 }
2555
2556 fetch_token (token, regexp, syntax);
2557
2558 if (BE (elem == NULL, 0))
2559 return NULL;
2560 if (BE (start == 0 && end == 0, 0))
2561 {
2562 postorder (elem, free_tree, NULL);
2563 return NULL;
2564 }
2565
2566 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2567 if (BE (start > 0, 0))
2568 {
2569 tree = elem;
2570 for (i = 2; i <= start; ++i)
2571 {
2572 elem = duplicate_tree (elem, dfa);
2573 tree = create_tree (dfa, tree, elem, CONCAT);
2574 if (BE (elem == NULL || tree == NULL, 0))
2575 goto parse_dup_op_espace;
2576 }
2577
2578 if (start == end)
2579 return tree;
2580
2581 /* Duplicate ELEM before it is marked optional. */
2582 elem = duplicate_tree (elem, dfa);
2583 old_tree = tree;
2584 }
2585 else
2586 old_tree = NULL;
2587
2588 if (elem->token.type == SUBEXP)
2589 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2590
2591 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2592 if (BE (tree == NULL, 0))
2593 goto parse_dup_op_espace;
2594
2595 /* This loop is actually executed only when end != -1,
2596 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2597 already created the start+1-th copy. */
2598 for (i = start + 2; i <= end; ++i)
2599 {
2600 elem = duplicate_tree (elem, dfa);
2601 tree = create_tree (dfa, tree, elem, CONCAT);
2602 if (BE (elem == NULL || tree == NULL, 0))
2603 goto parse_dup_op_espace;
2604
2605 tree = create_tree (dfa, tree, NULL, OP_ALT);
2606 if (BE (tree == NULL, 0))
2607 goto parse_dup_op_espace;
2608 }
2609
2610 if (old_tree)
2611 tree = create_tree (dfa, old_tree, tree, CONCAT);
2612
2613 return tree;
2614
2615 parse_dup_op_espace:
2616 *err = REG_ESPACE;
2617 return NULL;
2618 }
2619
2620 /* Size of the names for collating symbol/equivalence_class/character_class.
2621 I'm not sure, but maybe enough. */
2622 #define BRACKET_NAME_BUF_SIZE 32
2623
2624 #ifndef _LIBC
2625 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2626 Build the range expression which starts from START_ELEM, and ends
2627 at END_ELEM. The result are written to MBCSET and SBCSET.
2628 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2629 mbcset->range_ends, is a pointer argument sinse we may
2630 update it. */
2631
2632 static reg_errcode_t
2633 # ifdef RE_ENABLE_I18N
2634 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2635 re_charset_t *mbcset;
2636 int *range_alloc;
2637 # else /* not RE_ENABLE_I18N */
2638 build_range_exp (sbcset, start_elem, end_elem)
2639 # endif /* not RE_ENABLE_I18N */
2640 re_bitset_ptr_t sbcset;
2641 bracket_elem_t *start_elem, *end_elem;
2642 {
2643 unsigned int start_ch, end_ch;
2644 /* Equivalence Classes and Character Classes can't be a range start/end. */
2645 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2646 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2647 0))
2648 return REG_ERANGE;
2649
2650 /* We can handle no multi character collating elements without libc
2651 support. */
2652 if (BE ((start_elem->type == COLL_SYM
2653 && strlen ((char *) start_elem->opr.name) > 1)
2654 || (end_elem->type == COLL_SYM
2655 && strlen ((char *) end_elem->opr.name) > 1), 0))
2656 return REG_ECOLLATE;
2657
2658 # ifdef RE_ENABLE_I18N
2659 {
2660 wchar_t wc, start_wc, end_wc;
2661 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2662
2663 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2664 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665 : 0));
2666 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2667 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668 : 0));
2669 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2670 ? __btowc (start_ch) : start_elem->opr.wch);
2671 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2672 ? __btowc (end_ch) : end_elem->opr.wch);
2673 if (start_wc == WEOF || end_wc == WEOF)
2674 return REG_ECOLLATE;
2675 cmp_buf[0] = start_wc;
2676 cmp_buf[4] = end_wc;
2677 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2678 return REG_ERANGE;
2679
2680 /* Got valid collation sequence values, add them as a new entry.
2681 However, for !_LIBC we have no collation elements: if the
2682 character set is single byte, the single byte character set
2683 that we build below suffices. parse_bracket_exp passes
2684 no MBCSET if dfa->mb_cur_max == 1. */
2685 if (mbcset)
2686 {
2687 /* Check the space of the arrays. */
2688 if (BE (*range_alloc == mbcset->nranges, 0))
2689 {
2690 /* There is not enough space, need realloc. */
2691 wchar_t *new_array_start, *new_array_end;
2692 int new_nranges;
2693
2694 /* +1 in case of mbcset->nranges is 0. */
2695 new_nranges = 2 * mbcset->nranges + 1;
2696 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2697 are NULL if *range_alloc == 0. */
2698 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2699 new_nranges);
2700 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2701 new_nranges);
2702
2703 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2704 return REG_ESPACE;
2705
2706 mbcset->range_starts = new_array_start;
2707 mbcset->range_ends = new_array_end;
2708 *range_alloc = new_nranges;
2709 }
2710
2711 mbcset->range_starts[mbcset->nranges] = start_wc;
2712 mbcset->range_ends[mbcset->nranges++] = end_wc;
2713 }
2714
2715 /* Build the table for single byte characters. */
2716 for (wc = 0; wc < SBC_MAX; ++wc)
2717 {
2718 cmp_buf[2] = wc;
2719 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2720 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2721 bitset_set (sbcset, wc);
2722 }
2723 }
2724 # else /* not RE_ENABLE_I18N */
2725 {
2726 unsigned int ch;
2727 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2728 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2729 : 0));
2730 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2731 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2732 : 0));
2733 if (start_ch > end_ch)
2734 return REG_ERANGE;
2735 /* Build the table for single byte characters. */
2736 for (ch = 0; ch < SBC_MAX; ++ch)
2737 if (start_ch <= ch && ch <= end_ch)
2738 bitset_set (sbcset, ch);
2739 }
2740 # endif /* not RE_ENABLE_I18N */
2741 return REG_NOERROR;
2742 }
2743 #endif /* not _LIBC */
2744
2745 #ifndef _LIBC
2746 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2747 Build the collating element which is represented by NAME.
2748 The result are written to MBCSET and SBCSET.
2749 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2750 pointer argument since we may update it. */
2751
2752 static reg_errcode_t
2753 # ifdef RE_ENABLE_I18N
2754 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2755 re_charset_t *mbcset;
2756 int *coll_sym_alloc;
2757 # else /* not RE_ENABLE_I18N */
2758 build_collating_symbol (sbcset, name)
2759 # endif /* not RE_ENABLE_I18N */
2760 re_bitset_ptr_t sbcset;
2761 const unsigned char *name;
2762 {
2763 size_t name_len = strlen ((const char *) name);
2764 if (BE (name_len != 1, 0))
2765 return REG_ECOLLATE;
2766 else
2767 {
2768 bitset_set (sbcset, name[0]);
2769 return REG_NOERROR;
2770 }
2771 }
2772 #endif /* not _LIBC */
2773
2774 /* This function parse bracket expression like "[abc]", "[a-c]",
2775 "[[.a-a.]]" etc. */
2776
2777 static bin_tree_t *
2778 parse_bracket_exp (regexp, dfa, token, syntax, err)
2779 re_string_t *regexp;
2780 re_dfa_t *dfa;
2781 re_token_t *token;
2782 reg_syntax_t syntax;
2783 reg_errcode_t *err;
2784 {
2785 #ifdef _LIBC
2786 const unsigned char *collseqmb;
2787 const char *collseqwc;
2788 uint32_t nrules;
2789 int32_t table_size;
2790 const int32_t *symb_table;
2791 const unsigned char *extra;
2792
2793 /* Local function for parse_bracket_exp used in _LIBC environement.
2794 Seek the collating symbol entry correspondings to NAME.
2795 Return the index of the symbol in the SYMB_TABLE. */
2796
2797 auto inline int32_t
2798 __attribute ((always_inline))
2799 seek_collating_symbol_entry (name, name_len)
2800 const unsigned char *name;
2801 size_t name_len;
2802 {
2803 int32_t hash = elem_hash ((const char *) name, name_len);
2804 int32_t elem = hash % table_size;
2805 int32_t second = hash % (table_size - 2);
2806 while (symb_table[2 * elem] != 0)
2807 {
2808 /* First compare the hashing value. */
2809 if (symb_table[2 * elem] == hash
2810 /* Compare the length of the name. */
2811 && name_len == extra[symb_table[2 * elem + 1]]
2812 /* Compare the name. */
2813 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2814 name_len) == 0)
2815 {
2816 /* Yep, this is the entry. */
2817 break;
2818 }
2819
2820 /* Next entry. */
2821 elem += second;
2822 }
2823 return elem;
2824 }
2825
2826 /* Local function for parse_bracket_exp used in _LIBC environement.
2827 Look up the collation sequence value of BR_ELEM.
2828 Return the value if succeeded, UINT_MAX otherwise. */
2829
2830 auto inline unsigned int
2831 __attribute ((always_inline))
2832 lookup_collation_sequence_value (br_elem)
2833 bracket_elem_t *br_elem;
2834 {
2835 if (br_elem->type == SB_CHAR)
2836 {
2837 /*
2838 if (MB_CUR_MAX == 1)
2839 */
2840 if (nrules == 0)
2841 return collseqmb[br_elem->opr.ch];
2842 else
2843 {
2844 wint_t wc = __btowc (br_elem->opr.ch);
2845 return __collseq_table_lookup (collseqwc, wc);
2846 }
2847 }
2848 else if (br_elem->type == MB_CHAR)
2849 {
2850 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2851 }
2852 else if (br_elem->type == COLL_SYM)
2853 {
2854 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2855 if (nrules != 0)
2856 {
2857 int32_t elem, idx;
2858 elem = seek_collating_symbol_entry (br_elem->opr.name,
2859 sym_name_len);
2860 if (symb_table[2 * elem] != 0)
2861 {
2862 /* We found the entry. */
2863 idx = symb_table[2 * elem + 1];
2864 /* Skip the name of collating element name. */
2865 idx += 1 + extra[idx];
2866 /* Skip the byte sequence of the collating element. */
2867 idx += 1 + extra[idx];
2868 /* Adjust for the alignment. */
2869 idx = (idx + 3) & ~3;
2870 /* Skip the multibyte collation sequence value. */
2871 idx += sizeof (unsigned int);
2872 /* Skip the wide char sequence of the collating element. */
2873 idx += sizeof (unsigned int) *
2874 (1 + *(unsigned int *) (extra + idx));
2875 /* Return the collation sequence value. */
2876 return *(unsigned int *) (extra + idx);
2877 }
2878 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2879 {
2880 /* No valid character. Match it as a single byte
2881 character. */
2882 return collseqmb[br_elem->opr.name[0]];
2883 }
2884 }
2885 else if (sym_name_len == 1)
2886 return collseqmb[br_elem->opr.name[0]];
2887 }
2888 return UINT_MAX;
2889 }
2890
2891 /* Local function for parse_bracket_exp used in _LIBC environement.
2892 Build the range expression which starts from START_ELEM, and ends
2893 at END_ELEM. The result are written to MBCSET and SBCSET.
2894 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2895 mbcset->range_ends, is a pointer argument sinse we may
2896 update it. */
2897
2898 auto inline reg_errcode_t
2899 __attribute ((always_inline))
2900 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2901 re_charset_t *mbcset;
2902 int *range_alloc;
2903 re_bitset_ptr_t sbcset;
2904 bracket_elem_t *start_elem, *end_elem;
2905 {
2906 unsigned int ch;
2907 uint32_t start_collseq;
2908 uint32_t end_collseq;
2909
2910 /* Equivalence Classes and Character Classes can't be a range
2911 start/end. */
2912 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2913 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2914 0))
2915 return REG_ERANGE;
2916
2917 start_collseq = lookup_collation_sequence_value (start_elem);
2918 end_collseq = lookup_collation_sequence_value (end_elem);
2919 /* Check start/end collation sequence values. */
2920 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2921 return REG_ECOLLATE;
2922 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2923 return REG_ERANGE;
2924
2925 /* Got valid collation sequence values, add them as a new entry.
2926 However, if we have no collation elements, and the character set
2927 is single byte, the single byte character set that we
2928 build below suffices. */
2929 if (nrules > 0 || dfa->mb_cur_max > 1)
2930 {
2931 /* Check the space of the arrays. */
2932 if (BE (*range_alloc == mbcset->nranges, 0))
2933 {
2934 /* There is not enough space, need realloc. */
2935 uint32_t *new_array_start;
2936 uint32_t *new_array_end;
2937 int new_nranges;
2938
2939 /* +1 in case of mbcset->nranges is 0. */
2940 new_nranges = 2 * mbcset->nranges + 1;
2941 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2942 new_nranges);
2943 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2944 new_nranges);
2945
2946 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2947 return REG_ESPACE;
2948
2949 mbcset->range_starts = new_array_start;
2950 mbcset->range_ends = new_array_end;
2951 *range_alloc = new_nranges;
2952 }
2953
2954 mbcset->range_starts[mbcset->nranges] = start_collseq;
2955 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2956 }
2957
2958 /* Build the table for single byte characters. */
2959 for (ch = 0; ch < SBC_MAX; ch++)
2960 {
2961 uint32_t ch_collseq;
2962 /*
2963 if (MB_CUR_MAX == 1)
2964 */
2965 if (nrules == 0)
2966 ch_collseq = collseqmb[ch];
2967 else
2968 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2969 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2970 bitset_set (sbcset, ch);
2971 }
2972 return REG_NOERROR;
2973 }
2974
2975 /* Local function for parse_bracket_exp used in _LIBC environement.
2976 Build the collating element which is represented by NAME.
2977 The result are written to MBCSET and SBCSET.
2978 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2979 pointer argument sinse we may update it. */
2980
2981 auto inline reg_errcode_t
2982 __attribute ((always_inline))
2983 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2984 re_charset_t *mbcset;
2985 int *coll_sym_alloc;
2986 re_bitset_ptr_t sbcset;
2987 const unsigned char *name;
2988 {
2989 int32_t elem, idx;
2990 size_t name_len = strlen ((const char *) name);
2991 if (nrules != 0)
2992 {
2993 elem = seek_collating_symbol_entry (name, name_len);
2994 if (symb_table[2 * elem] != 0)
2995 {
2996 /* We found the entry. */
2997 idx = symb_table[2 * elem + 1];
2998 /* Skip the name of collating element name. */
2999 idx += 1 + extra[idx];
3000 }
3001 else if (symb_table[2 * elem] == 0 && name_len == 1)
3002 {
3003 /* No valid character, treat it as a normal
3004 character. */
3005 bitset_set (sbcset, name[0]);
3006 return REG_NOERROR;
3007 }
3008 else
3009 return REG_ECOLLATE;
3010
3011 /* Got valid collation sequence, add it as a new entry. */
3012 /* Check the space of the arrays. */
3013 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3014 {
3015 /* Not enough, realloc it. */
3016 /* +1 in case of mbcset->ncoll_syms is 0. */
3017 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3018 /* Use realloc since mbcset->coll_syms is NULL
3019 if *alloc == 0. */
3020 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3021 new_coll_sym_alloc);
3022 if (BE (new_coll_syms == NULL, 0))
3023 return REG_ESPACE;
3024 mbcset->coll_syms = new_coll_syms;
3025 *coll_sym_alloc = new_coll_sym_alloc;
3026 }
3027 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3028 return REG_NOERROR;
3029 }
3030 else
3031 {
3032 if (BE (name_len != 1, 0))
3033 return REG_ECOLLATE;
3034 else
3035 {
3036 bitset_set (sbcset, name[0]);
3037 return REG_NOERROR;
3038 }
3039 }
3040 }
3041 #endif
3042
3043 re_token_t br_token;
3044 re_bitset_ptr_t sbcset;
3045 #ifdef RE_ENABLE_I18N
3046 re_charset_t *mbcset;
3047 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3048 int equiv_class_alloc = 0, char_class_alloc = 0;
3049 #endif /* not RE_ENABLE_I18N */
3050 int non_match = 0;
3051 bin_tree_t *work_tree;
3052 int token_len;
3053 int first_round = 1;
3054 #ifdef _LIBC
3055 collseqmb = (const unsigned char *)
3056 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3057 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3058 if (nrules)
3059 {
3060 /*
3061 if (MB_CUR_MAX > 1)
3062 */
3063 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3064 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3065 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3066 _NL_COLLATE_SYMB_TABLEMB);
3067 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3068 _NL_COLLATE_SYMB_EXTRAMB);
3069 }
3070 #endif
3071 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3072 #ifdef RE_ENABLE_I18N
3073 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3074 #endif /* RE_ENABLE_I18N */
3075 #ifdef RE_ENABLE_I18N
3076 if (BE (sbcset == NULL || mbcset == NULL, 0))
3077 #else
3078 if (BE (sbcset == NULL, 0))
3079 #endif /* RE_ENABLE_I18N */
3080 {
3081 *err = REG_ESPACE;
3082 return NULL;
3083 }
3084
3085 token_len = peek_token_bracket (token, regexp, syntax);
3086 if (BE (token->type == END_OF_RE, 0))
3087 {
3088 *err = REG_BADPAT;
3089 goto parse_bracket_exp_free_return;
3090 }
3091 if (token->type == OP_NON_MATCH_LIST)
3092 {
3093 #ifdef RE_ENABLE_I18N
3094 mbcset->non_match = 1;
3095 #endif /* not RE_ENABLE_I18N */
3096 non_match = 1;
3097 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3098 bitset_set (sbcset, '\0');
3099 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3100 token_len = peek_token_bracket (token, regexp, syntax);
3101 if (BE (token->type == END_OF_RE, 0))
3102 {
3103 *err = REG_BADPAT;
3104 goto parse_bracket_exp_free_return;
3105 }
3106 }
3107
3108 /* We treat the first ']' as a normal character. */
3109 if (token->type == OP_CLOSE_BRACKET)
3110 token->type = CHARACTER;
3111
3112 while (1)
3113 {
3114 bracket_elem_t start_elem, end_elem;
3115 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3116 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3117 reg_errcode_t ret;
3118 int token_len2 = 0, is_range_exp = 0;
3119 re_token_t token2;
3120
3121 start_elem.opr.name = start_name_buf;
3122 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3123 syntax, first_round);
3124 if (BE (ret != REG_NOERROR, 0))
3125 {
3126 *err = ret;
3127 goto parse_bracket_exp_free_return;
3128 }
3129 first_round = 0;
3130
3131 /* Get information about the next token. We need it in any case. */
3132 token_len = peek_token_bracket (token, regexp, syntax);
3133
3134 /* Do not check for ranges if we know they are not allowed. */
3135 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3136 {
3137 if (BE (token->type == END_OF_RE, 0))
3138 {
3139 *err = REG_EBRACK;
3140 goto parse_bracket_exp_free_return;
3141 }
3142 if (token->type == OP_CHARSET_RANGE)
3143 {
3144 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3145 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3146 if (BE (token2.type == END_OF_RE, 0))
3147 {
3148 *err = REG_EBRACK;
3149 goto parse_bracket_exp_free_return;
3150 }
3151 if (token2.type == OP_CLOSE_BRACKET)
3152 {
3153 /* We treat the last '-' as a normal character. */
3154 re_string_skip_bytes (regexp, -token_len);
3155 token->type = CHARACTER;
3156 }
3157 else
3158 is_range_exp = 1;
3159 }
3160 }
3161
3162 if (is_range_exp == 1)
3163 {
3164 end_elem.opr.name = end_name_buf;
3165 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3166 dfa, syntax, 1);
3167 if (BE (ret != REG_NOERROR, 0))
3168 {
3169 *err = ret;
3170 goto parse_bracket_exp_free_return;
3171 }
3172
3173 token_len = peek_token_bracket (token, regexp, syntax);
3174
3175 #ifdef _LIBC
3176 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3177 &start_elem, &end_elem);
3178 #else
3179 # ifdef RE_ENABLE_I18N
3180 *err = build_range_exp (sbcset,
3181 dfa->mb_cur_max > 1 ? mbcset : NULL,
3182 &range_alloc, &start_elem, &end_elem);
3183 # else
3184 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3185 # endif
3186 #endif /* RE_ENABLE_I18N */
3187 if (BE (*err != REG_NOERROR, 0))
3188 goto parse_bracket_exp_free_return;
3189 }
3190 else
3191 {
3192 switch (start_elem.type)
3193 {
3194 case SB_CHAR:
3195 bitset_set (sbcset, start_elem.opr.ch);
3196 break;
3197 #ifdef RE_ENABLE_I18N
3198 case MB_CHAR:
3199 /* Check whether the array has enough space. */
3200 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3201 {
3202 wchar_t *new_mbchars;
3203 /* Not enough, realloc it. */
3204 /* +1 in case of mbcset->nmbchars is 0. */
3205 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3206 /* Use realloc since array is NULL if *alloc == 0. */
3207 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3208 mbchar_alloc);
3209 if (BE (new_mbchars == NULL, 0))
3210 goto parse_bracket_exp_espace;
3211 mbcset->mbchars = new_mbchars;
3212 }
3213 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3214 break;
3215 #endif /* RE_ENABLE_I18N */
3216 case EQUIV_CLASS:
3217 *err = build_equiv_class (sbcset,
3218 #ifdef RE_ENABLE_I18N
3219 mbcset, &equiv_class_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221 start_elem.opr.name);
3222 if (BE (*err != REG_NOERROR, 0))
3223 goto parse_bracket_exp_free_return;
3224 break;
3225 case COLL_SYM:
3226 *err = build_collating_symbol (sbcset,
3227 #ifdef RE_ENABLE_I18N
3228 mbcset, &coll_sym_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230 start_elem.opr.name);
3231 if (BE (*err != REG_NOERROR, 0))
3232 goto parse_bracket_exp_free_return;
3233 break;
3234 case CHAR_CLASS:
3235 *err = build_charclass (regexp->trans, sbcset,
3236 #ifdef RE_ENABLE_I18N
3237 mbcset, &char_class_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239 start_elem.opr.name, syntax);
3240 if (BE (*err != REG_NOERROR, 0))
3241 goto parse_bracket_exp_free_return;
3242 break;
3243 default:
3244 assert (0);
3245 break;
3246 }
3247 }
3248 if (BE (token->type == END_OF_RE, 0))
3249 {
3250 *err = REG_EBRACK;
3251 goto parse_bracket_exp_free_return;
3252 }
3253 if (token->type == OP_CLOSE_BRACKET)
3254 break;
3255 }
3256
3257 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3258
3259 /* If it is non-matching list. */
3260 if (non_match)
3261 bitset_not (sbcset);
3262
3263 #ifdef RE_ENABLE_I18N
3264 /* Ensure only single byte characters are set. */
3265 if (dfa->mb_cur_max > 1)
3266 bitset_mask (sbcset, dfa->sb_char);
3267
3268 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3269 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3270 || mbcset->non_match)))
3271 {
3272 bin_tree_t *mbc_tree;
3273 int sbc_idx;
3274 /* Build a tree for complex bracket. */
3275 dfa->has_mb_node = 1;
3276 br_token.type = COMPLEX_BRACKET;
3277 br_token.opr.mbcset = mbcset;
3278 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3279 if (BE (mbc_tree == NULL, 0))
3280 goto parse_bracket_exp_espace;
3281 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3282 if (sbcset[sbc_idx])
3283 break;
3284 /* If there are no bits set in sbcset, there is no point
3285 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3286 if (sbc_idx < BITSET_UINTS)
3287 {
3288 /* Build a tree for simple bracket. */
3289 br_token.type = SIMPLE_BRACKET;
3290 br_token.opr.sbcset = sbcset;
3291 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292 if (BE (work_tree == NULL, 0))
3293 goto parse_bracket_exp_espace;
3294
3295 /* Then join them by ALT node. */
3296 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3297 if (BE (work_tree == NULL, 0))
3298 goto parse_bracket_exp_espace;
3299 }
3300 else
3301 {
3302 re_free (sbcset);
3303 work_tree = mbc_tree;
3304 }
3305 }
3306 else
3307 {
3308 /* Build a tree for simple bracket. */
3309 br_token.type = SIMPLE_BRACKET;
3310 br_token.opr.sbcset = sbcset;
3311 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3312 if (BE (work_tree == NULL, 0))
3313 goto parse_bracket_exp_espace;
3314
3315 free_charset (mbcset);
3316 }
3317 #endif /* not RE_ENABLE_I18N */
3318 return work_tree;
3319
3320 parse_bracket_exp_espace:
3321 *err = REG_ESPACE;
3322 parse_bracket_exp_free_return:
3323 re_free (sbcset);
3324 #ifdef RE_ENABLE_I18N
3325 free_charset (mbcset);
3326 #endif /* RE_ENABLE_I18N */
3327 return NULL;
3328 }
3329
3330 /* Parse an element in the bracket expression. */
3331
3332 static reg_errcode_t
3333 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3334 accept_hyphen)
3335 bracket_elem_t *elem;
3336 re_string_t *regexp;
3337 re_token_t *token;
3338 int token_len;
3339 re_dfa_t *dfa;
3340 reg_syntax_t syntax;
3341 int accept_hyphen;
3342 {
3343 #ifdef RE_ENABLE_I18N
3344 int cur_char_size;
3345 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3346 if (cur_char_size > 1)
3347 {
3348 elem->type = MB_CHAR;
3349 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3350 re_string_skip_bytes (regexp, cur_char_size);
3351 return REG_NOERROR;
3352 }
3353 #endif /* RE_ENABLE_I18N */
3354 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3355 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3356 || token->type == OP_OPEN_EQUIV_CLASS)
3357 return parse_bracket_symbol (elem, regexp, token);
3358 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3359 {
3360 /* A '-' must only appear as anything but a range indicator before
3361 the closing bracket. Everything else is an error. */
3362 re_token_t token2;
3363 (void) peek_token_bracket (&token2, regexp, syntax);
3364 if (token2.type != OP_CLOSE_BRACKET)
3365 /* The actual error value is not standardized since this whole
3366 case is undefined. But ERANGE makes good sense. */
3367 return REG_ERANGE;
3368 }
3369 elem->type = SB_CHAR;
3370 elem->opr.ch = token->opr.c;
3371 return REG_NOERROR;
3372 }
3373
3374 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3375 such as [:<character_class>:], [.<collating_element>.], and
3376 [=<equivalent_class>=]. */
3377
3378 static reg_errcode_t
3379 parse_bracket_symbol (elem, regexp, token)
3380 bracket_elem_t *elem;
3381 re_string_t *regexp;
3382 re_token_t *token;
3383 {
3384 unsigned char ch, delim = token->opr.c;
3385 int i = 0;
3386 if (re_string_eoi(regexp))
3387 return REG_EBRACK;
3388 for (;; ++i)
3389 {
3390 if (i >= BRACKET_NAME_BUF_SIZE)
3391 return REG_EBRACK;
3392 if (token->type == OP_OPEN_CHAR_CLASS)
3393 ch = re_string_fetch_byte_case (regexp);
3394 else
3395 ch = re_string_fetch_byte (regexp);
3396 if (re_string_eoi(regexp))
3397 return REG_EBRACK;
3398 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3399 break;
3400 elem->opr.name[i] = ch;
3401 }
3402 re_string_skip_bytes (regexp, 1);
3403 elem->opr.name[i] = '\0';
3404 switch (token->type)
3405 {
3406 case OP_OPEN_COLL_ELEM:
3407 elem->type = COLL_SYM;
3408 break;
3409 case OP_OPEN_EQUIV_CLASS:
3410 elem->type = EQUIV_CLASS;
3411 break;
3412 case OP_OPEN_CHAR_CLASS:
3413 elem->type = CHAR_CLASS;
3414 break;
3415 default:
3416 break;
3417 }
3418 return REG_NOERROR;
3419 }
3420
3421 /* Helper function for parse_bracket_exp.
3422 Build the equivalence class which is represented by NAME.
3423 The result are written to MBCSET and SBCSET.
3424 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3425 is a pointer argument sinse we may update it. */
3426
3427 static reg_errcode_t
3428 #ifdef RE_ENABLE_I18N
3429 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3430 re_charset_t *mbcset;
3431 int *equiv_class_alloc;
3432 #else /* not RE_ENABLE_I18N */
3433 build_equiv_class (sbcset, name)
3434 #endif /* not RE_ENABLE_I18N */
3435 re_bitset_ptr_t sbcset;
3436 const unsigned char *name;
3437 {
3438 #if defined _LIBC
3439 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3440 if (nrules != 0)
3441 {
3442 const int32_t *table, *indirect;
3443 const unsigned char *weights, *extra, *cp;
3444 unsigned char char_buf[2];
3445 int32_t idx1, idx2;
3446 unsigned int ch;
3447 size_t len;
3448 /* This #include defines a local function! */
3449 # include <locale/weight.h>
3450 /* Calculate the index for equivalence class. */
3451 cp = name;
3452 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3453 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3454 _NL_COLLATE_WEIGHTMB);
3455 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3456 _NL_COLLATE_EXTRAMB);
3457 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3458 _NL_COLLATE_INDIRECTMB);
3459 idx1 = findidx (&cp);
3460 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3461 /* This isn't a valid character. */
3462 return REG_ECOLLATE;
3463
3464 /* Build single byte matcing table for this equivalence class. */
3465 char_buf[1] = (unsigned char) '\0';
3466 len = weights[idx1];
3467 for (ch = 0; ch < SBC_MAX; ++ch)
3468 {
3469 char_buf[0] = ch;
3470 cp = char_buf;
3471 idx2 = findidx (&cp);
3472 /*
3473 idx2 = table[ch];
3474 */
3475 if (idx2 == 0)
3476 /* This isn't a valid character. */
3477 continue;
3478 if (len == weights[idx2])
3479 {
3480 int cnt = 0;
3481 while (cnt <= len &&
3482 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3483 ++cnt;
3484
3485 if (cnt > len)
3486 bitset_set (sbcset, ch);
3487 }
3488 }
3489 /* Check whether the array has enough space. */
3490 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3491 {
3492 /* Not enough, realloc it. */
3493 /* +1 in case of mbcset->nequiv_classes is 0. */
3494 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3495 /* Use realloc since the array is NULL if *alloc == 0. */
3496 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3497 int32_t,
3498 new_equiv_class_alloc);
3499 if (BE (new_equiv_classes == NULL, 0))
3500 return REG_ESPACE;
3501 mbcset->equiv_classes = new_equiv_classes;
3502 *equiv_class_alloc = new_equiv_class_alloc;
3503 }
3504 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3505 }
3506 else
3507 #endif /* _LIBC */
3508 {
3509 if (BE (strlen ((const char *) name) != 1, 0))
3510 return REG_ECOLLATE;
3511 bitset_set (sbcset, *name);
3512 }
3513 return REG_NOERROR;
3514 }
3515
3516 /* Helper function for parse_bracket_exp.
3517 Build the character class which is represented by NAME.
3518 The result are written to MBCSET and SBCSET.
3519 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3520 is a pointer argument sinse we may update it. */
3521
3522 static reg_errcode_t
3523 #ifdef RE_ENABLE_I18N
3524 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3525 re_charset_t *mbcset;
3526 int *char_class_alloc;
3527 #else /* not RE_ENABLE_I18N */
3528 build_charclass (trans, sbcset, class_name, syntax)
3529 #endif /* not RE_ENABLE_I18N */
3530 unsigned RE_TRANSLATE_TYPE trans;
3531 re_bitset_ptr_t sbcset;
3532 const unsigned char *class_name;
3533 reg_syntax_t syntax;
3534 {
3535 int i;
3536 const char *name = (const char *) class_name;
3537
3538 /* In case of REG_ICASE "upper" and "lower" match the both of
3539 upper and lower cases. */
3540 if ((syntax & RE_ICASE)
3541 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3542 name = "alpha";
3543
3544 #ifdef RE_ENABLE_I18N
3545 /* Check the space of the arrays. */
3546 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3547 {
3548 /* Not enough, realloc it. */
3549 /* +1 in case of mbcset->nchar_classes is 0. */
3550 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3551 /* Use realloc since array is NULL if *alloc == 0. */
3552 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3553 new_char_class_alloc);
3554 if (BE (new_char_classes == NULL, 0))
3555 return REG_ESPACE;
3556 mbcset->char_classes = new_char_classes;
3557 *char_class_alloc = new_char_class_alloc;
3558 }
3559 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3560 #endif /* RE_ENABLE_I18N */
3561
3562 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3563 for (i = 0; i < SBC_MAX; ++i) \
3564 { \
3565 if (ctype_func (i)) \
3566 { \
3567 int ch = trans ? trans[i] : i; \
3568 bitset_set (sbcset, ch); \
3569 } \
3570 }
3571
3572 if (strcmp (name, "alnum") == 0)
3573 BUILD_CHARCLASS_LOOP (isalnum)
3574 else if (strcmp (name, "cntrl") == 0)
3575 BUILD_CHARCLASS_LOOP (iscntrl)
3576 else if (strcmp (name, "lower") == 0)
3577 BUILD_CHARCLASS_LOOP (islower)
3578 else if (strcmp (name, "space") == 0)
3579 BUILD_CHARCLASS_LOOP (isspace)
3580 else if (strcmp (name, "alpha") == 0)
3581 BUILD_CHARCLASS_LOOP (isalpha)
3582 else if (strcmp (name, "digit") == 0)
3583 BUILD_CHARCLASS_LOOP (isdigit)
3584 else if (strcmp (name, "print") == 0)
3585 BUILD_CHARCLASS_LOOP (isprint)
3586 else if (strcmp (name, "upper") == 0)
3587 BUILD_CHARCLASS_LOOP (isupper)
3588 else if (strcmp (name, "blank") == 0)
3589 BUILD_CHARCLASS_LOOP (isblank)
3590 else if (strcmp (name, "graph") == 0)
3591 BUILD_CHARCLASS_LOOP (isgraph)
3592 else if (strcmp (name, "punct") == 0)
3593 BUILD_CHARCLASS_LOOP (ispunct)
3594 else if (strcmp (name, "xdigit") == 0)
3595 BUILD_CHARCLASS_LOOP (isxdigit)
3596 else
3597 return REG_ECTYPE;
3598
3599 return REG_NOERROR;
3600 }
3601
3602 static bin_tree_t *
3603 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3604 re_dfa_t *dfa;
3605 unsigned RE_TRANSLATE_TYPE trans;
3606 const unsigned char *class_name;
3607 const unsigned char *extra;
3608 int non_match;
3609 reg_errcode_t *err;
3610 {
3611 re_bitset_ptr_t sbcset;
3612 #ifdef RE_ENABLE_I18N
3613 re_charset_t *mbcset;
3614 int alloc = 0;
3615 #endif /* not RE_ENABLE_I18N */
3616 reg_errcode_t ret;
3617 re_token_t br_token;
3618 bin_tree_t *tree;
3619
3620 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3621 #ifdef RE_ENABLE_I18N
3622 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3623 #endif /* RE_ENABLE_I18N */
3624
3625 #ifdef RE_ENABLE_I18N
3626 if (BE (sbcset == NULL || mbcset == NULL, 0))
3627 #else /* not RE_ENABLE_I18N */
3628 if (BE (sbcset == NULL, 0))
3629 #endif /* not RE_ENABLE_I18N */
3630 {
3631 *err = REG_ESPACE;
3632 return NULL;
3633 }
3634
3635 if (non_match)
3636 {
3637 #ifdef RE_ENABLE_I18N
3638 /*
3639 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3640 bitset_set(cset->sbcset, '\0');
3641 */
3642 mbcset->non_match = 1;
3643 #endif /* not RE_ENABLE_I18N */
3644 }
3645
3646 /* We don't care the syntax in this case. */
3647 ret = build_charclass (trans, sbcset,
3648 #ifdef RE_ENABLE_I18N
3649 mbcset, &alloc,
3650 #endif /* RE_ENABLE_I18N */
3651 class_name, 0);
3652
3653 if (BE (ret != REG_NOERROR, 0))
3654 {
3655 re_free (sbcset);
3656 #ifdef RE_ENABLE_I18N
3657 free_charset (mbcset);
3658 #endif /* RE_ENABLE_I18N */
3659 *err = ret;
3660 return NULL;
3661 }
3662 /* \w match '_' also. */
3663 for (; *extra; extra++)
3664 bitset_set (sbcset, *extra);
3665
3666 /* If it is non-matching list. */
3667 if (non_match)
3668 bitset_not (sbcset);
3669
3670 #ifdef RE_ENABLE_I18N
3671 /* Ensure only single byte characters are set. */
3672 if (dfa->mb_cur_max > 1)
3673 bitset_mask (sbcset, dfa->sb_char);
3674 #endif
3675
3676 /* Build a tree for simple bracket. */
3677 br_token.type = SIMPLE_BRACKET;
3678 br_token.opr.sbcset = sbcset;
3679 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680 if (BE (tree == NULL, 0))
3681 goto build_word_op_espace;
3682
3683 #ifdef RE_ENABLE_I18N
3684 if (dfa->mb_cur_max > 1)
3685 {
3686 bin_tree_t *mbc_tree;
3687 /* Build a tree for complex bracket. */
3688 br_token.type = COMPLEX_BRACKET;
3689 br_token.opr.mbcset = mbcset;
3690 dfa->has_mb_node = 1;
3691 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3692 if (BE (mbc_tree == NULL, 0))
3693 goto build_word_op_espace;
3694 /* Then join them by ALT node. */
3695 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3696 if (BE (mbc_tree != NULL, 1))
3697 return tree;
3698 }
3699 else
3700 {
3701 free_charset (mbcset);
3702 return tree;
3703 }
3704 #else /* not RE_ENABLE_I18N */
3705 return tree;
3706 #endif /* not RE_ENABLE_I18N */
3707
3708 build_word_op_espace:
3709 re_free (sbcset);
3710 #ifdef RE_ENABLE_I18N
3711 free_charset (mbcset);
3712 #endif /* RE_ENABLE_I18N */
3713 *err = REG_ESPACE;
3714 return NULL;
3715 }
3716
3717 /* This is intended for the expressions like "a{1,3}".
3718 Fetch a number from `input', and return the number.
3719 Return -1, if the number field is empty like "{,1}".
3720 Return -2, If an error is occured. */
3721
3722 static int
3723 fetch_number (input, token, syntax)
3724 re_string_t *input;
3725 re_token_t *token;
3726 reg_syntax_t syntax;
3727 {
3728 int num = -1;
3729 unsigned char c;
3730 while (1)
3731 {
3732 fetch_token (token, input, syntax);
3733 c = token->opr.c;
3734 if (BE (token->type == END_OF_RE, 0))
3735 return -2;
3736 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3737 break;
3738 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3739 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3740 num = (num > RE_DUP_MAX) ? -2 : num;
3741 }
3742 return num;
3743 }
3744 \f
3745 #ifdef RE_ENABLE_I18N
3746 static void
3747 free_charset (re_charset_t *cset)
3748 {
3749 re_free (cset->mbchars);
3750 # ifdef _LIBC
3751 re_free (cset->coll_syms);
3752 re_free (cset->equiv_classes);
3753 re_free (cset->range_starts);
3754 re_free (cset->range_ends);
3755 # endif
3756 re_free (cset->char_classes);
3757 re_free (cset);
3758 }
3759 #endif /* RE_ENABLE_I18N */
3760 \f
3761 /* Functions for binary tree operation. */
3762
3763 /* Create a tree node. */
3764
3765 static bin_tree_t *
3766 create_tree (dfa, left, right, type)
3767 re_dfa_t *dfa;
3768 bin_tree_t *left;
3769 bin_tree_t *right;
3770 re_token_type_t type;
3771 {
3772 re_token_t t;
3773 t.type = type;
3774 return create_token_tree (dfa, left, right, &t);
3775 }
3776
3777 static bin_tree_t *
3778 create_token_tree (dfa, left, right, token)
3779 re_dfa_t *dfa;
3780 bin_tree_t *left;
3781 bin_tree_t *right;
3782 const re_token_t *token;
3783 {
3784 bin_tree_t *tree;
3785 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3786 {
3787 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3788
3789 if (storage == NULL)
3790 return NULL;
3791 storage->next = dfa->str_tree_storage;
3792 dfa->str_tree_storage = storage;
3793 dfa->str_tree_storage_idx = 0;
3794 }
3795 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3796
3797 tree->parent = NULL;
3798 tree->left = left;
3799 tree->right = right;
3800 tree->token = *token;
3801 tree->token.duplicated = 0;
3802 tree->token.opt_subexp = 0;
3803 tree->first = NULL;
3804 tree->next = NULL;
3805 tree->node_idx = -1;
3806
3807 if (left != NULL)
3808 left->parent = tree;
3809 if (right != NULL)
3810 right->parent = tree;
3811 return tree;
3812 }
3813
3814 /* Mark the tree SRC as an optional subexpression.
3815 To be called from preorder or postorder. */
3816
3817 static reg_errcode_t
3818 mark_opt_subexp (extra, node)
3819 void *extra;
3820 bin_tree_t *node;
3821 {
3822 int idx = (int) (long) extra;
3823 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3824 node->token.opt_subexp = 1;
3825
3826 return REG_NOERROR;
3827 }
3828
3829 /* Free the allocated memory inside NODE. */
3830
3831 static void
3832 free_token (re_token_t *node)
3833 {
3834 #ifdef RE_ENABLE_I18N
3835 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3836 free_charset (node->opr.mbcset);
3837 else
3838 #endif /* RE_ENABLE_I18N */
3839 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3840 re_free (node->opr.sbcset);
3841 }
3842
3843 /* Worker function for tree walking. Free the allocated memory inside NODE
3844 and its children. */
3845
3846 static reg_errcode_t
3847 free_tree (void *extra, bin_tree_t *node)
3848 {
3849 free_token (&node->token);
3850 return REG_NOERROR;
3851 }
3852
3853
3854 /* Duplicate the node SRC, and return new node. This is a preorder
3855 visit similar to the one implemented by the generic visitor, but
3856 we need more infrastructure to maintain two parallel trees --- so,
3857 it's easier to duplicate. */
3858
3859 static bin_tree_t *
3860 duplicate_tree (root, dfa)
3861 const bin_tree_t *root;
3862 re_dfa_t *dfa;
3863 {
3864 const bin_tree_t *node;
3865 bin_tree_t *dup_root;
3866 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3867
3868 for (node = root; ; )
3869 {
3870 /* Create a new tree and link it back to the current parent. */
3871 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3872 if (*p_new == NULL)
3873 return NULL;
3874 (*p_new)->parent = dup_node;
3875 (*p_new)->token.duplicated = 1;
3876 dup_node = *p_new;
3877
3878 /* Go to the left node, or up and to the right. */
3879 if (node->left)
3880 {
3881 node = node->left;
3882 p_new = &dup_node->left;
3883 }
3884 else
3885 {
3886 const bin_tree_t *prev = NULL;
3887 while (node->right == prev || node->right == NULL)
3888 {
3889 prev = node;
3890 node = node->parent;
3891 dup_node = dup_node->parent;
3892 if (!node)
3893 return dup_root;
3894 }
3895 node = node->right;
3896 p_new = &dup_node->right;
3897 }
3898 }
3899 }