]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
b168057a 2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
3b0bdc72 19
e054f494
RA
20#include <stdint.h>
21
8c0ab919
RM
22#ifdef _LIBC
23# include <locale/weight.h>
24#endif
25
3b0bdc72 26static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
01ed6ceb 27 size_t length, reg_syntax_t syntax);
3b0bdc72 28static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
29 const re_dfastate_t *init_state,
30 char *fastmap);
01ed6ceb 31static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
c0a0f9a3 32#ifdef RE_ENABLE_I18N
3b0bdc72 33static void free_charset (re_charset_t *cset);
c0a0f9a3 34#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
35static void free_workarea_compile (regex_t *preg);
36static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
37#ifdef RE_ENABLE_I18N
38static void optimize_utf8 (re_dfa_t *dfa);
39#endif
02f3550c 40static reg_errcode_t analyze (regex_t *preg);
02f3550c
UD
41static reg_errcode_t preorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44static reg_errcode_t postorder (bin_tree_t *root,
45 reg_errcode_t (fn (void *, bin_tree_t *)),
46 void *extra);
47static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
48static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
49static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
50 bin_tree_t *node);
51static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
52static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
53static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
2d87db5b 54static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
01ed6ceb 55static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
a7d5c291 56 unsigned int constraint);
3b0bdc72 57static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 58static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
15a7d175 59 int node, int root);
963d8d78 60static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
3b0bdc72 61static int fetch_number (re_string_t *input, re_token_t *token,
15a7d175 62 reg_syntax_t syntax);
3b0bdc72 63static int peek_token (re_token_t *token, re_string_t *input,
9b0c3f38 64 reg_syntax_t syntax) internal_function;
3b0bdc72 65static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 66 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 67static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
3b0bdc72 70static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175
UD
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
3b0bdc72 73static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175
UD
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
3b0bdc72 76static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
3b0bdc72 79static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
80 re_dfa_t *dfa, re_token_t *token,
81 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 82static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
83 re_token_t *token, reg_syntax_t syntax,
84 reg_errcode_t *err);
3b0bdc72 85static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
86 re_string_t *regexp,
87 re_token_t *token, int token_len,
88 re_dfa_t *dfa,
78d8b07a
UD
89 reg_syntax_t syntax,
90 int accept_hyphen);
3b0bdc72 91static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
92 re_string_t *regexp,
93 re_token_t *token);
c0a0f9a3 94#ifdef RE_ENABLE_I18N
2c05d33f 95static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175
UD
96 re_charset_t *mbcset,
97 int *equiv_class_alloc,
98 const unsigned char *name);
997470b3 99static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 100 bitset_t sbcset,
15a7d175
UD
101 re_charset_t *mbcset,
102 int *char_class_alloc,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
c0a0f9a3 105#else /* not RE_ENABLE_I18N */
2c05d33f 106static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175 107 const unsigned char *name);
997470b3 108static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 109 bitset_t sbcset,
15a7d175
UD
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
c0a0f9a3 112#endif /* not RE_ENABLE_I18N */
86576b62 113static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
997470b3 114 RE_TRANSLATE_TYPE trans,
06e8303a 115 const unsigned char *class_name,
10677727
UD
116 const unsigned char *extra,
117 int non_match, reg_errcode_t *err);
ee70274a
UD
118static bin_tree_t *create_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
02f3550c
UD
120 re_token_type_t type);
121static bin_tree_t *create_token_tree (re_dfa_t *dfa,
122 bin_tree_t *left, bin_tree_t *right,
123 const re_token_t *token);
3b0bdc72 124static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
02f3550c
UD
125static void free_token (re_token_t *node);
126static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
127static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
3b0bdc72
UD
128\f
129/* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
133
6455d255 134const char __re_error_msgid[] attribute_hidden =
3b0bdc72
UD
135 {
136#define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
138 "\0"
139#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
141 "\0"
142#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
144 "\0"
145#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
147 "\0"
148#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
150 "\0"
151#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
153 "\0"
154#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
156 "\0"
157#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
159 "\0"
160#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
162 "\0"
163#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
165 "\0"
166#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
168 "\0"
169#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
171 "\0"
172#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
174 "\0"
175#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
177 "\0"
178#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
180 "\0"
181#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
183 "\0"
184#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 };
187
6455d255 188const size_t __re_error_msgid_idx[] attribute_hidden =
3b0bdc72
UD
189 {
190 REG_NOERROR_IDX,
191 REG_NOMATCH_IDX,
192 REG_BADPAT_IDX,
193 REG_ECOLLATE_IDX,
194 REG_ECTYPE_IDX,
195 REG_EESCAPE_IDX,
196 REG_ESUBREG_IDX,
197 REG_EBRACK_IDX,
198 REG_EPAREN_IDX,
199 REG_EBRACE_IDX,
200 REG_BADBR_IDX,
201 REG_ERANGE_IDX,
202 REG_ESPACE_IDX,
203 REG_BADRPT_IDX,
204 REG_EEND_IDX,
205 REG_ESIZE_IDX,
206 REG_ERPAREN_IDX
207 };
208\f
209/* Entry points for GNU code. */
210
211/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
213 Returns 0 if the pattern was valid, otherwise an error string.
214
215 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
216 are set in BUFP on entry. */
217
218const char *
219re_compile_pattern (pattern, length, bufp)
220 const char *pattern;
221 size_t length;
222 struct re_pattern_buffer *bufp;
223{
224 reg_errcode_t ret;
225
3b0bdc72
UD
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
c06a6956
UD
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
3b0bdc72
UD
230
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
233
75e4a282 234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
235
236 if (!ret)
237 return NULL;
6455d255 238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
239}
240#ifdef _LIBC
241weak_alias (__re_compile_pattern, re_compile_pattern)
242#endif
243
244/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247/* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249reg_syntax_t re_syntax_options;
250
251
252/* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
255
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
258
259reg_syntax_t
260re_set_syntax (syntax)
261 reg_syntax_t syntax;
262{
263 reg_syntax_t ret = re_syntax_options;
264
265 re_syntax_options = syntax;
266 return ret;
267}
268#ifdef _LIBC
269weak_alias (__re_set_syntax, re_set_syntax)
270#endif
271
272int
273re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
275{
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
278
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
288 return 0;
289}
290#ifdef _LIBC
291weak_alias (__re_compile_fastmap, re_compile_fastmap)
292#endif
293
1b2c2628 294static inline void
25337753 295__attribute ((always_inline))
1b2c2628
UD
296re_set_fastmap (char *fastmap, int icase, int ch)
297{
298 fastmap[ch] = 1;
299 if (icase)
300 fastmap[tolower (ch)] = 1;
301}
302
3b0bdc72
UD
303/* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
305
306static void
0fd8ae9c
UD
307re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 char *fastmap)
3b0bdc72
UD
309{
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
311 int node_cnt;
3c0fb574 312 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 {
315 int node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
317
318 if (type == CHARACTER)
74e12fbc
UD
319 {
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321#ifdef RE_ENABLE_I18N
14744156 322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 323 {
3c0fb574 324 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
74e12fbc
UD
325 wchar_t wc;
326 mbstate_t state;
327
328 p = buf;
329 *p++ = dfa->nodes[node].opr.c;
330 while (++node < dfa->nodes_len
331 && dfa->nodes[node].type == CHARACTER
332 && dfa->nodes[node].mb_partial)
333 *p++ = dfa->nodes[node].opr.c;
2c05d33f 334 memset (&state, '\0', sizeof (state));
b3918c7d
UD
335 if (__mbrtowc (&wc, (const char *) buf, p - buf,
336 &state) == p - buf
88764ae2
UD
337 && (__wcrtomb ((char *) buf, towlower (wc), &state)
338 != (size_t) -1))
74e12fbc
UD
339 re_set_fastmap (fastmap, 0, buf[0]);
340 }
341#endif
342 }
3b0bdc72 343 else if (type == SIMPLE_BRACKET)
15a7d175 344 {
2c05d33f
UD
345 int i, ch;
346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
347 {
348 int j;
349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 if (w & ((bitset_word_t) 1 << j))
352 re_set_fastmap (fastmap, icase, ch);
353 }
15a7d175 354 }
c0a0f9a3 355#ifdef RE_ENABLE_I18N
3b0bdc72 356 else if (type == COMPLEX_BRACKET)
15a7d175 357 {
15a7d175 358 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
bdb56bac
UD
359 int i;
360
c0a0f9a3 361# ifdef _LIBC
bdb56bac
UD
362 /* See if we have to try all bytes which start multiple collation
363 elements.
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 && (cset->ncoll_syms || cset->nranges))
15a7d175 370 {
15a7d175
UD
371 const int32_t *table = (const int32_t *)
372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2c05d33f
UD
373 for (i = 0; i < SBC_MAX; ++i)
374 if (table[i] < 0)
375 re_set_fastmap (fastmap, icase, i);
15a7d175 376 }
bdb56bac
UD
377# endif /* _LIBC */
378
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa->mb_cur_max > 1
815d8147 384 && (cset->nchar_classes || cset->non_match || cset->nranges
bdb56bac
UD
385# ifdef _LIBC
386 || cset->nequiv_classes
387# endif /* _LIBC */
388 ))
15a7d175 389 {
bdb56bac
UD
390 unsigned char c = 0;
391 do
74e12fbc 392 {
bdb56bac
UD
393 mbstate_t mbs;
394 memset (&mbs, 0, sizeof (mbs));
395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 re_set_fastmap (fastmap, false, (int) c);
74e12fbc 397 }
bdb56bac 398 while (++c != 0);
15a7d175 399 }
bdb56bac
UD
400
401 else
402 {
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i = 0; i < cset->nmbchars; ++i)
405 {
406 char buf[256];
407 mbstate_t state;
408 memset (&state, '\0', sizeof (state));
409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
412 {
413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
414 != (size_t) -1)
415 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 }
aa732e2b
UD
417 }
418 }
15a7d175 419 }
c0a0f9a3 420#endif /* RE_ENABLE_I18N */
c0d5034e
UD
421 else if (type == OP_PERIOD
422#ifdef RE_ENABLE_I18N
423 || type == OP_UTF8_PERIOD
424#endif /* RE_ENABLE_I18N */
425 || type == END_OF_RE)
15a7d175
UD
426 {
427 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 if (type == END_OF_RE)
429 bufp->can_be_null = 1;
430 return;
431 }
3b0bdc72
UD
432 }
433}
434\f
435/* Entry point for POSIX code. */
436/* regcomp takes a regular expression as a string and compiles it.
437
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
440
441 `buffer' to the compiled pattern;
442 `used' to the length of the compiled pattern;
443 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 `fastmap' to an allocated space for the fastmap;
448 `fastmap_accurate' to zero;
449 `re_nsub' to the number of subexpressions in PATTERN.
450
451 PATTERN is the address of the pattern string.
452
453 CFLAGS is a series of bits which affect compilation.
454
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
457
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
460
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
463
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
466 registers.
467
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
470
471int
472regcomp (preg, pattern, cflags)
75e4a282
UD
473 regex_t *__restrict preg;
474 const char *__restrict pattern;
3b0bdc72
UD
475 int cflags;
476{
477 reg_errcode_t ret;
478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 479 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
480
481 preg->buffer = NULL;
482 preg->allocated = 0;
483 preg->used = 0;
484
485 /* Try to allocate space for the fastmap. */
486 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 487 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
488 return REG_ESPACE;
489
490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
491
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags & REG_NEWLINE)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax &= ~RE_DOT_NEWLINE;
496 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497 /* It also changes the matching behavior. */
498 preg->newline_anchor = 1;
499 }
500 else
501 preg->newline_anchor = 0;
502 preg->no_sub = !!(cflags & REG_NOSUB);
503 preg->translate = NULL;
504
505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
506
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret == REG_ERPAREN)
510 ret = REG_EPAREN;
511
a9388965 512 /* We have already checked preg->fastmap != NULL. */
bc15410e 513 if (BE (ret == REG_NOERROR, 1))
71ccd330 514 /* Compute the fastmap now, since regexec cannot modify the pattern
86576b62 515 buffer. This function never fails in this implementation. */
83b038f2 516 (void) re_compile_fastmap (preg);
71ccd330 517 else
3b0bdc72 518 {
71ccd330
UD
519 /* Some error occurred while compiling the expression. */
520 re_free (preg->fastmap);
521 preg->fastmap = NULL;
3b0bdc72
UD
522 }
523
524 return (int) ret;
525}
526#ifdef _LIBC
527weak_alias (__regcomp, regcomp)
528#endif
529
530/* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
532
533size_t
534regerror (errcode, preg, errbuf, errbuf_size)
535 int errcode;
2d87db5b
UD
536 const regex_t *__restrict preg;
537 char *__restrict errbuf;
3b0bdc72
UD
538 size_t errbuf_size;
539{
540 const char *msg;
541 size_t msg_size;
542
bc15410e 543 if (BE (errcode < 0
15a7d175
UD
544 || errcode >= (int) (sizeof (__re_error_msgid_idx)
545 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
546 /* Only error codes returned by the rest of the code should be passed
547 to this routine. If we are given anything else, or if other regex
548 code generates an invalid error code, then the program has a bug.
549 Dump core so we can fix it. */
550 abort ();
551
6455d255 552 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
553
554 msg_size = strlen (msg) + 1; /* Includes the null. */
555
bc15410e 556 if (BE (errbuf_size != 0, 1))
3b0bdc72 557 {
bc15410e 558 if (BE (msg_size > errbuf_size, 0))
15a7d175 559 {
3b0bdc72
UD
560#if defined HAVE_MEMPCPY || defined _LIBC
561 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
562#else
15a7d175
UD
563 memcpy (errbuf, msg, errbuf_size - 1);
564 errbuf[errbuf_size - 1] = 0;
3b0bdc72 565#endif
15a7d175 566 }
3b0bdc72 567 else
15a7d175 568 memcpy (errbuf, msg, msg_size);
3b0bdc72
UD
569 }
570
571 return msg_size;
572}
573#ifdef _LIBC
574weak_alias (__regerror, regerror)
575#endif
576
3b0bdc72 577
e40a38b3
UD
578#ifdef RE_ENABLE_I18N
579/* This static array is used for the map to single-byte characters when
580 UTF-8 is used. Otherwise we would allocate memory just to initialize
581 it the same all the time. UTF-8 is the preferred encoding so this is
582 a worthwhile optimization. */
2c05d33f 583static const bitset_t utf8_sb_map =
e40a38b3
UD
584{
585 /* Set the first 128 bits. */
2c05d33f 586 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
e40a38b3
UD
587};
588#endif
589
590
71ccd330
UD
591static void
592free_dfa_content (re_dfa_t *dfa)
3b0bdc72
UD
593{
594 int i, j;
3b0bdc72 595
ee70274a
UD
596 if (dfa->nodes)
597 for (i = 0; i < dfa->nodes_len; ++i)
02f3550c 598 free_token (dfa->nodes + i);
71ccd330
UD
599 re_free (dfa->nexts);
600 for (i = 0; i < dfa->nodes_len; ++i)
601 {
602 if (dfa->eclosures != NULL)
603 re_node_set_free (dfa->eclosures + i);
604 if (dfa->inveclosures != NULL)
605 re_node_set_free (dfa->inveclosures + i);
606 if (dfa->edests != NULL)
607 re_node_set_free (dfa->edests + i);
608 }
609 re_free (dfa->edests);
610 re_free (dfa->eclosures);
611 re_free (dfa->inveclosures);
612 re_free (dfa->nodes);
3b0bdc72 613
ee70274a
UD
614 if (dfa->state_table)
615 for (i = 0; i <= dfa->state_hash_mask; ++i)
616 {
617 struct re_state_table_entry *entry = dfa->state_table + i;
618 for (j = 0; j < entry->num; ++j)
619 {
620 re_dfastate_t *state = entry->array[j];
621 free_state (state);
622 }
21f5de55 623 re_free (entry->array);
ee70274a 624 }
71ccd330 625 re_free (dfa->state_table);
65e6becf 626#ifdef RE_ENABLE_I18N
e40a38b3
UD
627 if (dfa->sb_char != utf8_sb_map)
628 re_free (dfa->sb_char);
a96c63ed 629#endif
c06a6956 630 re_free (dfa->subexp_map);
0742e48e 631#ifdef DEBUG
71ccd330 632 re_free (dfa->re_str);
0742e48e 633#endif
71ccd330
UD
634
635 re_free (dfa);
636}
637
638
639/* Free dynamically allocated space used by PREG. */
640
641void
642regfree (preg)
643 regex_t *preg;
644{
645 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
646 if (BE (dfa != NULL, 1))
647 free_dfa_content (dfa);
86576b62
UD
648 preg->buffer = NULL;
649 preg->allocated = 0;
71ccd330 650
3b0bdc72 651 re_free (preg->fastmap);
86576b62
UD
652 preg->fastmap = NULL;
653
654 re_free (preg->translate);
655 preg->translate = NULL;
3b0bdc72
UD
656}
657#ifdef _LIBC
658weak_alias (__regfree, regfree)
659#endif
660\f
661/* Entry points compatible with 4.2 BSD regex library. We don't define
662 them unless specifically requested. */
663
664#if defined _REGEX_RE_COMP || defined _LIBC
665
666/* BSD has one and only one pattern buffer. */
667static struct re_pattern_buffer re_comp_buf;
668
669char *
670# ifdef _LIBC
671/* Make these definitions weak in libc, so POSIX programs can redefine
672 these names if they don't use our functions, and still use
673 regcomp/regexec above without link errors. */
674weak_function
675# endif
676re_comp (s)
677 const char *s;
678{
679 reg_errcode_t ret;
240e87c2 680 char *fastmap;
3b0bdc72
UD
681
682 if (!s)
683 {
684 if (!re_comp_buf.buffer)
685 return gettext ("No previous regular expression");
686 return 0;
687 }
688
240e87c2
RM
689 if (re_comp_buf.buffer)
690 {
691 fastmap = re_comp_buf.fastmap;
692 re_comp_buf.fastmap = NULL;
693 __regfree (&re_comp_buf);
1b2c2628 694 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
695 re_comp_buf.fastmap = fastmap;
696 }
697
698 if (re_comp_buf.fastmap == NULL)
3b0bdc72
UD
699 {
700 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
701 if (re_comp_buf.fastmap == NULL)
6455d255
UD
702 return (char *) gettext (__re_error_msgid
703 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
704 }
705
706 /* Since `re_exec' always passes NULL for the `regs' argument, we
707 don't need to initialize the pattern buffer fields which affect it. */
708
709 /* Match anchors at newlines. */
710 re_comp_buf.newline_anchor = 1;
711
712 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
713
714 if (!ret)
715 return NULL;
716
717 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455d255 718 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 719}
240e87c2
RM
720
721#ifdef _LIBC
c877418f 722libc_freeres_fn (free_mem)
240e87c2
RM
723{
724 __regfree (&re_comp_buf);
725}
240e87c2
RM
726#endif
727
3b0bdc72
UD
728#endif /* _REGEX_RE_COMP */
729\f
730/* Internal entry point.
731 Compile the regular expression PATTERN, whose length is LENGTH.
732 SYNTAX indicate regular expression's syntax. */
733
734static reg_errcode_t
0fd8ae9c
UD
735re_compile_internal (regex_t *preg, const char * pattern, size_t length,
736 reg_syntax_t syntax)
3b0bdc72
UD
737{
738 reg_errcode_t err = REG_NOERROR;
739 re_dfa_t *dfa;
740 re_string_t regexp;
741
742 /* Initialize the pattern buffer. */
743 preg->fastmap_accurate = 0;
744 preg->syntax = syntax;
745 preg->not_bol = preg->not_eol = 0;
746 preg->used = 0;
747 preg->re_nsub = 0;
1b2c2628
UD
748 preg->can_be_null = 0;
749 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
750
751 /* Initialize the dfa. */
752 dfa = (re_dfa_t *) preg->buffer;
951d6408 753 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3b0bdc72
UD
754 {
755 /* If zero allocated, but buffer is non-null, try to realloc
756 enough space. This loses if buffer's address is bogus, but
757 that is the user's responsibility. If ->buffer is NULL this
758 is a simple allocation. */
759 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
760 if (dfa == NULL)
761 return REG_ESPACE;
3b0bdc72 762 preg->allocated = sizeof (re_dfa_t);
951d6408 763 preg->buffer = (unsigned char *) dfa;
3b0bdc72 764 }
3b0bdc72
UD
765 preg->used = sizeof (re_dfa_t);
766
767 err = init_dfa (dfa, length);
bc15410e 768 if (BE (err != REG_NOERROR, 0))
3b0bdc72 769 {
ee70274a 770 free_dfa_content (dfa);
3b0bdc72 771 preg->buffer = NULL;
ca3c505e 772 preg->allocated = 0;
3b0bdc72
UD
773 return err;
774 }
0742e48e 775#ifdef DEBUG
01ed6ceb 776 /* Note: length+1 will not overflow since it is checked in init_dfa. */
0742e48e
UD
777 dfa->re_str = re_malloc (char, length + 1);
778 strncpy (dfa->re_str, pattern, length + 1);
779#endif
3b0bdc72 780
96c91d1c
UD
781 __libc_lock_init (dfa->lock);
782
612546c6 783 err = re_string_construct (&regexp, pattern, length, preg->translate,
f0c7c524 784 syntax & RE_ICASE, dfa);
bc15410e 785 if (BE (err != REG_NOERROR, 0))
3b0bdc72 786 {
ee70274a
UD
787 re_compile_internal_free_return:
788 free_workarea_compile (preg);
789 re_string_destruct (&regexp);
790 free_dfa_content (dfa);
3b0bdc72 791 preg->buffer = NULL;
ca3c505e 792 preg->allocated = 0;
3b0bdc72
UD
793 return err;
794 }
795
796 /* Parse the regular expression, and build a structure tree. */
797 preg->re_nsub = 0;
798 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 799 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
800 goto re_compile_internal_free_return;
801
02f3550c
UD
802 /* Analyze the tree and create the nfa. */
803 err = analyze (preg);
804 if (BE (err != REG_NOERROR, 0))
805 goto re_compile_internal_free_return;
806
14744156
UD
807#ifdef RE_ENABLE_I18N
808 /* If possible, do searching in single byte encoding to speed things up. */
ad7f28c2 809 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
14744156
UD
810 optimize_utf8 (dfa);
811#endif
812
3b0bdc72
UD
813 /* Then create the initial state of the dfa. */
814 err = create_initial_state (dfa);
3b0bdc72 815
3b0bdc72
UD
816 /* Release work areas. */
817 free_workarea_compile (preg);
818 re_string_destruct (&regexp);
819
71ccd330
UD
820 if (BE (err != REG_NOERROR, 0))
821 {
71ccd330
UD
822 free_dfa_content (dfa);
823 preg->buffer = NULL;
ca3c505e 824 preg->allocated = 0;
71ccd330
UD
825 }
826
3b0bdc72
UD
827 return err;
828}
829
830/* Initialize DFA. We use the length of the regular expression PAT_LEN
831 as the initial length of some arrays. */
832
833static reg_errcode_t
0fd8ae9c 834init_dfa (re_dfa_t *dfa, size_t pat_len)
3b0bdc72 835{
01ed6ceb 836 unsigned int table_size;
e40a38b3
UD
837#ifndef _LIBC
838 char *codeset_name;
839#endif
81c64d40
UD
840
841 memset (dfa, '\0', sizeof (re_dfa_t));
842
ee70274a
UD
843 /* Force allocation of str_tree_storage the first time. */
844 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
845
01ed6ceb
UD
846 /* Avoid overflows. */
847 if (pat_len == SIZE_MAX)
848 return REG_ESPACE;
849
3b0bdc72
UD
850 dfa->nodes_alloc = pat_len + 1;
851 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
852
3b0bdc72 853 /* table_size = 2 ^ ceil(log pat_len) */
01ed6ceb 854 for (table_size = 1; ; table_size <<= 1)
3b0bdc72
UD
855 if (table_size > pat_len)
856 break;
857
858 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
859 dfa->state_hash_mask = table_size - 1;
860
3c0fb574
UD
861 dfa->mb_cur_max = MB_CUR_MAX;
862#ifdef _LIBC
bb3f4825 863 if (dfa->mb_cur_max == 6
3c0fb574
UD
864 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
865 dfa->is_utf8 = 1;
f0c7c524
UD
866 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
867 != 0);
e40a38b3
UD
868#else
869# ifdef HAVE_LANGINFO_CODESET
870 codeset_name = nl_langinfo (CODESET);
871# else
872 codeset_name = getenv ("LC_ALL");
ea161661 873 if (codeset_name == NULL || codeset_name[0] == '\0')
e40a38b3 874 codeset_name = getenv ("LC_CTYPE");
ea161661 875 if (codeset_name == NULL || codeset_name[0] == '\0')
e40a38b3
UD
876 codeset_name = getenv ("LANG");
877 if (codeset_name == NULL)
878 codeset_name = "";
879 else if (strchr (codeset_name, '.') != NULL)
880 codeset_name = strchr (codeset_name, '.') + 1;
881# endif
882
883 if (strcasecmp (codeset_name, "UTF-8") == 0
884 || strcasecmp (codeset_name, "UTF8") == 0)
885 dfa->is_utf8 = 1;
886
887 /* We check exhaustively in the loop below if this charset is a
888 superset of ASCII. */
889 dfa->map_notascii = 0;
3c0fb574 890#endif
e40a38b3 891
65e6becf
UD
892#ifdef RE_ENABLE_I18N
893 if (dfa->mb_cur_max > 1)
894 {
65e6becf 895 if (dfa->is_utf8)
e40a38b3 896 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
65e6becf 897 else
e40a38b3
UD
898 {
899 int i, j, ch;
900
2c05d33f 901 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
e40a38b3
UD
902 if (BE (dfa->sb_char == NULL, 0))
903 return REG_ESPACE;
904
2c05d33f
UD
905 /* Set the bits corresponding to single byte chars. */
906 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
907 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
e40a38b3 908 {
2d87db5b 909 wint_t wch = __btowc (ch);
e40a38b3 910 if (wch != WEOF)
2c05d33f 911 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
e40a38b3 912# ifndef _LIBC
2d87db5b 913 if (isascii (ch) && wch != ch)
e40a38b3
UD
914 dfa->map_notascii = 1;
915# endif
916 }
917 }
65e6becf
UD
918 }
919#endif
3c0fb574 920
ae73c6c1 921 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
ee70274a 922 return REG_ESPACE;
3b0bdc72
UD
923 return REG_NOERROR;
924}
925
926/* Initialize WORD_CHAR table, which indicate which character is
927 "word". In this case "word" means that it is the word construction
928 character used by some operators like "\<", "\>", etc. */
929
56b168be 930static void
9b0c3f38 931internal_function
0fd8ae9c 932init_word_char (re_dfa_t *dfa)
3b0bdc72 933{
56b168be 934 dfa->word_ops_used = 1;
9f115170
UD
935 int i = 0;
936 int ch = 0;
937 if (BE (dfa->map_notascii == 0, 1))
938 {
939 if (sizeof (dfa->word_char[0]) == 8)
940 {
329bc018
RM
941 /* The extra temporaries here avoid "implicitly truncated"
942 warnings in the case when this is dead code, i.e. 32-bit. */
943 const uint64_t wc0 = UINT64_C (0x03ff000000000000);
944 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
945 dfa->word_char[0] = wc0;
946 dfa->word_char[1] = wc1;
9f115170
UD
947 i = 2;
948 }
949 else if (sizeof (dfa->word_char[0]) == 4)
950 {
951 dfa->word_char[0] = UINT32_C (0x00000000);
952 dfa->word_char[1] = UINT32_C (0x03ff0000);
953 dfa->word_char[2] = UINT32_C (0x87fffffe);
954 dfa->word_char[3] = UINT32_C (0x07fffffe);
955 i = 4;
956 }
957 else
958 abort ();
959 ch = 128;
960
961 if (BE (dfa->is_utf8, 1))
962 {
963 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964 return;
965 }
966 }
967
968 for (; i < BITSET_WORDS; ++i)
969 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3b0bdc72 970 if (isalnum (ch) || ch == '_')
2c05d33f 971 dfa->word_char[i] |= (bitset_word_t) 1 << j;
3b0bdc72
UD
972}
973
974/* Free the work area which are only used while compiling. */
975
976static void
0fd8ae9c 977free_workarea_compile (regex_t *preg)
3b0bdc72
UD
978{
979 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
ee70274a
UD
980 bin_tree_storage_t *storage, *next;
981 for (storage = dfa->str_tree_storage; storage; storage = next)
982 {
983 next = storage->next;
984 re_free (storage);
985 }
986 dfa->str_tree_storage = NULL;
987 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3b0bdc72 988 dfa->str_tree = NULL;
a7d5c291
UD
989 re_free (dfa->org_indices);
990 dfa->org_indices = NULL;
3b0bdc72
UD
991}
992
993/* Create initial states for all contexts. */
994
995static reg_errcode_t
0fd8ae9c 996create_initial_state (re_dfa_t *dfa)
3b0bdc72
UD
997{
998 int first, i;
999 reg_errcode_t err;
1000 re_node_set init_nodes;
1001
1002 /* Initial states have the epsilon closure of the node which is
1003 the first node of the regular expression. */
02f3550c 1004 first = dfa->str_tree->first->node_idx;
3b0bdc72
UD
1005 dfa->init_node = first;
1006 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 1007 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
1008 return err;
1009
1010 /* The back-references which are in initial states can epsilon transit,
1011 since in this case all of the subexpressions can be null.
1012 Then we add epsilon closures of the nodes which are the next nodes of
1013 the back-references. */
1014 if (dfa->nbackref > 0)
1015 for (i = 0; i < init_nodes.nelem; ++i)
1016 {
15a7d175
UD
1017 int node_idx = init_nodes.elems[i];
1018 re_token_type_t type = dfa->nodes[node_idx].type;
1019
1020 int clexp_idx;
1021 if (type != OP_BACK_REF)
1022 continue;
1023 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1024 {
1025 re_token_t *clexp_node;
1026 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1027 if (clexp_node->type == OP_CLOSE_SUBEXP
ae73c6c1 1028 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
15a7d175
UD
1029 break;
1030 }
1031 if (clexp_idx == init_nodes.nelem)
1032 continue;
1033
1034 if (type == OP_BACK_REF)
1035 {
1036 int dest_idx = dfa->edests[node_idx].elems[0];
1037 if (!re_node_set_contains (&init_nodes, dest_idx))
1038 {
2da42bc0
UD
1039 reg_errcode_t err = re_node_set_merge (&init_nodes,
1040 dfa->eclosures
1041 + dest_idx);
1042 if (err != REG_NOERROR)
1043 return err;
15a7d175
UD
1044 i = 0;
1045 }
1046 }
3b0bdc72
UD
1047 }
1048
1049 /* It must be the first time to invoke acquire_state. */
a9388965
UD
1050 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 1052 if (BE (dfa->init_state == NULL, 0))
a9388965 1053 return err;
3b0bdc72
UD
1054 if (dfa->init_state->has_constraint)
1055 {
a9388965 1056 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1057 CONTEXT_WORD);
a9388965 1058 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1059 CONTEXT_NEWLINE);
a9388965 1060 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
1061 &init_nodes,
1062 CONTEXT_NEWLINE
1063 | CONTEXT_BEGBUF);
bc15410e 1064 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
15a7d175
UD
1065 || dfa->init_state_begbuf == NULL, 0))
1066 return err;
3b0bdc72
UD
1067 }
1068 else
1069 dfa->init_state_word = dfa->init_state_nl
1070 = dfa->init_state_begbuf = dfa->init_state;
1071
3b0bdc72
UD
1072 re_node_set_free (&init_nodes);
1073 return REG_NOERROR;
1074}
1075\f
14744156
UD
1076#ifdef RE_ENABLE_I18N
1077/* If it is possible to do searching in single byte encoding instead of UTF-8
1078 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1079 DFA nodes where needed. */
1080
1081static void
0fd8ae9c 1082optimize_utf8 (re_dfa_t *dfa)
14744156 1083{
ad7f28c2 1084 int node, i, mb_chars = 0, has_period = 0;
14744156
UD
1085
1086 for (node = 0; node < dfa->nodes_len; ++node)
1087 switch (dfa->nodes[node].type)
1088 {
1089 case CHARACTER:
14744156 1090 if (dfa->nodes[node].opr.c >= 0x80)
5f93cd52 1091 mb_chars = 1;
14744156
UD
1092 break;
1093 case ANCHOR:
bc3e1c12 1094 switch (dfa->nodes[node].opr.ctx_type)
14744156
UD
1095 {
1096 case LINE_FIRST:
1097 case LINE_LAST:
1098 case BUF_FIRST:
1099 case BUF_LAST:
1100 break;
1101 default:
0caca71a
UD
1102 /* Word anchors etc. cannot be handled. It's okay to test
1103 opr.ctx_type since constraints (for all DFA nodes) are
1104 created by ORing one or more opr.ctx_type values. */
14744156
UD
1105 return;
1106 }
1107 break;
ad7f28c2 1108 case OP_PERIOD:
21f5de55
PE
1109 has_period = 1;
1110 break;
14744156
UD
1111 case OP_BACK_REF:
1112 case OP_ALT:
1113 case END_OF_RE:
14744156 1114 case OP_DUP_ASTERISK:
14744156
UD
1115 case OP_OPEN_SUBEXP:
1116 case OP_CLOSE_SUBEXP:
1117 break;
02f3550c
UD
1118 case COMPLEX_BRACKET:
1119 return;
a8067e8f 1120 case SIMPLE_BRACKET:
2c05d33f
UD
1121 /* Just double check. The non-ASCII range starts at 0x80. */
1122 assert (0x80 % BITSET_WORD_BITS == 0);
21f5de55 1123 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
a8067e8f
UD
1124 if (dfa->nodes[node].opr.sbcset[i])
1125 return;
1126 break;
14744156 1127 default:
02f3550c 1128 abort ();
14744156
UD
1129 }
1130
ad7f28c2 1131 if (mb_chars || has_period)
5f93cd52 1132 for (node = 0; node < dfa->nodes_len; ++node)
ad7f28c2
UD
1133 {
1134 if (dfa->nodes[node].type == CHARACTER
1135 && dfa->nodes[node].opr.c >= 0x80)
1136 dfa->nodes[node].mb_partial = 0;
1137 else if (dfa->nodes[node].type == OP_PERIOD)
1138 dfa->nodes[node].type = OP_UTF8_PERIOD;
1139 }
5f93cd52 1140
14744156
UD
1141 /* The search can be in single byte locale. */
1142 dfa->mb_cur_max = 1;
1143 dfa->is_utf8 = 0;
ad7f28c2 1144 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
14744156
UD
1145}
1146#endif
1147\f
3b0bdc72
UD
1148/* Analyze the structure tree, and calculate "first", "next", "edest",
1149 "eclosure", and "inveclosure". */
1150
1151static reg_errcode_t
0fd8ae9c 1152analyze (regex_t *preg)
3b0bdc72 1153{
02f3550c 1154 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3b0bdc72
UD
1155 reg_errcode_t ret;
1156
1157 /* Allocate arrays. */
3b0bdc72 1158 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
a7d5c291 1159 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3b0bdc72
UD
1160 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1161 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
a7d5c291 1162 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
963d8d78 1163 || dfa->eclosures == NULL, 0))
3b0bdc72 1164 return REG_ESPACE;
02f3550c
UD
1165
1166 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1167 if (dfa->subexp_map != NULL)
3b0bdc72 1168 {
02f3550c
UD
1169 int i;
1170 for (i = 0; i < preg->re_nsub; i++)
1171 dfa->subexp_map[i] = i;
1172 preorder (dfa->str_tree, optimize_subexps, dfa);
1173 for (i = 0; i < preg->re_nsub; i++)
1174 if (dfa->subexp_map[i] != i)
1175 break;
1176 if (i == preg->re_nsub)
1177 {
1178 free (dfa->subexp_map);
1179 dfa->subexp_map = NULL;
1180 }
3b0bdc72
UD
1181 }
1182
02f3550c
UD
1183 ret = postorder (dfa->str_tree, lower_subexps, preg);
1184 if (BE (ret != REG_NOERROR, 0))
1185 return ret;
1186 ret = postorder (dfa->str_tree, calc_first, dfa);
1187 if (BE (ret != REG_NOERROR, 0))
1188 return ret;
1189 preorder (dfa->str_tree, calc_next, dfa);
1190 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1191 if (BE (ret != REG_NOERROR, 0))
1192 return ret;
1193 ret = calc_eclosure (dfa);
1194 if (BE (ret != REG_NOERROR, 0))
1195 return ret;
963d8d78
UD
1196
1197 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1198 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1199 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1200 || dfa->nbackref)
1201 {
1202 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1203 if (BE (dfa->inveclosures == NULL, 0))
21f5de55 1204 return REG_ESPACE;
963d8d78
UD
1205 ret = calc_inveclosure (dfa);
1206 }
1207
02f3550c
UD
1208 return ret;
1209}
1210
1211/* Our parse trees are very unbalanced, so we cannot use a stack to
1212 implement parse tree visits. Instead, we use parent pointers and
1213 some hairy code in these two functions. */
1214static reg_errcode_t
0fd8ae9c
UD
1215postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1216 void *extra)
02f3550c
UD
1217{
1218 bin_tree_t *node, *prev;
1219
1220 for (node = root; ; )
3b0bdc72 1221 {
02f3550c
UD
1222 /* Descend down the tree, preferably to the left (or to the right
1223 if that's the only child). */
1224 while (node->left || node->right)
1225 if (node->left)
21f5de55
PE
1226 node = node->left;
1227 else
1228 node = node->right;
02f3550c
UD
1229
1230 do
1231 {
1232 reg_errcode_t err = fn (extra, node);
1233 if (BE (err != REG_NOERROR, 0))
1234 return err;
21f5de55 1235 if (node->parent == NULL)
02f3550c
UD
1236 return REG_NOERROR;
1237 prev = node;
1238 node = node->parent;
1239 }
1240 /* Go up while we have a node that is reached from the right. */
1241 while (node->right == prev || node->right == NULL);
1242 node = node->right;
3b0bdc72 1243 }
3b0bdc72
UD
1244}
1245
02f3550c 1246static reg_errcode_t
0fd8ae9c
UD
1247preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248 void *extra)
02f3550c
UD
1249{
1250 bin_tree_t *node;
1251
1252 for (node = root; ; )
1253 {
1254 reg_errcode_t err = fn (extra, node);
1255 if (BE (err != REG_NOERROR, 0))
1256 return err;
0ecb606c 1257
02f3550c
UD
1258 /* Go to the left node, or up and to the right. */
1259 if (node->left)
1260 node = node->left;
1261 else
1262 {
1263 bin_tree_t *prev = NULL;
1264 while (node->right == prev || node->right == NULL)
1265 {
1266 prev = node;
1267 node = node->parent;
1268 if (!node)
21f5de55 1269 return REG_NOERROR;
02f3550c
UD
1270 }
1271 node = node->right;
1272 }
1273 }
1274}
1275
1276/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1277 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1278 backreferences as well. Requires a preorder visit. */
0ecb606c 1279static reg_errcode_t
0fd8ae9c 1280optimize_subexps (void *extra, bin_tree_t *node)
0ecb606c 1281{
02f3550c 1282 re_dfa_t *dfa = (re_dfa_t *) extra;
0ecb606c 1283
02f3550c 1284 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3b0bdc72 1285 {
02f3550c
UD
1286 int idx = node->token.opr.idx;
1287 node->token.opr.idx = dfa->subexp_map[idx];
1288 dfa->used_bkref_map |= 1 << node->token.opr.idx;
0ecb606c 1289 }
02f3550c
UD
1290
1291 else if (node->token.type == SUBEXP
21f5de55 1292 && node->left && node->left->token.type == SUBEXP)
0ecb606c 1293 {
02f3550c
UD
1294 int other_idx = node->left->token.opr.idx;
1295
1296 node->left = node->left->left;
1297 if (node->left)
21f5de55 1298 node->left->parent = node;
02f3550c
UD
1299
1300 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
2c05d33f
UD
1301 if (other_idx < BITSET_WORD_BITS)
1302 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3b0bdc72 1303 }
02f3550c 1304
3b0bdc72
UD
1305 return REG_NOERROR;
1306}
1307
02f3550c
UD
1308/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1309 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1310static reg_errcode_t
0fd8ae9c 1311lower_subexps (void *extra, bin_tree_t *node)
3b0bdc72 1312{
02f3550c
UD
1313 regex_t *preg = (regex_t *) extra;
1314 reg_errcode_t err = REG_NOERROR;
3b0bdc72 1315
02f3550c 1316 if (node->left && node->left->token.type == SUBEXP)
0ecb606c 1317 {
02f3550c
UD
1318 node->left = lower_subexp (&err, preg, node->left);
1319 if (node->left)
1320 node->left->parent = node;
1321 }
1322 if (node->right && node->right->token.type == SUBEXP)
1323 {
1324 node->right = lower_subexp (&err, preg, node->right);
1325 if (node->right)
1326 node->right->parent = node;
3b0bdc72 1327 }
3b0bdc72 1328
02f3550c
UD
1329 return err;
1330}
3b0bdc72 1331
02f3550c 1332static bin_tree_t *
0fd8ae9c 1333lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
0ecb606c 1334{
02f3550c
UD
1335 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1336 bin_tree_t *body = node->left;
1337 bin_tree_t *op, *cls, *tree1, *tree;
1338
1339 if (preg->no_sub
744eb12b
UD
1340 /* We do not optimize empty subexpressions, because otherwise we may
1341 have bad CONCAT nodes with NULL children. This is obviously not
1342 very common, so we do not lose much. An example that triggers
1343 this case is the sed "script" /\(\)/x. */
1344 && node->left != NULL
2c05d33f
UD
1345 && (node->token.opr.idx >= BITSET_WORD_BITS
1346 || !(dfa->used_bkref_map
1347 & ((bitset_word_t) 1 << node->token.opr.idx))))
02f3550c
UD
1348 return node->left;
1349
1350 /* Convert the SUBEXP node to the concatenation of an
1351 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1352 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1353 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1354 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1355 tree = create_tree (dfa, op, tree1, CONCAT);
1356 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
0ecb606c 1357 {
02f3550c
UD
1358 *err = REG_ESPACE;
1359 return NULL;
0ecb606c 1360 }
3b0bdc72 1361
02f3550c
UD
1362 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1363 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1364 return tree;
1365}
a334319f 1366
02f3550c
UD
1367/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1368 nodes. Requires a postorder visit. */
1369static reg_errcode_t
0fd8ae9c 1370calc_first (void *extra, bin_tree_t *node)
02f3550c
UD
1371{
1372 re_dfa_t *dfa = (re_dfa_t *) extra;
1373 if (node->token.type == CONCAT)
1374 {
1375 node->first = node->left->first;
1376 node->node_idx = node->left->node_idx;
1377 }
1378 else
1379 {
1380 node->first = node;
1381 node->node_idx = re_dfa_add_node (dfa, node->token);
1382 if (BE (node->node_idx == -1, 0))
21f5de55 1383 return REG_ESPACE;
0caca71a 1384 if (node->token.type == ANCHOR)
21f5de55 1385 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
02f3550c
UD
1386 }
1387 return REG_NOERROR;
1388}
1389
1390/* Pass 2: compute NEXT on the tree. Preorder visit. */
1391static reg_errcode_t
0fd8ae9c 1392calc_next (void *extra, bin_tree_t *node)
02f3550c
UD
1393{
1394 switch (node->token.type)
3b0bdc72
UD
1395 {
1396 case OP_DUP_ASTERISK:
02f3550c 1397 node->left->next = node;
3b0bdc72
UD
1398 break;
1399 case CONCAT:
02f3550c
UD
1400 node->left->next = node->right->first;
1401 node->right->next = node->next;
1402 break;
3b0bdc72 1403 default:
02f3550c
UD
1404 if (node->left)
1405 node->left->next = node->next;
1406 if (node->right)
21f5de55 1407 node->right->next = node->next;
3b0bdc72
UD
1408 break;
1409 }
02f3550c 1410 return REG_NOERROR;
3b0bdc72
UD
1411}
1412
02f3550c
UD
1413/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1414static reg_errcode_t
0fd8ae9c 1415link_nfa_nodes (void *extra, bin_tree_t *node)
a334319f 1416{
02f3550c
UD
1417 re_dfa_t *dfa = (re_dfa_t *) extra;
1418 int idx = node->node_idx;
1419 reg_errcode_t err = REG_NOERROR;
1420
1421 switch (node->token.type)
3b0bdc72 1422 {
02f3550c
UD
1423 case CONCAT:
1424 break;
1425
1426 case END_OF_RE:
1427 assert (node->next == NULL);
1428 break;
1429
1430 case OP_DUP_ASTERISK:
1431 case OP_ALT:
1432 {
1433 int left, right;
1434 dfa->has_plural_match = 1;
1435 if (node->left != NULL)
1436 left = node->left->first->node_idx;
1437 else
1438 left = node->next->node_idx;
1439 if (node->right != NULL)
1440 right = node->right->first->node_idx;
1441 else
1442 right = node->next->node_idx;
1443 assert (left > -1);
1444 assert (right > -1);
1445 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1446 }
1447 break;
1448
1449 case ANCHOR:
1450 case OP_OPEN_SUBEXP:
1451 case OP_CLOSE_SUBEXP:
1452 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1453 break;
1454
1455 case OP_BACK_REF:
1456 dfa->nexts[idx] = node->next->node_idx;
1457 if (node->token.type == OP_BACK_REF)
2da42bc0 1458 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
02f3550c
UD
1459 break;
1460
1461 default:
1462 assert (!IS_EPSILON_NODE (node->token.type));
1463 dfa->nexts[idx] = node->next->node_idx;
1464 break;
3b0bdc72 1465 }
02f3550c
UD
1466
1467 return err;
3b0bdc72
UD
1468}
1469
485d775d
UD
1470/* Duplicate the epsilon closure of the node ROOT_NODE.
1471 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1472 to their own constraint. */
1473
1474static reg_errcode_t
9b0c3f38 1475internal_function
0fd8ae9c
UD
1476duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1477 int root_node, unsigned int init_constraint)
485d775d 1478{
485d775d
UD
1479 int org_node, clone_node, ret;
1480 unsigned int constraint = init_constraint;
1481 for (org_node = top_org_node, clone_node = top_clone_node;;)
1482 {
1483 int org_dest, clone_dest;
1484 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1485 {
485d775d
UD
1486 /* If the back reference epsilon-transit, its destination must
1487 also have the constraint. Then duplicate the epsilon closure
1488 of the destination of the back reference, and store it in
1489 edests of the back reference. */
15a7d175
UD
1490 org_dest = dfa->nexts[org_node];
1491 re_node_set_empty (dfa->edests + clone_node);
2d87db5b
UD
1492 clone_dest = duplicate_node (dfa, org_dest, constraint);
1493 if (BE (clone_dest == -1, 0))
1494 return REG_ESPACE;
15a7d175
UD
1495 dfa->nexts[clone_node] = dfa->nexts[org_node];
1496 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1497 if (BE (ret < 0, 0))
1498 return REG_ESPACE;
1499 }
485d775d 1500 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1501 {
485d775d
UD
1502 /* In case of the node can't epsilon-transit, don't duplicate the
1503 destination and store the original destination as the
1504 destination of the node. */
15a7d175
UD
1505 dfa->nexts[clone_node] = dfa->nexts[org_node];
1506 break;
1507 }
485d775d 1508 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1509 {
485d775d
UD
1510 /* In case of the node can epsilon-transit, and it has only one
1511 destination. */
15a7d175
UD
1512 org_dest = dfa->edests[org_node].elems[0];
1513 re_node_set_empty (dfa->edests + clone_node);
0caca71a
UD
1514 /* If the node is root_node itself, it means the epsilon clsoure
1515 has a loop. Then tie it to the destination of the root_node. */
1516 if (org_node == root_node && clone_node != org_node)
15a7d175 1517 {
0caca71a
UD
1518 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1519 if (BE (ret < 0, 0))
1520 return REG_ESPACE;
1521 break;
15a7d175 1522 }
0caca71a
UD
1523 /* In case of the node has another constraint, add it. */
1524 constraint |= dfa->nodes[org_node].constraint;
2d87db5b
UD
1525 clone_dest = duplicate_node (dfa, org_dest, constraint);
1526 if (BE (clone_dest == -1, 0))
1527 return REG_ESPACE;
15a7d175
UD
1528 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 if (BE (ret < 0, 0))
1530 return REG_ESPACE;
1531 }
485d775d 1532 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1533 {
485d775d 1534 /* In case of the node can epsilon-transit, and it has two
02f3550c 1535 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
15a7d175
UD
1536 org_dest = dfa->edests[org_node].elems[0];
1537 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1538 /* Search for a duplicated node which satisfies the constraint. */
1539 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1540 if (clone_dest == -1)
1541 {
0caca71a 1542 /* There is no such duplicated node, create a new one. */
2d87db5b
UD
1543 reg_errcode_t err;
1544 clone_dest = duplicate_node (dfa, org_dest, constraint);
1545 if (BE (clone_dest == -1, 0))
1546 return REG_ESPACE;
7de66108
UD
1547 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (ret < 0, 0))
1549 return REG_ESPACE;
1550 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1551 root_node, constraint);
1552 if (BE (err != REG_NOERROR, 0))
1553 return err;
a7d5c291
UD
1554 }
1555 else
1556 {
0caca71a 1557 /* There is a duplicated node which satisfies the constraint,
a7d5c291
UD
1558 use it to avoid infinite loop. */
1559 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 if (BE (ret < 0, 0))
1561 return REG_ESPACE;
1562 }
15a7d175
UD
1563
1564 org_dest = dfa->edests[org_node].elems[1];
2d87db5b
UD
1565 clone_dest = duplicate_node (dfa, org_dest, constraint);
1566 if (BE (clone_dest == -1, 0))
1567 return REG_ESPACE;
15a7d175
UD
1568 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1569 if (BE (ret < 0, 0))
1570 return REG_ESPACE;
1571 }
485d775d
UD
1572 org_node = org_dest;
1573 clone_node = clone_dest;
1574 }
1575 return REG_NOERROR;
1576}
1577
a7d5c291
UD
1578/* Search for a node which is duplicated from the node ORG_NODE, and
1579 satisfies the constraint CONSTRAINT. */
1580
1581static int
0fd8ae9c
UD
1582search_duplicated_node (const re_dfa_t *dfa, int org_node,
1583 unsigned int constraint)
a7d5c291
UD
1584{
1585 int idx;
1586 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1587 {
1588 if (org_node == dfa->org_indices[idx]
1589 && constraint == dfa->nodes[idx].constraint)
1590 return idx; /* Found. */
1591 }
1592 return -1; /* Not found. */
1593}
1594
a9388965 1595/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
2d87db5b
UD
1596 Return the index of the new node, or -1 if insufficient storage is
1597 available. */
a9388965 1598
2d87db5b 1599static int
0fd8ae9c 1600duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
3b0bdc72 1601{
02f3550c 1602 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
2d87db5b
UD
1603 if (BE (dup_idx != -1, 1))
1604 {
1605 dfa->nodes[dup_idx].constraint = constraint;
0caca71a 1606 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
2d87db5b
UD
1607 dfa->nodes[dup_idx].duplicated = 1;
1608
1609 /* Store the index of the original node. */
1610 dfa->org_indices[dup_idx] = org_idx;
1611 }
1612 return dup_idx;
3b0bdc72
UD
1613}
1614
963d8d78 1615static reg_errcode_t
0fd8ae9c 1616calc_inveclosure (re_dfa_t *dfa)
3b0bdc72 1617{
963d8d78
UD
1618 int src, idx, ret;
1619 for (idx = 0; idx < dfa->nodes_len; ++idx)
1620 re_node_set_init_empty (dfa->inveclosures + idx);
1621
3b0bdc72
UD
1622 for (src = 0; src < dfa->nodes_len; ++src)
1623 {
963d8d78 1624 int *elems = dfa->eclosures[src].elems;
3b0bdc72 1625 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175 1626 {
963d8d78
UD
1627 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1628 if (BE (ret == -1, 0))
1629 return REG_ESPACE;
15a7d175 1630 }
3b0bdc72 1631 }
963d8d78
UD
1632
1633 return REG_NOERROR;
3b0bdc72
UD
1634}
1635
1636/* Calculate "eclosure" for all the node in DFA. */
1637
1638static reg_errcode_t
0fd8ae9c 1639calc_eclosure (re_dfa_t *dfa)
3b0bdc72 1640{
485d775d 1641 int node_idx, incomplete;
3b0bdc72
UD
1642#ifdef DEBUG
1643 assert (dfa->nodes_len > 0);
1644#endif
485d775d 1645 incomplete = 0;
3b0bdc72 1646 /* For each nodes, calculate epsilon closure. */
485d775d 1647 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1648 {
a9388965 1649 reg_errcode_t err;
3b0bdc72 1650 re_node_set eclosure_elem;
485d775d 1651 if (node_idx == dfa->nodes_len)
15a7d175
UD
1652 {
1653 if (!incomplete)
1654 break;
1655 incomplete = 0;
1656 node_idx = 0;
1657 }
3b0bdc72
UD
1658
1659#ifdef DEBUG
3b0bdc72
UD
1660 assert (dfa->eclosures[node_idx].nelem != -1);
1661#endif
c06a6956 1662
3b0bdc72
UD
1663 /* If we have already calculated, skip it. */
1664 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1665 continue;
3b0bdc72 1666 /* Calculate epsilon closure of `node_idx'. */
a9388965 1667 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
bc15410e 1668 if (BE (err != REG_NOERROR, 0))
15a7d175 1669 return err;
3b0bdc72
UD
1670
1671 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175
UD
1672 {
1673 incomplete = 1;
1674 re_node_set_free (&eclosure_elem);
1675 }
3b0bdc72 1676 }
3b0bdc72
UD
1677 return REG_NOERROR;
1678}
1679
1680/* Calculate epsilon closure of NODE. */
1681
a9388965 1682static reg_errcode_t
0fd8ae9c 1683calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
3b0bdc72 1684{
a9388965 1685 reg_errcode_t err;
21f5de55 1686 int i;
3b0bdc72 1687 re_node_set eclosure;
21f5de55
PE
1688 int ret;
1689 int incomplete = 0;
a9388965 1690 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1691 if (BE (err != REG_NOERROR, 0))
a9388965 1692 return err;
3b0bdc72
UD
1693
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa->eclosures[node].nelem = -1;
1697
0caca71a
UD
1698 /* If the current node has constraints, duplicate all nodes
1699 since they must inherit the constraints. */
1700 if (dfa->nodes[node].constraint
b4ae56bd
UD
1701 && dfa->edests[node].nelem
1702 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
485d775d 1703 {
0caca71a
UD
1704 err = duplicate_node_closure (dfa, node, node, node,
1705 dfa->nodes[node].constraint);
485d775d 1706 if (BE (err != REG_NOERROR, 0))
15a7d175 1707 return err;
485d775d 1708 }
3b0bdc72
UD
1709
1710 /* Expand each epsilon destination nodes. */
485d775d 1711 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1712 for (i = 0; i < dfa->edests[node].nelem; ++i)
1713 {
15a7d175
UD
1714 re_node_set eclosure_elem;
1715 int edest = dfa->edests[node].elems[i];
1716 /* If calculating the epsilon closure of `edest' is in progress,
1717 return intermediate result. */
1718 if (dfa->eclosures[edest].nelem == -1)
1719 {
1720 incomplete = 1;
1721 continue;
1722 }
1723 /* If we haven't calculated the epsilon closure of `edest' yet,
1724 calculate now. Otherwise use calculated epsilon closure. */
1725 if (dfa->eclosures[edest].nelem == 0)
1726 {
1727 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1728 if (BE (err != REG_NOERROR, 0))
1729 return err;
1730 }
1731 else
1732 eclosure_elem = dfa->eclosures[edest];
1733 /* Merge the epsilon closure of `edest'. */
2da42bc0
UD
1734 err = re_node_set_merge (&eclosure, &eclosure_elem);
1735 if (BE (err != REG_NOERROR, 0))
1736 return err;
15a7d175
UD
1737 /* If the epsilon closure of `edest' is incomplete,
1738 the epsilon closure of this node is also incomplete. */
1739 if (dfa->eclosures[edest].nelem == 0)
1740 {
1741 incomplete = 1;
1742 re_node_set_free (&eclosure_elem);
1743 }
3b0bdc72
UD
1744 }
1745
21f5de55
PE
1746 /* An epsilon closure includes itself. */
1747 ret = re_node_set_insert (&eclosure, node);
1748 if (BE (ret < 0, 0))
1749 return REG_ESPACE;
3b0bdc72
UD
1750 if (incomplete && !root)
1751 dfa->eclosures[node].nelem = 0;
1752 else
1753 dfa->eclosures[node] = eclosure;
a9388965
UD
1754 *new_set = eclosure;
1755 return REG_NOERROR;
3b0bdc72
UD
1756}
1757\f
1758/* Functions for token which are used in the parser. */
1759
1760/* Fetch a token from INPUT.
1761 We must not use this function inside bracket expressions. */
1762
f0d77aa8 1763static void
9b0c3f38 1764internal_function
0fd8ae9c 1765fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
3b0bdc72 1766{
f0d77aa8 1767 re_string_skip_bytes (input, peek_token (result, input, syntax));
3b0bdc72
UD
1768}
1769
1770/* Peek a token from INPUT, and return the length of the token.
1771 We must not use this function inside bracket expressions. */
1772
1773static int
9b0c3f38 1774internal_function
0fd8ae9c 1775peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
1776{
1777 unsigned char c;
1778
1779 if (re_string_eoi (input))
1780 {
1781 token->type = END_OF_RE;
1782 return 0;
1783 }
1784
1785 c = re_string_peek_byte (input, 0);
1786 token->opr.c = c;
1787
65e6becf 1788 token->word_char = 0;
3b0bdc72
UD
1789#ifdef RE_ENABLE_I18N
1790 token->mb_partial = 0;
3c0fb574 1791 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1792 !re_string_first_byte (input, re_string_cur_idx (input)))
1793 {
1794 token->type = CHARACTER;
1795 token->mb_partial = 1;
1796 return 1;
1797 }
1798#endif
1799 if (c == '\\')
1800 {
1801 unsigned char c2;
1802 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1803 {
1804 token->type = BACK_SLASH;
1805 return 1;
1806 }
3b0bdc72
UD
1807
1808 c2 = re_string_peek_byte_case (input, 1);
1809 token->opr.c = c2;
1810 token->type = CHARACTER;
65e6becf
UD
1811#ifdef RE_ENABLE_I18N
1812 if (input->mb_cur_max > 1)
1813 {
1814 wint_t wc = re_string_wchar_at (input,
1815 re_string_cur_idx (input) + 1);
1816 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1817 }
1818 else
1819#endif
1820 token->word_char = IS_WORD_CHAR (c2) != 0;
1821
3b0bdc72 1822 switch (c2)
15a7d175
UD
1823 {
1824 case '|':
1825 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1826 token->type = OP_ALT;
1827 break;
1828 case '1': case '2': case '3': case '4': case '5':
1829 case '6': case '7': case '8': case '9':
1830 if (!(syntax & RE_NO_BK_REFS))
1831 {
1832 token->type = OP_BACK_REF;
ae73c6c1 1833 token->opr.idx = c2 - '1';
15a7d175
UD
1834 }
1835 break;
1836 case '<':
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 {
1839 token->type = ANCHOR;
bb3f4825 1840 token->opr.ctx_type = WORD_FIRST;
15a7d175
UD
1841 }
1842 break;
1843 case '>':
1844 if (!(syntax & RE_NO_GNU_OPS))
1845 {
1846 token->type = ANCHOR;
bb3f4825 1847 token->opr.ctx_type = WORD_LAST;
15a7d175
UD
1848 }
1849 break;
1850 case 'b':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 {
1853 token->type = ANCHOR;
bb3f4825 1854 token->opr.ctx_type = WORD_DELIM;
15a7d175
UD
1855 }
1856 break;
1857 case 'B':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 {
1860 token->type = ANCHOR;
24992143 1861 token->opr.ctx_type = NOT_WORD_DELIM;
15a7d175
UD
1862 }
1863 break;
1864 case 'w':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_WORD;
1867 break;
1868 case 'W':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_NOTWORD;
1871 break;
e2b6bfa3
UD
1872 case 's':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_SPACE;
1875 break;
1876 case 'S':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_NOTSPACE;
1879 break;
15a7d175
UD
1880 case '`':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 {
1883 token->type = ANCHOR;
bb3f4825 1884 token->opr.ctx_type = BUF_FIRST;
15a7d175
UD
1885 }
1886 break;
1887 case '\'':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 {
1890 token->type = ANCHOR;
bb3f4825 1891 token->opr.ctx_type = BUF_LAST;
15a7d175
UD
1892 }
1893 break;
1894 case '(':
1895 if (!(syntax & RE_NO_BK_PARENS))
1896 token->type = OP_OPEN_SUBEXP;
1897 break;
1898 case ')':
1899 if (!(syntax & RE_NO_BK_PARENS))
1900 token->type = OP_CLOSE_SUBEXP;
1901 break;
1902 case '+':
1903 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904 token->type = OP_DUP_PLUS;
1905 break;
1906 case '?':
1907 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908 token->type = OP_DUP_QUESTION;
1909 break;
1910 case '{':
1911 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912 token->type = OP_OPEN_DUP_NUM;
1913 break;
1914 case '}':
1915 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916 token->type = OP_CLOSE_DUP_NUM;
1917 break;
1918 default:
1919 break;
1920 }
3b0bdc72
UD
1921 return 2;
1922 }
1923
1924 token->type = CHARACTER;
65e6becf
UD
1925#ifdef RE_ENABLE_I18N
1926 if (input->mb_cur_max > 1)
1927 {
1928 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1929 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1930 }
1931 else
1932#endif
1933 token->word_char = IS_WORD_CHAR (token->opr.c);
1934
3b0bdc72
UD
1935 switch (c)
1936 {
1937 case '\n':
1938 if (syntax & RE_NEWLINE_ALT)
15a7d175 1939 token->type = OP_ALT;
3b0bdc72
UD
1940 break;
1941 case '|':
1942 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1943 token->type = OP_ALT;
3b0bdc72
UD
1944 break;
1945 case '*':
1946 token->type = OP_DUP_ASTERISK;
1947 break;
1948 case '+':
1949 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1950 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1951 break;
1952 case '?':
1953 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1954 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1955 break;
1956 case '{':
1957 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1958 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1959 break;
1960 case '}':
1961 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1962 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1963 break;
1964 case '(':
1965 if (syntax & RE_NO_BK_PARENS)
15a7d175 1966 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1967 break;
1968 case ')':
1969 if (syntax & RE_NO_BK_PARENS)
15a7d175 1970 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1971 break;
1972 case '[':
1973 token->type = OP_OPEN_BRACKET;
1974 break;
1975 case '.':
1976 token->type = OP_PERIOD;
1977 break;
1978 case '^':
134abcb5 1979 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
15a7d175
UD
1980 re_string_cur_idx (input) != 0)
1981 {
1982 char prev = re_string_peek_byte (input, -1);
134abcb5 1983 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1984 break;
1985 }
3b0bdc72 1986 token->type = ANCHOR;
bb3f4825 1987 token->opr.ctx_type = LINE_FIRST;
3b0bdc72
UD
1988 break;
1989 case '$':
1990 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
15a7d175
UD
1991 re_string_cur_idx (input) + 1 != re_string_length (input))
1992 {
1993 re_token_t next;
1994 re_string_skip_bytes (input, 1);
1995 peek_token (&next, input, syntax);
1996 re_string_skip_bytes (input, -1);
1997 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1998 break;
1999 }
3b0bdc72 2000 token->type = ANCHOR;
bb3f4825 2001 token->opr.ctx_type = LINE_LAST;
3b0bdc72
UD
2002 break;
2003 default:
2004 break;
2005 }
2006 return 1;
2007}
2008
2009/* Peek a token from INPUT, and return the length of the token.
2010 We must not use this function out of bracket expressions. */
2011
2012static int
9b0c3f38 2013internal_function
0fd8ae9c 2014peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
2015{
2016 unsigned char c;
2017 if (re_string_eoi (input))
2018 {
2019 token->type = END_OF_RE;
2020 return 0;
2021 }
2022 c = re_string_peek_byte (input, 0);
2023 token->opr.c = c;
2024
2025#ifdef RE_ENABLE_I18N
3c0fb574 2026 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
2027 !re_string_first_byte (input, re_string_cur_idx (input)))
2028 {
2029 token->type = CHARACTER;
2030 return 1;
2031 }
2032#endif /* RE_ENABLE_I18N */
2033
294b6bcc
UD
2034 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2035 && re_string_cur_idx (input) + 1 < re_string_length (input))
3b0bdc72
UD
2036 {
2037 /* In this case, '\' escape a character. */
2038 unsigned char c2;
82bbb29e
UD
2039 re_string_skip_bytes (input, 1);
2040 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
2041 token->opr.c = c2;
2042 token->type = CHARACTER;
2043 return 1;
2044 }
2045 if (c == '[') /* '[' is a special char in a bracket exps. */
2046 {
2047 unsigned char c2;
2048 int token_len;
294b6bcc
UD
2049 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2050 c2 = re_string_peek_byte (input, 1);
2051 else
2052 c2 = 0;
3b0bdc72
UD
2053 token->opr.c = c2;
2054 token_len = 2;
2055 switch (c2)
15a7d175
UD
2056 {
2057 case '.':
2058 token->type = OP_OPEN_COLL_ELEM;
2059 break;
2060 case '=':
2061 token->type = OP_OPEN_EQUIV_CLASS;
2062 break;
2063 case ':':
2064 if (syntax & RE_CHAR_CLASSES)
2065 {
2066 token->type = OP_OPEN_CHAR_CLASS;
2067 break;
2068 }
2069 /* else fall through. */
2070 default:
2071 token->type = CHARACTER;
2072 token->opr.c = c;
2073 token_len = 1;
2074 break;
2075 }
3b0bdc72
UD
2076 return token_len;
2077 }
2078 switch (c)
2079 {
2080 case '-':
2081 token->type = OP_CHARSET_RANGE;
2082 break;
2083 case ']':
2084 token->type = OP_CLOSE_BRACKET;
2085 break;
2086 case '^':
2087 token->type = OP_NON_MATCH_LIST;
2088 break;
2089 default:
2090 token->type = CHARACTER;
2091 }
2092 return 1;
2093}
2094\f
2095/* Functions for parser. */
2096
2097/* Entry point of the parser.
2098 Parse the regular expression REGEXP and return the structure tree.
2099 If an error is occured, ERR is set by error code, and return NULL.
2100 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
2101 CAT
2102 / \
2103 / \
3b0bdc72
UD
2104 <reg_exp> EOR
2105
2106 CAT means concatenation.
2107 EOR means end of regular expression. */
2108
2109static bin_tree_t *
0fd8ae9c
UD
2110parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2111 reg_errcode_t *err)
3b0bdc72
UD
2112{
2113 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2114 bin_tree_t *tree, *eor, *root;
2115 re_token_t current_token;
56b168be 2116 dfa->syntax = syntax;
f0d77aa8 2117 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2118 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 2119 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72 2120 return NULL;
02f3550c 2121 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
3b0bdc72 2122 if (tree != NULL)
02f3550c 2123 root = create_tree (dfa, tree, eor, CONCAT);
3b0bdc72
UD
2124 else
2125 root = eor;
ee70274a 2126 if (BE (eor == NULL || root == NULL, 0))
485d775d
UD
2127 {
2128 *err = REG_ESPACE;
2129 return NULL;
2130 }
3b0bdc72
UD
2131 return root;
2132}
2133
2134/* This function build the following tree, from regular expression
2135 <branch1>|<branch2>:
15a7d175
UD
2136 ALT
2137 / \
2138 / \
3b0bdc72
UD
2139 <branch1> <branch2>
2140
2141 ALT means alternative, which represents the operator `|'. */
2142
2143static bin_tree_t *
0fd8ae9c
UD
2144parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2145 reg_syntax_t syntax, int nest, reg_errcode_t *err)
3b0bdc72
UD
2146{
2147 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2148 bin_tree_t *tree, *branch = NULL;
3b0bdc72 2149 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 2150 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2151 return NULL;
2152
2153 while (token->type == OP_ALT)
2154 {
f0d77aa8 2155 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2156 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
2157 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2158 {
2159 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2160 if (BE (*err != REG_NOERROR && branch == NULL, 0))
aa6ec754
AS
2161 {
2162 if (tree != NULL)
2163 postorder (tree, free_tree, NULL);
2164 return NULL;
2165 }
15a7d175 2166 }
9b88fc16
UD
2167 else
2168 branch = NULL;
02f3550c 2169 tree = create_tree (dfa, tree, branch, OP_ALT);
ee70274a 2170 if (BE (tree == NULL, 0))
15a7d175
UD
2171 {
2172 *err = REG_ESPACE;
2173 return NULL;
2174 }
3b0bdc72
UD
2175 }
2176 return tree;
2177}
2178
2179/* This function build the following tree, from regular expression
2180 <exp1><exp2>:
15a7d175
UD
2181 CAT
2182 / \
3b0bdc72
UD
2183 / \
2184 <exp1> <exp2>
2185
2186 CAT means concatenation. */
2187
2188static bin_tree_t *
0fd8ae9c
UD
2189parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190 reg_syntax_t syntax, int nest, reg_errcode_t *err)
3b0bdc72
UD
2191{
2192 bin_tree_t *tree, *exp;
ee70274a 2193 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3b0bdc72 2194 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2195 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2196 return NULL;
2197
2198 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 2199 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72
UD
2200 {
2201 exp = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2202 if (BE (*err != REG_NOERROR && exp == NULL, 0))
15a7d175 2203 {
b833d51f
UD
2204 if (tree != NULL)
2205 postorder (tree, free_tree, NULL);
15a7d175
UD
2206 return NULL;
2207 }
3b0bdc72 2208 if (tree != NULL && exp != NULL)
15a7d175 2209 {
e9b9cbf5
UD
2210 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2211 if (newtree == NULL)
15a7d175 2212 {
e9b9cbf5
UD
2213 postorder (exp, free_tree, NULL);
2214 postorder (tree, free_tree, NULL);
15a7d175
UD
2215 *err = REG_ESPACE;
2216 return NULL;
2217 }
e9b9cbf5 2218 tree = newtree;
15a7d175 2219 }
3b0bdc72 2220 else if (tree == NULL)
15a7d175 2221 tree = exp;
3b0bdc72
UD
2222 /* Otherwise exp == NULL, we don't need to create new tree. */
2223 }
2224 return tree;
2225}
2226
2227/* This function build the following tree, from regular expression a*:
15a7d175
UD
2228 *
2229 |
2230 a
3b0bdc72
UD
2231*/
2232
2233static bin_tree_t *
0fd8ae9c
UD
2234parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2235 reg_syntax_t syntax, int nest, reg_errcode_t *err)
3b0bdc72
UD
2236{
2237 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2238 bin_tree_t *tree;
3b0bdc72
UD
2239 switch (token->type)
2240 {
2241 case CHARACTER:
02f3550c 2242 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2243 if (BE (tree == NULL, 0))
15a7d175
UD
2244 {
2245 *err = REG_ESPACE;
2246 return NULL;
2247 }
3b0bdc72 2248#ifdef RE_ENABLE_I18N
3c0fb574 2249 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2250 {
2251 while (!re_string_eoi (regexp)
2252 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2253 {
15a7d175 2254 bin_tree_t *mbc_remain;
f0d77aa8 2255 fetch_token (token, regexp, syntax);
02f3550c
UD
2256 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2257 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
ee70274a 2258 if (BE (mbc_remain == NULL || tree == NULL, 0))
54e1cabc
UD
2259 {
2260 *err = REG_ESPACE;
2261 return NULL;
2262 }
15a7d175 2263 }
3b0bdc72
UD
2264 }
2265#endif
2266 break;
2267 case OP_OPEN_SUBEXP:
2268 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 2269 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2270 return NULL;
3b0bdc72
UD
2271 break;
2272 case OP_OPEN_BRACKET:
2273 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 2274 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2275 return NULL;
3b0bdc72
UD
2276 break;
2277 case OP_BACK_REF:
ae73c6c1 2278 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
15a7d175
UD
2279 {
2280 *err = REG_ESUBREG;
2281 return NULL;
2282 }
ae73c6c1 2283 dfa->used_bkref_map |= 1 << token->opr.idx;
02f3550c 2284 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2285 if (BE (tree == NULL, 0))
15a7d175
UD
2286 {
2287 *err = REG_ESPACE;
2288 return NULL;
2289 }
3b0bdc72
UD
2290 ++dfa->nbackref;
2291 dfa->has_mb_node = 1;
2292 break;
06e8303a
UD
2293 case OP_OPEN_DUP_NUM:
2294 if (syntax & RE_CONTEXT_INVALID_DUP)
2295 {
2296 *err = REG_BADRPT;
2297 return NULL;
2298 }
2299 /* FALLTHROUGH */
3b0bdc72
UD
2300 case OP_DUP_ASTERISK:
2301 case OP_DUP_PLUS:
2302 case OP_DUP_QUESTION:
3b0bdc72 2303 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2304 {
2305 *err = REG_BADRPT;
2306 return NULL;
2307 }
3b0bdc72 2308 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175 2309 {
f0d77aa8 2310 fetch_token (token, regexp, syntax);
15a7d175
UD
2311 return parse_expression (regexp, preg, token, syntax, nest, err);
2312 }
3b0bdc72
UD
2313 /* else fall through */
2314 case OP_CLOSE_SUBEXP:
2315 if ((token->type == OP_CLOSE_SUBEXP) &&
15a7d175
UD
2316 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2317 {
2318 *err = REG_ERPAREN;
2319 return NULL;
2320 }
3b0bdc72
UD
2321 /* else fall through */
2322 case OP_CLOSE_DUP_NUM:
2323 /* We treat it as a normal character. */
2324
2325 /* Then we can these characters as normal characters. */
2326 token->type = CHARACTER;
65e6becf
UD
2327 /* mb_partial and word_char bits should be initialized already
2328 by peek_token. */
02f3550c 2329 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2330 if (BE (tree == NULL, 0))
15a7d175
UD
2331 {
2332 *err = REG_ESPACE;
2333 return NULL;
2334 }
3b0bdc72
UD
2335 break;
2336 case ANCHOR:
bb3f4825 2337 if ((token->opr.ctx_type
24992143 2338 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
56b168be
UD
2339 && dfa->word_ops_used == 0)
2340 init_word_char (dfa);
24992143 2341 if (token->opr.ctx_type == WORD_DELIM
21f5de55 2342 || token->opr.ctx_type == NOT_WORD_DELIM)
15a7d175
UD
2343 {
2344 bin_tree_t *tree_first, *tree_last;
24992143
UD
2345 if (token->opr.ctx_type == WORD_DELIM)
2346 {
2347 token->opr.ctx_type = WORD_FIRST;
02f3550c 2348 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2349 token->opr.ctx_type = WORD_LAST;
21f5de55
PE
2350 }
2351 else
2352 {
24992143 2353 token->opr.ctx_type = INSIDE_WORD;
02f3550c 2354 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2355 token->opr.ctx_type = INSIDE_NOTWORD;
21f5de55 2356 }
02f3550c
UD
2357 tree_last = create_token_tree (dfa, NULL, NULL, token);
2358 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
ee70274a 2359 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
15a7d175
UD
2360 {
2361 *err = REG_ESPACE;
2362 return NULL;
2363 }
2364 }
3b0bdc72 2365 else
15a7d175 2366 {
02f3550c 2367 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2368 if (BE (tree == NULL, 0))
54e1cabc
UD
2369 {
2370 *err = REG_ESPACE;
2371 return NULL;
2372 }
15a7d175 2373 }
3b0bdc72 2374 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2375 by repetition operators.
2376 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2377 it must not be "<ANCHOR(^)><REPEAT(*)>". */
f0d77aa8 2378 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2379 return tree;
2380 case OP_PERIOD:
02f3550c 2381 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2382 if (BE (tree == NULL, 0))
15a7d175
UD
2383 {
2384 *err = REG_ESPACE;
2385 return NULL;
2386 }
3c0fb574 2387 if (dfa->mb_cur_max > 1)
15a7d175 2388 dfa->has_mb_node = 1;
3b0bdc72
UD
2389 break;
2390 case OP_WORD:
3b0bdc72 2391 case OP_NOTWORD:
266c1f50
UD
2392 tree = build_charclass_op (dfa, regexp->trans,
2393 (const unsigned char *) "alnum",
2394 (const unsigned char *) "_",
2395 token->type == OP_NOTWORD, err);
e2b6bfa3
UD
2396 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2397 return NULL;
2398 break;
2399 case OP_SPACE:
e2b6bfa3 2400 case OP_NOTSPACE:
266c1f50
UD
2401 tree = build_charclass_op (dfa, regexp->trans,
2402 (const unsigned char *) "space",
2403 (const unsigned char *) "",
2404 token->type == OP_NOTSPACE, err);
bc15410e 2405 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2406 return NULL;
3b0bdc72
UD
2407 break;
2408 case OP_ALT:
2409 case END_OF_RE:
2410 return NULL;
2411 case BACK_SLASH:
2412 *err = REG_EESCAPE;
2413 return NULL;
2414 default:
2415 /* Must not happen? */
2416#ifdef DEBUG
2417 assert (0);
2418#endif
2419 return NULL;
2420 }
f0d77aa8 2421 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2422
2423 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2424 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72 2425 {
4d43ef1e
AS
2426 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2427 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2428 {
2429 if (tree != NULL)
2430 postorder (tree, free_tree, NULL);
2431 return NULL;
2432 }
2433 tree = dup_tree;
c34bfc8d
UD
2434 /* In BRE consecutive duplications are not allowed. */
2435 if ((syntax & RE_CONTEXT_INVALID_DUP)
2436 && (token->type == OP_DUP_ASTERISK
2437 || token->type == OP_OPEN_DUP_NUM))
2438 {
4d43ef1e
AS
2439 if (tree != NULL)
2440 postorder (tree, free_tree, NULL);
c34bfc8d
UD
2441 *err = REG_BADRPT;
2442 return NULL;
2443 }
3b0bdc72
UD
2444 }
2445
2446 return tree;
2447}
2448
2449/* This function build the following tree, from regular expression
2450 (<reg_exp>):
15a7d175
UD
2451 SUBEXP
2452 |
2453 <reg_exp>
3b0bdc72
UD
2454*/
2455
2456static bin_tree_t *
0fd8ae9c
UD
2457parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2458 reg_syntax_t syntax, int nest, reg_errcode_t *err)
3b0bdc72
UD
2459{
2460 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02f3550c 2461 bin_tree_t *tree;
3b0bdc72
UD
2462 size_t cur_nsub;
2463 cur_nsub = preg->re_nsub++;
81c64d40 2464
f0d77aa8 2465 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2466
2467 /* The subexpression may be a null string. */
2468 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2469 tree = NULL;
3b0bdc72
UD
2470 else
2471 {
2472 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
02f3550c 2473 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
a129c80d
UD
2474 {
2475 if (tree != NULL)
e9b9cbf5 2476 postorder (tree, free_tree, NULL);
a129c80d
UD
2477 *err = REG_EPAREN;
2478 }
02f3550c 2479 if (BE (*err != REG_NOERROR, 0))
15a7d175 2480 return NULL;
3b0bdc72 2481 }
01ed6ceb
UD
2482
2483 if (cur_nsub <= '9' - '1')
2484 dfa->completed_bkref_map |= 1 << cur_nsub;
02f3550c
UD
2485
2486 tree = create_tree (dfa, tree, NULL, SUBEXP);
2487 if (BE (tree == NULL, 0))
485d775d
UD
2488 {
2489 *err = REG_ESPACE;
2490 return NULL;
2491 }
02f3550c 2492 tree->token.opr.idx = cur_nsub;
3b0bdc72
UD
2493 return tree;
2494}
2495
2496/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2497
2498static bin_tree_t *
0fd8ae9c
UD
2499parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2500 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72 2501{
20dc2f79 2502 bin_tree_t *tree = NULL, *old_tree = NULL;
6b6557e8 2503 int i, start, end, start_idx = re_string_cur_idx (regexp);
3b0bdc72 2504 re_token_t start_token = *token;
6b6557e8 2505
3b0bdc72
UD
2506 if (token->type == OP_OPEN_DUP_NUM)
2507 {
6b6557e8
UD
2508 end = 0;
2509 start = fetch_number (regexp, token, syntax);
3b0bdc72 2510 if (start == -1)
15a7d175
UD
2511 {
2512 if (token->type == CHARACTER && token->opr.c == ',')
2513 start = 0; /* We treat "{,m}" as "{0,m}". */
2514 else
2515 {
2516 *err = REG_BADBR; /* <re>{} is invalid. */
2517 return NULL;
2518 }
2519 }
602c2f9d 2520 if (BE (start != -2, 1))
15a7d175
UD
2521 {
2522 /* We treat "{n}" as "{n,n}". */
2523 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2524 : ((token->type == CHARACTER && token->opr.c == ',')
2525 ? fetch_number (regexp, token, syntax) : -2));
2526 }
602c2f9d 2527 if (BE (start == -2 || end == -2, 0))
15a7d175
UD
2528 {
2529 /* Invalid sequence. */
6b6557e8
UD
2530 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2531 {
2532 if (token->type == END_OF_RE)
2533 *err = REG_EBRACE;
2534 else
2535 *err = REG_BADBR;
2536
2537 return NULL;
2538 }
2539
2540 /* If the syntax bit is set, rollback. */
2541 re_string_set_index (regexp, start_idx);
2542 *token = start_token;
2543 token->type = CHARACTER;
2544 /* mb_partial and word_char bits should be already initialized by
2545 peek_token. */
2546 return elem;
15a7d175 2547 }
6b6557e8 2548
52db8039 2549 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
15a7d175 2550 {
6b6557e8
UD
2551 /* First number greater than second. */
2552 *err = REG_BADBR;
15a7d175
UD
2553 return NULL;
2554 }
6b6557e8
UD
2555 }
2556 else
2557 {
2558 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2559 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2560 }
2561
20dc2f79
UD
2562 fetch_token (token, regexp, syntax);
2563
02f3550c 2564 if (BE (elem == NULL, 0))
20dc2f79 2565 return NULL;
02f3550c
UD
2566 if (BE (start == 0 && end == 0, 0))
2567 {
2568 postorder (elem, free_tree, NULL);
2569 return NULL;
2570 }
602c2f9d 2571
6b6557e8 2572 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
20dc2f79 2573 if (BE (start > 0, 0))
6b6557e8
UD
2574 {
2575 tree = elem;
2576 for (i = 2; i <= start; ++i)
7de66108 2577 {
6b6557e8 2578 elem = duplicate_tree (elem, dfa);
02f3550c 2579 tree = create_tree (dfa, tree, elem, CONCAT);
6b6557e8 2580 if (BE (elem == NULL || tree == NULL, 0))
7de66108
UD
2581 goto parse_dup_op_espace;
2582 }
6b6557e8 2583
20dc2f79
UD
2584 if (start == end)
2585 return tree;
a96c63ed 2586
20dc2f79
UD
2587 /* Duplicate ELEM before it is marked optional. */
2588 elem = duplicate_tree (elem, dfa);
7ee03f00
JM
2589 if (BE (elem == NULL, 0))
2590 goto parse_dup_op_espace;
20dc2f79
UD
2591 old_tree = tree;
2592 }
2593 else
2594 old_tree = NULL;
2595
02f3550c
UD
2596 if (elem->token.type == SUBEXP)
2597 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2598
2599 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
20dc2f79
UD
2600 if (BE (tree == NULL, 0))
2601 goto parse_dup_op_espace;
2602
2603 /* This loop is actually executed only when end != -1,
2604 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2605 already created the start+1-th copy. */
2606 for (i = start + 2; i <= end; ++i)
2607 {
2608 elem = duplicate_tree (elem, dfa);
02f3550c 2609 tree = create_tree (dfa, tree, elem, CONCAT);
6b6557e8 2610 if (BE (elem == NULL || tree == NULL, 0))
21f5de55 2611 goto parse_dup_op_espace;
6b6557e8 2612
02f3550c 2613 tree = create_tree (dfa, tree, NULL, OP_ALT);
20dc2f79 2614 if (BE (tree == NULL, 0))
21f5de55 2615 goto parse_dup_op_espace;
3b0bdc72 2616 }
6b6557e8 2617
20dc2f79 2618 if (old_tree)
02f3550c 2619 tree = create_tree (dfa, old_tree, tree, CONCAT);
20dc2f79 2620
3b0bdc72
UD
2621 return tree;
2622
2623 parse_dup_op_espace:
3b0bdc72
UD
2624 *err = REG_ESPACE;
2625 return NULL;
3b0bdc72
UD
2626}
2627
2628/* Size of the names for collating symbol/equivalence_class/character_class.
2629 I'm not sure, but maybe enough. */
2630#define BRACKET_NAME_BUF_SIZE 32
2631
434d3784
UD
2632#ifndef _LIBC
2633 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2634 Build the range expression which starts from START_ELEM, and ends
2635 at END_ELEM. The result are written to MBCSET and SBCSET.
2636 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2637 mbcset->range_ends, is a pointer argument sinse we may
2638 update it. */
2639
2640static reg_errcode_t
9b0c3f38 2641internal_function
c0a0f9a3 2642# ifdef RE_ENABLE_I18N
0fd8ae9c
UD
2643build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2644 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
c0a0f9a3 2645# else /* not RE_ENABLE_I18N */
0fd8ae9c
UD
2646build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2647 bracket_elem_t *end_elem)
c0a0f9a3 2648# endif /* not RE_ENABLE_I18N */
434d3784
UD
2649{
2650 unsigned int start_ch, end_ch;
2651 /* Equivalence Classes and Character Classes can't be a range start/end. */
2652 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2653 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2654 0))
434d3784
UD
2655 return REG_ERANGE;
2656
2657 /* We can handle no multi character collating elements without libc
2658 support. */
62439eac 2659 if (BE ((start_elem->type == COLL_SYM
15a7d175
UD
2660 && strlen ((char *) start_elem->opr.name) > 1)
2661 || (end_elem->type == COLL_SYM
2662 && strlen ((char *) end_elem->opr.name) > 1), 0))
434d3784
UD
2663 return REG_ECOLLATE;
2664
2665# ifdef RE_ENABLE_I18N
2666 {
2d87db5b
UD
2667 wchar_t wc;
2668 wint_t start_wc;
2669 wint_t end_wc;
434d3784
UD
2670 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2671
2672 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2673 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2674 : 0));
434d3784 2675 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2676 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2677 : 0));
434d3784 2678 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
15a7d175 2679 ? __btowc (start_ch) : start_elem->opr.wch);
434d3784 2680 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
15a7d175 2681 ? __btowc (end_ch) : end_elem->opr.wch);
4bb333cd
UD
2682 if (start_wc == WEOF || end_wc == WEOF)
2683 return REG_ECOLLATE;
434d3784
UD
2684 cmp_buf[0] = start_wc;
2685 cmp_buf[4] = end_wc;
2686 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2687 return REG_ERANGE;
2688
10677727
UD
2689 /* Got valid collation sequence values, add them as a new entry.
2690 However, for !_LIBC we have no collation elements: if the
2691 character set is single byte, the single byte character set
2692 that we build below suffices. parse_bracket_exp passes
2693 no MBCSET if dfa->mb_cur_max == 1. */
2694 if (mbcset)
434d3784 2695 {
21f5de55
PE
2696 /* Check the space of the arrays. */
2697 if (BE (*range_alloc == mbcset->nranges, 0))
2698 {
10677727
UD
2699 /* There is not enough space, need realloc. */
2700 wchar_t *new_array_start, *new_array_end;
2701 int new_nranges;
2702
2703 /* +1 in case of mbcset->nranges is 0. */
2704 new_nranges = 2 * mbcset->nranges + 1;
2705 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706 are NULL if *range_alloc == 0. */
2707 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
21f5de55 2708 new_nranges);
10677727 2709 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
21f5de55 2710 new_nranges);
10677727
UD
2711
2712 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2713 return REG_ESPACE;
2714
2715 mbcset->range_starts = new_array_start;
2716 mbcset->range_ends = new_array_end;
2717 *range_alloc = new_nranges;
21f5de55 2718 }
10677727 2719
21f5de55
PE
2720 mbcset->range_starts[mbcset->nranges] = start_wc;
2721 mbcset->range_ends[mbcset->nranges++] = end_wc;
434d3784
UD
2722 }
2723
434d3784 2724 /* Build the table for single byte characters. */
1e7947dc 2725 for (wc = 0; wc < SBC_MAX; ++wc)
434d3784 2726 {
15a7d175
UD
2727 cmp_buf[2] = wc;
2728 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2729 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2730 bitset_set (sbcset, wc);
434d3784
UD
2731 }
2732 }
2733# else /* not RE_ENABLE_I18N */
2734 {
2735 unsigned int ch;
2736 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2737 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2738 : 0));
434d3784 2739 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2740 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2741 : 0));
434d3784
UD
2742 if (start_ch > end_ch)
2743 return REG_ERANGE;
2744 /* Build the table for single byte characters. */
1e7947dc 2745 for (ch = 0; ch < SBC_MAX; ++ch)
434d3784 2746 if (start_ch <= ch && ch <= end_ch)
15a7d175 2747 bitset_set (sbcset, ch);
434d3784
UD
2748 }
2749# endif /* not RE_ENABLE_I18N */
2750 return REG_NOERROR;
2751}
2752#endif /* not _LIBC */
2753
2754#ifndef _LIBC
2755/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756 Build the collating element which is represented by NAME.
2757 The result are written to MBCSET and SBCSET.
2758 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759 pointer argument since we may update it. */
2760
2761static reg_errcode_t
9b0c3f38 2762internal_function
c0a0f9a3 2763# ifdef RE_ENABLE_I18N
0fd8ae9c
UD
2764build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2765 int *coll_sym_alloc, const unsigned char *name)
c0a0f9a3 2766# else /* not RE_ENABLE_I18N */
0fd8ae9c 2767build_collating_symbol (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 2768# endif /* not RE_ENABLE_I18N */
434d3784 2769{
62439eac
UD
2770 size_t name_len = strlen ((const char *) name);
2771 if (BE (name_len != 1, 0))
434d3784
UD
2772 return REG_ECOLLATE;
2773 else
2774 {
62439eac 2775 bitset_set (sbcset, name[0]);
434d3784
UD
2776 return REG_NOERROR;
2777 }
2778}
2779#endif /* not _LIBC */
2780
3b0bdc72
UD
2781/* This function parse bracket expression like "[abc]", "[a-c]",
2782 "[[.a-a.]]" etc. */
2783
2784static bin_tree_t *
0fd8ae9c
UD
2785parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2786 reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72
UD
2787{
2788#ifdef _LIBC
62439eac
UD
2789 const unsigned char *collseqmb;
2790 const char *collseqwc;
3b0bdc72
UD
2791 uint32_t nrules;
2792 int32_t table_size;
2793 const int32_t *symb_table;
2794 const unsigned char *extra;
2795
434d3784 2796 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72 2797 Seek the collating symbol entry correspondings to NAME.
7e2f0d2d
AS
2798 Return the index of the symbol in the SYMB_TABLE,
2799 or -1 if not found. */
3b0bdc72 2800
7090d3ca 2801 auto inline int32_t
25337753 2802 __attribute ((always_inline))
7e2f0d2d 2803 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
3b0bdc72 2804 {
7e2f0d2d 2805 int32_t elem;
a334319f 2806
7e2f0d2d
AS
2807 for (elem = 0; elem < table_size; elem++)
2808 if (symb_table[2 * elem] != 0)
2809 {
2810 int32_t idx = symb_table[2 * elem + 1];
2811 /* Skip the name of collating element name. */
2812 idx += 1 + extra[idx];
2813 if (/* Compare the length of the name. */
2814 name_len == extra[idx]
2815 /* Compare the name. */
2816 && memcmp (name, &extra[idx + 1], name_len) == 0)
2817 /* Yep, this is the entry. */
2818 return elem;
2819 }
2820 return -1;
3b0bdc72
UD
2821 }
2822
a532a41d 2823 /* Local function for parse_bracket_exp used in _LIBC environment.
3b0bdc72
UD
2824 Look up the collation sequence value of BR_ELEM.
2825 Return the value if succeeded, UINT_MAX otherwise. */
2826
7090d3ca 2827 auto inline unsigned int
25337753 2828 __attribute ((always_inline))
7e2f0d2d 2829 lookup_collation_sequence_value (bracket_elem_t *br_elem)
3b0bdc72
UD
2830 {
2831 if (br_elem->type == SB_CHAR)
15a7d175
UD
2832 {
2833 /*
2834 if (MB_CUR_MAX == 1)
2835 */
2836 if (nrules == 0)
2837 return collseqmb[br_elem->opr.ch];
2838 else
2839 {
2840 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2841 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2842 }
2843 }
3b0bdc72 2844 else if (br_elem->type == MB_CHAR)
15a7d175 2845 {
a532a41d
UD
2846 if (nrules != 0)
2847 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2848 }
3b0bdc72 2849 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2850 {
2851 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2852 if (nrules != 0)
2853 {
2854 int32_t elem, idx;
2855 elem = seek_collating_symbol_entry (br_elem->opr.name,
2856 sym_name_len);
7e2f0d2d 2857 if (elem != -1)
15a7d175
UD
2858 {
2859 /* We found the entry. */
2860 idx = symb_table[2 * elem + 1];
2861 /* Skip the name of collating element name. */
2862 idx += 1 + extra[idx];
2863 /* Skip the byte sequence of the collating element. */
2864 idx += 1 + extra[idx];
2865 /* Adjust for the alignment. */
2866 idx = (idx + 3) & ~3;
2867 /* Skip the multibyte collation sequence value. */
2868 idx += sizeof (unsigned int);
2869 /* Skip the wide char sequence of the collating element. */
2870 idx += sizeof (unsigned int) *
2871 (1 + *(unsigned int *) (extra + idx));
2872 /* Return the collation sequence value. */
2873 return *(unsigned int *) (extra + idx);
2874 }
7e2f0d2d 2875 else if (sym_name_len == 1)
15a7d175
UD
2876 {
2877 /* No valid character. Match it as a single byte
2878 character. */
2879 return collseqmb[br_elem->opr.name[0]];
2880 }
2881 }
2882 else if (sym_name_len == 1)
2883 return collseqmb[br_elem->opr.name[0]];
2884 }
3b0bdc72
UD
2885 return UINT_MAX;
2886 }
2887
434d3784 2888 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2889 Build the range expression which starts from START_ELEM, and ends
2890 at END_ELEM. The result are written to MBCSET and SBCSET.
2891 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2892 mbcset->range_ends, is a pointer argument sinse we may
2893 update it. */
2894
7090d3ca 2895 auto inline reg_errcode_t
25337753 2896 __attribute ((always_inline))
7e2f0d2d
AS
2897 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2898 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
3b0bdc72
UD
2899 {
2900 unsigned int ch;
2901 uint32_t start_collseq;
2902 uint32_t end_collseq;
2903
434d3784 2904 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2905 start/end. */
bc15410e 2906 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2907 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2908 0))
2909 return REG_ERANGE;
3b0bdc72
UD
2910
2911 start_collseq = lookup_collation_sequence_value (start_elem);
2912 end_collseq = lookup_collation_sequence_value (end_elem);
2913 /* Check start/end collation sequence values. */
bc15410e 2914 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
15a7d175 2915 return REG_ECOLLATE;
bc15410e 2916 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
15a7d175 2917 return REG_ERANGE;
3b0bdc72 2918
10677727
UD
2919 /* Got valid collation sequence values, add them as a new entry.
2920 However, if we have no collation elements, and the character set
2921 is single byte, the single byte character set that we
2922 build below suffices. */
2923 if (nrules > 0 || dfa->mb_cur_max > 1)
2924 {
21f5de55
PE
2925 /* Check the space of the arrays. */
2926 if (BE (*range_alloc == mbcset->nranges, 0))
10677727
UD
2927 {
2928 /* There is not enough space, need realloc. */
2929 uint32_t *new_array_start;
2930 uint32_t *new_array_end;
2931 int new_nranges;
2932
2933 /* +1 in case of mbcset->nranges is 0. */
2934 new_nranges = 2 * mbcset->nranges + 1;
2935 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2936 new_nranges);
2937 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
21f5de55 2938 new_nranges);
10677727
UD
2939
2940 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
21f5de55 2941 return REG_ESPACE;
10677727
UD
2942
2943 mbcset->range_starts = new_array_start;
2944 mbcset->range_ends = new_array_end;
2945 *range_alloc = new_nranges;
2946 }
2947
21f5de55
PE
2948 mbcset->range_starts[mbcset->nranges] = start_collseq;
2949 mbcset->range_ends[mbcset->nranges++] = end_collseq;
10677727 2950 }
3b0bdc72
UD
2951
2952 /* Build the table for single byte characters. */
1e7947dc 2953 for (ch = 0; ch < SBC_MAX; ch++)
15a7d175
UD
2954 {
2955 uint32_t ch_collseq;
2956 /*
2957 if (MB_CUR_MAX == 1)
2958 */
2959 if (nrules == 0)
2960 ch_collseq = collseqmb[ch];
2961 else
25337753 2962 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
2963 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2964 bitset_set (sbcset, ch);
2965 }
3b0bdc72
UD
2966 return REG_NOERROR;
2967 }
3b0bdc72 2968
434d3784 2969 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2970 Build the collating element which is represented by NAME.
2971 The result are written to MBCSET and SBCSET.
2972 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2973 pointer argument sinse we may update it. */
2974
7090d3ca 2975 auto inline reg_errcode_t
25337753 2976 __attribute ((always_inline))
7e2f0d2d
AS
2977 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2978 int *coll_sym_alloc, const unsigned char *name)
3b0bdc72 2979 {
3b0bdc72 2980 int32_t elem, idx;
62439eac 2981 size_t name_len = strlen ((const char *) name);
3b0bdc72 2982 if (nrules != 0)
15a7d175
UD
2983 {
2984 elem = seek_collating_symbol_entry (name, name_len);
7e2f0d2d 2985 if (elem != -1)
15a7d175
UD
2986 {
2987 /* We found the entry. */
2988 idx = symb_table[2 * elem + 1];
2989 /* Skip the name of collating element name. */
2990 idx += 1 + extra[idx];
2991 }
7e2f0d2d 2992 else if (name_len == 1)
15a7d175
UD
2993 {
2994 /* No valid character, treat it as a normal
2995 character. */
2996 bitset_set (sbcset, name[0]);
2997 return REG_NOERROR;
2998 }
2999 else
3000 return REG_ECOLLATE;
3b0bdc72 3001
15a7d175
UD
3002 /* Got valid collation sequence, add it as a new entry. */
3003 /* Check the space of the arrays. */
951d6408 3004 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
15a7d175
UD
3005 {
3006 /* Not enough, realloc it. */
3007 /* +1 in case of mbcset->ncoll_syms is 0. */
951d6408 3008 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
15a7d175
UD
3009 /* Use realloc since mbcset->coll_syms is NULL
3010 if *alloc == 0. */
951d6408
UD
3011 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3012 new_coll_sym_alloc);
3013 if (BE (new_coll_syms == NULL, 0))
15a7d175 3014 return REG_ESPACE;
951d6408
UD
3015 mbcset->coll_syms = new_coll_syms;
3016 *coll_sym_alloc = new_coll_sym_alloc;
15a7d175
UD
3017 }
3018 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
15a7d175
UD
3019 return REG_NOERROR;
3020 }
3b0bdc72 3021 else
15a7d175
UD
3022 {
3023 if (BE (name_len != 1, 0))
3024 return REG_ECOLLATE;
3025 else
3026 {
3027 bitset_set (sbcset, name[0]);
3028 return REG_NOERROR;
3029 }
3030 }
3b0bdc72 3031 }
434d3784
UD
3032#endif
3033
3b0bdc72
UD
3034 re_token_t br_token;
3035 re_bitset_ptr_t sbcset;
c0a0f9a3 3036#ifdef RE_ENABLE_I18N
3b0bdc72 3037 re_charset_t *mbcset;
3b0bdc72
UD
3038 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3039 int equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3 3040#endif /* not RE_ENABLE_I18N */
10677727 3041 int non_match = 0;
c0a0f9a3 3042 bin_tree_t *work_tree;
ee70274a 3043 int token_len;
8cae99db 3044 int first_round = 1;
3b0bdc72 3045#ifdef _LIBC
62439eac
UD
3046 collseqmb = (const unsigned char *)
3047 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
3048 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3049 if (nrules)
3050 {
3051 /*
3052 if (MB_CUR_MAX > 1)
3053 */
7d4722e3 3054 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
3055 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3056 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3057 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 3058 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3059 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
3060 }
3061#endif
2c05d33f 3062 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
c0a0f9a3 3063#ifdef RE_ENABLE_I18N
3b0bdc72 3064 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3065#endif /* RE_ENABLE_I18N */
3066#ifdef RE_ENABLE_I18N
bc15410e 3067 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3068#else
3069 if (BE (sbcset == NULL, 0))
3070#endif /* RE_ENABLE_I18N */
3b0bdc72 3071 {
a129c80d
UD
3072 re_free (sbcset);
3073#ifdef RE_ENABLE_I18N
3074 re_free (mbcset);
3075#endif
3b0bdc72
UD
3076 *err = REG_ESPACE;
3077 return NULL;
3078 }
3079
3080 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3081 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 3082 {
3b0bdc72 3083 *err = REG_BADPAT;
434d3784 3084 goto parse_bracket_exp_free_return;
3b0bdc72
UD
3085 }
3086 if (token->type == OP_NON_MATCH_LIST)
3087 {
c0a0f9a3 3088#ifdef RE_ENABLE_I18N
3b0bdc72 3089 mbcset->non_match = 1;
c0a0f9a3 3090#endif /* not RE_ENABLE_I18N */
10677727 3091 non_match = 1;
3b0bdc72 3092 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
784aacea 3093 bitset_set (sbcset, '\n');
3b0bdc72
UD
3094 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3095 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3096 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
3097 {
3098 *err = REG_BADPAT;
3099 goto parse_bracket_exp_free_return;
3100 }
3b0bdc72
UD
3101 }
3102
3103 /* We treat the first ']' as a normal character. */
3104 if (token->type == OP_CLOSE_BRACKET)
3105 token->type = CHARACTER;
3106
3107 while (1)
3108 {
3109 bracket_elem_t start_elem, end_elem;
62439eac
UD
3110 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3111 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72
UD
3112 reg_errcode_t ret;
3113 int token_len2 = 0, is_range_exp = 0;
3114 re_token_t token2;
3115
3116 start_elem.opr.name = start_name_buf;
3117 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
78d8b07a 3118 syntax, first_round);
bc15410e 3119 if (BE (ret != REG_NOERROR, 0))
15a7d175
UD
3120 {
3121 *err = ret;
3122 goto parse_bracket_exp_free_return;
3123 }
78d8b07a 3124 first_round = 0;
3b0bdc72 3125
78d8b07a 3126 /* Get information about the next token. We need it in any case. */
3b0bdc72 3127 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a
UD
3128
3129 /* Do not check for ranges if we know they are not allowed. */
3130 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
15a7d175 3131 {
15a7d175
UD
3132 if (BE (token->type == END_OF_RE, 0))
3133 {
78d8b07a 3134 *err = REG_EBRACK;
15a7d175
UD
3135 goto parse_bracket_exp_free_return;
3136 }
78d8b07a 3137 if (token->type == OP_CHARSET_RANGE)
15a7d175 3138 {
78d8b07a
UD
3139 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3140 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3141 if (BE (token2.type == END_OF_RE, 0))
3142 {
3143 *err = REG_EBRACK;
3144 goto parse_bracket_exp_free_return;
3145 }
3146 if (token2.type == OP_CLOSE_BRACKET)
3147 {
3148 /* We treat the last '-' as a normal character. */
3149 re_string_skip_bytes (regexp, -token_len);
3150 token->type = CHARACTER;
3151 }
3152 else
3153 is_range_exp = 1;
15a7d175 3154 }
15a7d175 3155 }
3b0bdc72
UD
3156
3157 if (is_range_exp == 1)
15a7d175
UD
3158 {
3159 end_elem.opr.name = end_name_buf;
3160 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
78d8b07a 3161 dfa, syntax, 1);
15a7d175
UD
3162 if (BE (ret != REG_NOERROR, 0))
3163 {
3164 *err = ret;
3165 goto parse_bracket_exp_free_return;
3166 }
3167
3168 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a 3169
10677727
UD
3170#ifdef _LIBC
3171 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3172 &start_elem, &end_elem);
3173#else
3174# ifdef RE_ENABLE_I18N
15a7d175 3175 *err = build_range_exp (sbcset,
10677727
UD
3176 dfa->mb_cur_max > 1 ? mbcset : NULL,
3177 &range_alloc, &start_elem, &end_elem);
3178# else
3179 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3180# endif
c0a0f9a3 3181#endif /* RE_ENABLE_I18N */
15a7d175
UD
3182 if (BE (*err != REG_NOERROR, 0))
3183 goto parse_bracket_exp_free_return;
3184 }
3b0bdc72 3185 else
15a7d175
UD
3186 {
3187 switch (start_elem.type)
3188 {
3189 case SB_CHAR:
3190 bitset_set (sbcset, start_elem.opr.ch);
3191 break;
c0a0f9a3 3192#ifdef RE_ENABLE_I18N
15a7d175
UD
3193 case MB_CHAR:
3194 /* Check whether the array has enough space. */
951d6408 3195 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
15a7d175 3196 {
951d6408 3197 wchar_t *new_mbchars;
15a7d175
UD
3198 /* Not enough, realloc it. */
3199 /* +1 in case of mbcset->nmbchars is 0. */
3200 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3201 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3202 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3203 mbchar_alloc);
3204 if (BE (new_mbchars == NULL, 0))
15a7d175 3205 goto parse_bracket_exp_espace;
951d6408 3206 mbcset->mbchars = new_mbchars;
15a7d175
UD
3207 }
3208 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3209 break;
c0a0f9a3 3210#endif /* RE_ENABLE_I18N */
15a7d175
UD
3211 case EQUIV_CLASS:
3212 *err = build_equiv_class (sbcset,
c0a0f9a3 3213#ifdef RE_ENABLE_I18N
15a7d175 3214 mbcset, &equiv_class_alloc,
c0a0f9a3 3215#endif /* RE_ENABLE_I18N */
3b0bdc72 3216 start_elem.opr.name);
15a7d175
UD
3217 if (BE (*err != REG_NOERROR, 0))
3218 goto parse_bracket_exp_free_return;
3219 break;
3220 case COLL_SYM:
3221 *err = build_collating_symbol (sbcset,
c0a0f9a3 3222#ifdef RE_ENABLE_I18N
15a7d175 3223 mbcset, &coll_sym_alloc,
c0a0f9a3 3224#endif /* RE_ENABLE_I18N */
3b0bdc72 3225 start_elem.opr.name);
15a7d175
UD
3226 if (BE (*err != REG_NOERROR, 0))
3227 goto parse_bracket_exp_free_return;
3228 break;
3229 case CHAR_CLASS:
66b110e8 3230 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3231#ifdef RE_ENABLE_I18N
7b7b9e70 3232 mbcset, &char_class_alloc,
c0a0f9a3 3233#endif /* RE_ENABLE_I18N */
7b7b9e70
UD
3234 start_elem.opr.name, syntax);
3235 if (BE (*err != REG_NOERROR, 0))
3236 goto parse_bracket_exp_free_return;
15a7d175
UD
3237 break;
3238 default:
3239 assert (0);
3240 break;
3241 }
3242 }
78d8b07a
UD
3243 if (BE (token->type == END_OF_RE, 0))
3244 {
3245 *err = REG_EBRACK;
3246 goto parse_bracket_exp_free_return;
3247 }
3b0bdc72 3248 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3249 break;
3b0bdc72
UD
3250 }
3251
3252 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3253
3254 /* If it is non-matching list. */
c0a0f9a3 3255 if (non_match)
3b0bdc72 3256 bitset_not (sbcset);
10677727 3257
65e6becf
UD
3258#ifdef RE_ENABLE_I18N
3259 /* Ensure only single byte characters are set. */
3260 if (dfa->mb_cur_max > 1)
3261 bitset_mask (sbcset, dfa->sb_char);
3b0bdc72
UD
3262
3263 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3264 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3265 || mbcset->non_match)))
3b0bdc72 3266 {
3b0bdc72 3267 bin_tree_t *mbc_tree;
ad7f28c2 3268 int sbc_idx;
3b0bdc72 3269 /* Build a tree for complex bracket. */
ad7f28c2 3270 dfa->has_mb_node = 1;
02f3550c
UD
3271 br_token.type = COMPLEX_BRACKET;
3272 br_token.opr.mbcset = mbcset;
3273 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3274 if (BE (mbc_tree == NULL, 0))
3275 goto parse_bracket_exp_espace;
2c05d33f 3276 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
ad7f28c2
UD
3277 if (sbcset[sbc_idx])
3278 break;
3279 /* If there are no bits set in sbcset, there is no point
3280 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
2c05d33f 3281 if (sbc_idx < BITSET_WORDS)
02f3550c 3282 {
21f5de55
PE
3283 /* Build a tree for simple bracket. */
3284 br_token.type = SIMPLE_BRACKET;
3285 br_token.opr.sbcset = sbcset;
3286 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3287 if (BE (work_tree == NULL, 0))
3288 goto parse_bracket_exp_espace;
02f3550c 3289
21f5de55
PE
3290 /* Then join them by ALT node. */
3291 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3292 if (BE (work_tree == NULL, 0))
3293 goto parse_bracket_exp_espace;
02f3550c
UD
3294 }
3295 else
ad7f28c2
UD
3296 {
3297 re_free (sbcset);
02f3550c 3298 work_tree = mbc_tree;
ad7f28c2 3299 }
3b0bdc72
UD
3300 }
3301 else
963d8d78 3302#endif /* not RE_ENABLE_I18N */
3b0bdc72 3303 {
963d8d78
UD
3304#ifdef RE_ENABLE_I18N
3305 free_charset (mbcset);
3306#endif
02f3550c
UD
3307 /* Build a tree for simple bracket. */
3308 br_token.type = SIMPLE_BRACKET;
3309 br_token.opr.sbcset = sbcset;
3310 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3311 if (BE (work_tree == NULL, 0))
21f5de55 3312 goto parse_bracket_exp_espace;
3b0bdc72 3313 }
02f3550c 3314 return work_tree;
3b0bdc72
UD
3315
3316 parse_bracket_exp_espace:
3b0bdc72 3317 *err = REG_ESPACE;
434d3784
UD
3318 parse_bracket_exp_free_return:
3319 re_free (sbcset);
c0a0f9a3 3320#ifdef RE_ENABLE_I18N
434d3784 3321 free_charset (mbcset);
c0a0f9a3 3322#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3323 return NULL;
3324}
3325
434d3784
UD
3326/* Parse an element in the bracket expression. */
3327
3b0bdc72 3328static reg_errcode_t
0fd8ae9c
UD
3329parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3330 re_token_t *token, int token_len, re_dfa_t *dfa,
3331 reg_syntax_t syntax, int accept_hyphen)
3b0bdc72
UD
3332{
3333#ifdef RE_ENABLE_I18N
3334 int cur_char_size;
3335 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3336 if (cur_char_size > 1)
3337 {
3338 elem->type = MB_CHAR;
3339 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3340 re_string_skip_bytes (regexp, cur_char_size);
3341 return REG_NOERROR;
3342 }
3343#endif /* RE_ENABLE_I18N */
3344 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3345 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3346 || token->type == OP_OPEN_EQUIV_CLASS)
3347 return parse_bracket_symbol (elem, regexp, token);
78d8b07a
UD
3348 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3349 {
3350 /* A '-' must only appear as anything but a range indicator before
3351 the closing bracket. Everything else is an error. */
3352 re_token_t token2;
3353 (void) peek_token_bracket (&token2, regexp, syntax);
3354 if (token2.type != OP_CLOSE_BRACKET)
3355 /* The actual error value is not standardized since this whole
3356 case is undefined. But ERANGE makes good sense. */
3357 return REG_ERANGE;
3358 }
3b0bdc72
UD
3359 elem->type = SB_CHAR;
3360 elem->opr.ch = token->opr.c;
3361 return REG_NOERROR;
3362}
3363
434d3784
UD
3364/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3365 such as [:<character_class>:], [.<collating_element>.], and
3366 [=<equivalent_class>=]. */
3367
3b0bdc72 3368static reg_errcode_t
0fd8ae9c
UD
3369parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3370 re_token_t *token)
3b0bdc72
UD
3371{
3372 unsigned char ch, delim = token->opr.c;
3373 int i = 0;
294b6bcc
UD
3374 if (re_string_eoi(regexp))
3375 return REG_EBRACK;
602c2f9d 3376 for (;; ++i)
3b0bdc72 3377 {
294b6bcc 3378 if (i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3379 return REG_EBRACK;
3b0bdc72 3380 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3381 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3382 else
15a7d175 3383 ch = re_string_fetch_byte (regexp);
294b6bcc
UD
3384 if (re_string_eoi(regexp))
3385 return REG_EBRACK;
3b0bdc72 3386 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3387 break;
3b0bdc72
UD
3388 elem->opr.name[i] = ch;
3389 }
3390 re_string_skip_bytes (regexp, 1);
3391 elem->opr.name[i] = '\0';
3392 switch (token->type)
3393 {
3394 case OP_OPEN_COLL_ELEM:
3395 elem->type = COLL_SYM;
3396 break;
3397 case OP_OPEN_EQUIV_CLASS:
3398 elem->type = EQUIV_CLASS;
3399 break;
3400 case OP_OPEN_CHAR_CLASS:
3401 elem->type = CHAR_CLASS;
3402 break;
3403 default:
3404 break;
3405 }
3406 return REG_NOERROR;
3407}
3408
3409 /* Helper function for parse_bracket_exp.
3410 Build the equivalence class which is represented by NAME.
3411 The result are written to MBCSET and SBCSET.
3412 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3413 is a pointer argument sinse we may update it. */
3414
3415static reg_errcode_t
c0a0f9a3 3416#ifdef RE_ENABLE_I18N
0fd8ae9c
UD
3417build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3418 int *equiv_class_alloc, const unsigned char *name)
c0a0f9a3 3419#else /* not RE_ENABLE_I18N */
0fd8ae9c 3420build_equiv_class (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 3421#endif /* not RE_ENABLE_I18N */
3b0bdc72 3422{
0fd8ae9c 3423#ifdef _LIBC
3b0bdc72
UD
3424 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3425 if (nrules != 0)
3426 {
3427 const int32_t *table, *indirect;
3428 const unsigned char *weights, *extra, *cp;
62439eac 3429 unsigned char char_buf[2];
3b0bdc72
UD
3430 int32_t idx1, idx2;
3431 unsigned int ch;
3432 size_t len;
3b0bdc72 3433 /* Calculate the index for equivalence class. */
62439eac 3434 cp = name;
3b0bdc72
UD
3435 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3436 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3437 _NL_COLLATE_WEIGHTMB);
3438 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3439 _NL_COLLATE_EXTRAMB);
3b0bdc72 3440 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3441 _NL_COLLATE_INDIRECTMB);
8c0ab919 3442 idx1 = findidx (table, indirect, extra, &cp, -1);
f3a6cc0a 3443 if (BE (idx1 == 0 || *cp != '\0', 0))
15a7d175
UD
3444 /* This isn't a valid character. */
3445 return REG_ECOLLATE;
3b0bdc72
UD
3446
3447 /* Build single byte matcing table for this equivalence class. */
b7d1c5fa 3448 len = weights[idx1 & 0xffffff];
3b0bdc72 3449 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3450 {
3451 char_buf[0] = ch;
3452 cp = char_buf;
8c0ab919 3453 idx2 = findidx (table, indirect, extra, &cp, 1);
3b0bdc72 3454/*
15a7d175 3455 idx2 = table[ch];
3b0bdc72 3456*/
15a7d175
UD
3457 if (idx2 == 0)
3458 /* This isn't a valid character. */
3459 continue;
b7d1c5fa
UD
3460 /* Compare only if the length matches and the collation rule
3461 index is the same. */
3462 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
15a7d175
UD
3463 {
3464 int cnt = 0;
b7d1c5fa 3465
15a7d175 3466 while (cnt <= len &&
b7d1c5fa
UD
3467 weights[(idx1 & 0xffffff) + 1 + cnt]
3468 == weights[(idx2 & 0xffffff) + 1 + cnt])
15a7d175
UD
3469 ++cnt;
3470
3471 if (cnt > len)
3472 bitset_set (sbcset, ch);
3473 }
3474 }
a9388965 3475 /* Check whether the array has enough space. */
951d6408 3476 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
15a7d175
UD
3477 {
3478 /* Not enough, realloc it. */
3479 /* +1 in case of mbcset->nequiv_classes is 0. */
951d6408 3480 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
15a7d175 3481 /* Use realloc since the array is NULL if *alloc == 0. */
951d6408
UD
3482 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3483 int32_t,
3484 new_equiv_class_alloc);
3485 if (BE (new_equiv_classes == NULL, 0))
15a7d175 3486 return REG_ESPACE;
951d6408
UD
3487 mbcset->equiv_classes = new_equiv_classes;
3488 *equiv_class_alloc = new_equiv_class_alloc;
15a7d175 3489 }
3b0bdc72
UD
3490 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3491 }
3492 else
10677727 3493#endif /* _LIBC */
3b0bdc72 3494 {
62439eac 3495 if (BE (strlen ((const char *) name) != 1, 0))
15a7d175 3496 return REG_ECOLLATE;
62439eac 3497 bitset_set (sbcset, *name);
3b0bdc72
UD
3498 }
3499 return REG_NOERROR;
3500}
3501
3502 /* Helper function for parse_bracket_exp.
3503 Build the character class which is represented by NAME.
3504 The result are written to MBCSET and SBCSET.
3505 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3506 is a pointer argument sinse we may update it. */
3507
3508static reg_errcode_t
c0a0f9a3 3509#ifdef RE_ENABLE_I18N
0fd8ae9c
UD
3510build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3511 re_charset_t *mbcset, int *char_class_alloc,
3512 const unsigned char *class_name, reg_syntax_t syntax)
c0a0f9a3 3513#else /* not RE_ENABLE_I18N */
0fd8ae9c
UD
3514build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3515 const unsigned char *class_name, reg_syntax_t syntax)
c0a0f9a3 3516#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3517{
3518 int i;
62439eac 3519 const char *name = (const char *) class_name;
c0a0f9a3
UD
3520
3521 /* In case of REG_ICASE "upper" and "lower" match the both of
3522 upper and lower cases. */
3523 if ((syntax & RE_ICASE)
3524 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3525 name = "alpha";
3526
3527#ifdef RE_ENABLE_I18N
3b0bdc72 3528 /* Check the space of the arrays. */
951d6408 3529 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
a9388965
UD
3530 {
3531 /* Not enough, realloc it. */
3532 /* +1 in case of mbcset->nchar_classes is 0. */
951d6408 3533 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
a9388965 3534 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3535 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3536 new_char_class_alloc);
3537 if (BE (new_char_classes == NULL, 0))
15a7d175 3538 return REG_ESPACE;
951d6408
UD
3539 mbcset->char_classes = new_char_classes;
3540 *char_class_alloc = new_char_class_alloc;
a9388965 3541 }
3b0bdc72 3542 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3543#endif /* RE_ENABLE_I18N */
3b0bdc72 3544
66b110e8 3545#define BUILD_CHARCLASS_LOOP(ctype_func) \
513bbb25
UD
3546 do { \
3547 if (BE (trans != NULL, 0)) \
0ecb606c 3548 { \
513bbb25
UD
3549 for (i = 0; i < SBC_MAX; ++i) \
3550 if (ctype_func (i)) \
3551 bitset_set (sbcset, trans[i]); \
3552 } \
3553 else \
3554 { \
3555 for (i = 0; i < SBC_MAX; ++i) \
3556 if (ctype_func (i)) \
3557 bitset_set (sbcset, i); \
3558 } \
3559 } while (0)
3b0bdc72
UD
3560
3561 if (strcmp (name, "alnum") == 0)
513bbb25 3562 BUILD_CHARCLASS_LOOP (isalnum);
3b0bdc72 3563 else if (strcmp (name, "cntrl") == 0)
513bbb25 3564 BUILD_CHARCLASS_LOOP (iscntrl);
3b0bdc72 3565 else if (strcmp (name, "lower") == 0)
513bbb25 3566 BUILD_CHARCLASS_LOOP (islower);
3b0bdc72 3567 else if (strcmp (name, "space") == 0)
513bbb25 3568 BUILD_CHARCLASS_LOOP (isspace);
3b0bdc72 3569 else if (strcmp (name, "alpha") == 0)
513bbb25 3570 BUILD_CHARCLASS_LOOP (isalpha);
3b0bdc72 3571 else if (strcmp (name, "digit") == 0)
513bbb25 3572 BUILD_CHARCLASS_LOOP (isdigit);
3b0bdc72 3573 else if (strcmp (name, "print") == 0)
513bbb25 3574 BUILD_CHARCLASS_LOOP (isprint);
3b0bdc72 3575 else if (strcmp (name, "upper") == 0)
513bbb25 3576 BUILD_CHARCLASS_LOOP (isupper);
3b0bdc72 3577 else if (strcmp (name, "blank") == 0)
513bbb25 3578 BUILD_CHARCLASS_LOOP (isblank);
3b0bdc72 3579 else if (strcmp (name, "graph") == 0)
513bbb25 3580 BUILD_CHARCLASS_LOOP (isgraph);
3b0bdc72 3581 else if (strcmp (name, "punct") == 0)
513bbb25 3582 BUILD_CHARCLASS_LOOP (ispunct);
3b0bdc72 3583 else if (strcmp (name, "xdigit") == 0)
513bbb25 3584 BUILD_CHARCLASS_LOOP (isxdigit);
3b0bdc72
UD
3585 else
3586 return REG_ECTYPE;
3587
3588 return REG_NOERROR;
3589}
3590
3591static bin_tree_t *
0fd8ae9c
UD
3592build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3593 const unsigned char *class_name,
3594 const unsigned char *extra, int non_match,
3595 reg_errcode_t *err)
3b0bdc72
UD
3596{
3597 re_bitset_ptr_t sbcset;
c0a0f9a3 3598#ifdef RE_ENABLE_I18N
3b0bdc72 3599 re_charset_t *mbcset;
c0a0f9a3 3600 int alloc = 0;
c0a0f9a3 3601#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3602 reg_errcode_t ret;
3603 re_token_t br_token;
3604 bin_tree_t *tree;
3b0bdc72 3605
2c05d33f 3606 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
c0a0f9a3 3607#ifdef RE_ENABLE_I18N
3b0bdc72 3608 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3609#endif /* RE_ENABLE_I18N */
3610
3611#ifdef RE_ENABLE_I18N
bc15410e 3612 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3613#else /* not RE_ENABLE_I18N */
3614 if (BE (sbcset == NULL, 0))
3615#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3616 {
3617 *err = REG_ESPACE;
3618 return NULL;
3619 }
3620
10677727 3621 if (non_match)
3b0bdc72 3622 {
c0a0f9a3 3623#ifdef RE_ENABLE_I18N
c0a0f9a3 3624 mbcset->non_match = 1;
c0a0f9a3 3625#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3626 }
3627
602c2f9d 3628 /* We don't care the syntax in this case. */
66b110e8 3629 ret = build_charclass (trans, sbcset,
c0a0f9a3 3630#ifdef RE_ENABLE_I18N
15a7d175 3631 mbcset, &alloc,
c0a0f9a3 3632#endif /* RE_ENABLE_I18N */
e2b6bfa3 3633 class_name, 0);
c0a0f9a3 3634
bc15410e 3635 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3636 {
3637 re_free (sbcset);
c0a0f9a3 3638#ifdef RE_ENABLE_I18N
3b0bdc72 3639 free_charset (mbcset);
c0a0f9a3 3640#endif /* RE_ENABLE_I18N */
7b7b9e70 3641 *err = ret;
3b0bdc72
UD
3642 return NULL;
3643 }
434d3784 3644 /* \w match '_' also. */
e2b6bfa3
UD
3645 for (; *extra; extra++)
3646 bitset_set (sbcset, *extra);
3b0bdc72
UD
3647
3648 /* If it is non-matching list. */
c0a0f9a3 3649 if (non_match)
3b0bdc72
UD
3650 bitset_not (sbcset);
3651
65e6becf
UD
3652#ifdef RE_ENABLE_I18N
3653 /* Ensure only single byte characters are set. */
3654 if (dfa->mb_cur_max > 1)
3655 bitset_mask (sbcset, dfa->sb_char);
3656#endif
3657
3b0bdc72
UD
3658 /* Build a tree for simple bracket. */
3659 br_token.type = SIMPLE_BRACKET;
3660 br_token.opr.sbcset = sbcset;
02f3550c 3661 tree = create_token_tree (dfa, NULL, NULL, &br_token);
ee70274a 3662 if (BE (tree == NULL, 0))
3b0bdc72
UD
3663 goto build_word_op_espace;
3664
c0a0f9a3 3665#ifdef RE_ENABLE_I18N
3c0fb574 3666 if (dfa->mb_cur_max > 1)
3b0bdc72 3667 {
3b0bdc72
UD
3668 bin_tree_t *mbc_tree;
3669 /* Build a tree for complex bracket. */
3670 br_token.type = COMPLEX_BRACKET;
3671 br_token.opr.mbcset = mbcset;
3672 dfa->has_mb_node = 1;
02f3550c 3673 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
ee70274a 3674 if (BE (mbc_tree == NULL, 0))
15a7d175 3675 goto build_word_op_espace;
3b0bdc72 3676 /* Then join them by ALT node. */
02f3550c 3677 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
ee70274a 3678 if (BE (mbc_tree != NULL, 1))
15a7d175 3679 return tree;
3b0bdc72
UD
3680 }
3681 else
3682 {
3683 free_charset (mbcset);
3684 return tree;
3685 }
c0a0f9a3
UD
3686#else /* not RE_ENABLE_I18N */
3687 return tree;
3688#endif /* not RE_ENABLE_I18N */
3689
3b0bdc72
UD
3690 build_word_op_espace:
3691 re_free (sbcset);
c0a0f9a3 3692#ifdef RE_ENABLE_I18N
3b0bdc72 3693 free_charset (mbcset);
c0a0f9a3 3694#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3695 *err = REG_ESPACE;
3696 return NULL;
3697}
3698
3699/* This is intended for the expressions like "a{1,3}".
3700 Fetch a number from `input', and return the number.
3701 Return -1, if the number field is empty like "{,1}".
3702 Return -2, If an error is occured. */
3703
3704static int
0fd8ae9c 3705fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3b0bdc72
UD
3706{
3707 int num = -1;
3708 unsigned char c;
3709 while (1)
3710 {
f0d77aa8 3711 fetch_token (token, input, syntax);
3b0bdc72 3712 c = token->opr.c;
602c2f9d 3713 if (BE (token->type == END_OF_RE, 0))
15a7d175 3714 return -2;
3b0bdc72 3715 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3716 break;
602c2f9d 3717 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
15a7d175 3718 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
602c2f9d 3719 num = (num > RE_DUP_MAX) ? -2 : num;
3b0bdc72 3720 }
3b0bdc72
UD
3721 return num;
3722}
3723\f
c0a0f9a3 3724#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3725static void
3726free_charset (re_charset_t *cset)
3727{
3728 re_free (cset->mbchars);
c0a0f9a3 3729# ifdef _LIBC
3b0bdc72
UD
3730 re_free (cset->coll_syms);
3731 re_free (cset->equiv_classes);
3732 re_free (cset->range_starts);
3733 re_free (cset->range_ends);
c0a0f9a3 3734# endif
3b0bdc72
UD
3735 re_free (cset->char_classes);
3736 re_free (cset);
3737}
c0a0f9a3 3738#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3739\f
3740/* Functions for binary tree operation. */
3741
ee70274a 3742/* Create a tree node. */
3b0bdc72
UD
3743
3744static bin_tree_t *
0fd8ae9c
UD
3745create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3746 re_token_type_t type)
02f3550c
UD
3747{
3748 re_token_t t;
3749 t.type = type;
3750 return create_token_tree (dfa, left, right, &t);
3751}
3752
3753static bin_tree_t *
0fd8ae9c
UD
3754create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3755 const re_token_t *token)
3b0bdc72
UD
3756{
3757 bin_tree_t *tree;
ee70274a 3758 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3b0bdc72 3759 {
ee70274a
UD
3760 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3761
3762 if (storage == NULL)
3763 return NULL;
3764 storage->next = dfa->str_tree_storage;
3765 dfa->str_tree_storage = storage;
3766 dfa->str_tree_storage_idx = 0;
3b0bdc72 3767 }
ee70274a
UD
3768 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3769
3b0bdc72
UD
3770 tree->parent = NULL;
3771 tree->left = left;
3772 tree->right = right;
02f3550c
UD
3773 tree->token = *token;
3774 tree->token.duplicated = 0;
3775 tree->token.opt_subexp = 0;
3776 tree->first = NULL;
3777 tree->next = NULL;
3778 tree->node_idx = -1;
3b0bdc72
UD
3779
3780 if (left != NULL)
3781 left->parent = tree;
3782 if (right != NULL)
3783 right->parent = tree;
3784 return tree;
3785}
3786
02f3550c
UD
3787/* Mark the tree SRC as an optional subexpression.
3788 To be called from preorder or postorder. */
3b0bdc72 3789
02f3550c 3790static reg_errcode_t
0fd8ae9c 3791mark_opt_subexp (void *extra, bin_tree_t *node)
f0d77aa8 3792{
02f3550c
UD
3793 int idx = (int) (long) extra;
3794 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3795 node->token.opt_subexp = 1;
a334319f 3796
02f3550c 3797 return REG_NOERROR;
3b0bdc72
UD
3798}
3799
02f3550c 3800/* Free the allocated memory inside NODE. */
6b6557e8
UD
3801
3802static void
02f3550c 3803free_token (re_token_t *node)
6b6557e8 3804{
02f3550c
UD
3805#ifdef RE_ENABLE_I18N
3806 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3807 free_charset (node->opr.mbcset);
3808 else
3809#endif /* RE_ENABLE_I18N */
3810 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3811 re_free (node->opr.sbcset);
6b6557e8
UD
3812}
3813
02f3550c
UD
3814/* Worker function for tree walking. Free the allocated memory inside NODE
3815 and its children. */
6b6557e8 3816
02f3550c
UD
3817static reg_errcode_t
3818free_tree (void *extra, bin_tree_t *node)
6b6557e8 3819{
02f3550c
UD
3820 free_token (&node->token);
3821 return REG_NOERROR;
6b6557e8
UD
3822}
3823
ee70274a 3824
02f3550c
UD
3825/* Duplicate the node SRC, and return new node. This is a preorder
3826 visit similar to the one implemented by the generic visitor, but
3827 we need more infrastructure to maintain two parallel trees --- so,
3828 it's easier to duplicate. */
3b0bdc72
UD
3829
3830static bin_tree_t *
0fd8ae9c 3831duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3b0bdc72 3832{
02f3550c
UD
3833 const bin_tree_t *node;
3834 bin_tree_t *dup_root;
3835 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3b0bdc72 3836
02f3550c 3837 for (node = root; ; )
3b0bdc72 3838 {
02f3550c
UD
3839 /* Create a new tree and link it back to the current parent. */
3840 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3841 if (*p_new == NULL)
ee70274a 3842 return NULL;
02f3550c
UD
3843 (*p_new)->parent = dup_node;
3844 (*p_new)->token.duplicated = 1;
3845 dup_node = *p_new;
3b0bdc72 3846
02f3550c
UD
3847 /* Go to the left node, or up and to the right. */
3848 if (node->left)
3849 {
3850 node = node->left;
3851 p_new = &dup_node->left;
3852 }
3853 else
3854 {
3855 const bin_tree_t *prev = NULL;
3856 while (node->right == prev || node->right == NULL)
3857 {
3858 prev = node;
3859 node = node->parent;
3860 dup_node = dup_node->parent;
3861 if (!node)
21f5de55 3862 return dup_root;
02f3550c
UD
3863 }
3864 node = node->right;
3865 p_new = &dup_node->right;
3866 }
3b0bdc72 3867 }
3b0bdc72 3868}