]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
misc/test-errno-linux: Handle EINVAL from quotactl
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
688903eb 2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6 17 License along with the GNU C Library; if not, see
eb04c213 18 <https://www.gnu.org/licenses/>. */
e054f494 19
8c0ab919
RM
20#ifdef _LIBC
21# include <locale/weight.h>
22#endif
23
3b0bdc72 24static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
01ed6ceb 25 size_t length, reg_syntax_t syntax);
3b0bdc72 26static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
27 const re_dfastate_t *init_state,
28 char *fastmap);
01ed6ceb 29static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
c0a0f9a3 30#ifdef RE_ENABLE_I18N
3b0bdc72 31static void free_charset (re_charset_t *cset);
c0a0f9a3 32#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
33static void free_workarea_compile (regex_t *preg);
34static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
35#ifdef RE_ENABLE_I18N
36static void optimize_utf8 (re_dfa_t *dfa);
37#endif
02f3550c 38static reg_errcode_t analyze (regex_t *preg);
02f3550c
UD
39static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
eb04c213
AZ
52static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
a7d5c291 54 unsigned int constraint);
3b0bdc72 55static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 56static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
eb04c213 57 Idx node, bool root);
963d8d78 58static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
eb04c213 59static Idx fetch_number (re_string_t *input, re_token_t *token,
15a7d175 60 reg_syntax_t syntax);
3b0bdc72 61static int peek_token (re_token_t *token, re_string_t *input,
b41bd5bc 62 reg_syntax_t syntax);
3b0bdc72 63static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 64 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 65static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175 66 re_token_t *token, reg_syntax_t syntax,
eb04c213 67 Idx nest, reg_errcode_t *err);
3b0bdc72 68static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175 69 re_token_t *token, reg_syntax_t syntax,
eb04c213 70 Idx nest, reg_errcode_t *err);
3b0bdc72 71static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175 72 re_token_t *token, reg_syntax_t syntax,
eb04c213 73 Idx nest, reg_errcode_t *err);
3b0bdc72 74static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175 75 re_token_t *token, reg_syntax_t syntax,
eb04c213 76 Idx nest, reg_errcode_t *err);
3b0bdc72 77static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 80static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
3b0bdc72 83static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
78d8b07a 87 reg_syntax_t syntax,
eb04c213 88 bool accept_hyphen);
3b0bdc72 89static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
90 re_string_t *regexp,
91 re_token_t *token);
c0a0f9a3 92#ifdef RE_ENABLE_I18N
2c05d33f 93static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175 94 re_charset_t *mbcset,
eb04c213 95 Idx *equiv_class_alloc,
15a7d175 96 const unsigned char *name);
997470b3 97static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 98 bitset_t sbcset,
15a7d175 99 re_charset_t *mbcset,
eb04c213
AZ
100 Idx *char_class_alloc,
101 const char *class_name,
15a7d175 102 reg_syntax_t syntax);
c0a0f9a3 103#else /* not RE_ENABLE_I18N */
2c05d33f 104static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175 105 const unsigned char *name);
997470b3 106static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 107 bitset_t sbcset,
eb04c213 108 const char *class_name,
15a7d175 109 reg_syntax_t syntax);
c0a0f9a3 110#endif /* not RE_ENABLE_I18N */
86576b62 111static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
997470b3 112 RE_TRANSLATE_TYPE trans,
eb04c213
AZ
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
ee70274a
UD
116static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
02f3550c
UD
118 re_token_type_t type);
119static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
3b0bdc72 122static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
02f3550c
UD
123static void free_token (re_token_t *node);
124static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
3b0bdc72
UD
126\f
127/* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
131
eb04c213 132static const char __re_error_msgid[] =
3b0bdc72
UD
133 {
134#define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
eb04c213 156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
3b0bdc72 157 "\0"
eb04c213 158#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
3b0bdc72
UD
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 };
185
eb04c213 186static const size_t __re_error_msgid_idx[] =
3b0bdc72
UD
187 {
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
205 };
206\f
207/* Entry points for GNU code. */
208
209/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
211 Returns 0 if the pattern was valid, otherwise an error string.
212
d3821ab0 213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
3b0bdc72
UD
214 are set in BUFP on entry. */
215
216const char *
9dd346ff
JM
217re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
3b0bdc72
UD
219{
220 reg_errcode_t ret;
221
3b0bdc72
UD
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
c06a6956
UD
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
3b0bdc72
UD
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
75e4a282 230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
231
232 if (!ret)
233 return NULL;
6455d255 234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
235}
236#ifdef _LIBC
237weak_alias (__re_compile_pattern, re_compile_pattern)
238#endif
239
d3821ab0 240/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
3b0bdc72
UD
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243/* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245reg_syntax_t re_syntax_options;
246
247
248/* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
251
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
254
255reg_syntax_t
9dd346ff 256re_set_syntax (reg_syntax_t syntax)
3b0bdc72
UD
257{
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262}
263#ifdef _LIBC
264weak_alias (__re_set_syntax, re_set_syntax)
265#endif
266
267int
9dd346ff 268re_compile_fastmap (struct re_pattern_buffer *bufp)
3b0bdc72 269{
eb04c213 270 re_dfa_t *dfa = bufp->buffer;
3b0bdc72
UD
271 char *fastmap = bufp->fastmap;
272
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
283}
284#ifdef _LIBC
285weak_alias (__re_compile_fastmap, re_compile_fastmap)
286#endif
287
1b2c2628 288static inline void
d3821ab0
RM
289__attribute__ ((always_inline))
290re_set_fastmap (char *fastmap, bool icase, int ch)
1b2c2628
UD
291{
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
295}
296
3b0bdc72
UD
297/* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
299
300static void
0fd8ae9c
UD
301re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
3b0bdc72 303{
eb04c213
AZ
304 re_dfa_t *dfa = bufp->buffer;
305 Idx node_cnt;
306 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308 {
eb04c213 309 Idx node = init_state->nodes.elems[node_cnt];
3b0bdc72 310 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
311
312 if (type == CHARACTER)
74e12fbc
UD
313 {
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315#ifdef RE_ENABLE_I18N
14744156 316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 317 {
eb04c213
AZ
318 unsigned char buf[MB_LEN_MAX];
319 unsigned char *p;
74e12fbc
UD
320 wchar_t wc;
321 mbstate_t state;
322
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
2c05d33f 329 memset (&state, '\0', sizeof (state));
b3918c7d
UD
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
9dd6b779 332 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
88764ae2 333 != (size_t) -1))
eb04c213 334 re_set_fastmap (fastmap, false, buf[0]);
74e12fbc
UD
335 }
336#endif
337 }
3b0bdc72 338 else if (type == SIMPLE_BRACKET)
15a7d175 339 {
2c05d33f
UD
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 {
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
348 }
15a7d175 349 }
c0a0f9a3 350#ifdef RE_ENABLE_I18N
3b0bdc72 351 else if (type == COMPLEX_BRACKET)
15a7d175 352 {
15a7d175 353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
eb04c213 354 Idx i;
bdb56bac 355
c0a0f9a3 356# ifdef _LIBC
bdb56bac
UD
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
15a7d175 365 {
15a7d175
UD
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2c05d33f
UD
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
15a7d175 371 }
bdb56bac
UD
372# endif /* _LIBC */
373
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
815d8147 379 && (cset->nchar_classes || cset->non_match || cset->nranges
bdb56bac
UD
380# ifdef _LIBC
381 || cset->nequiv_classes
382# endif /* _LIBC */
383 ))
15a7d175 384 {
bdb56bac
UD
385 unsigned char c = 0;
386 do
74e12fbc 387 {
bdb56bac
UD
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
74e12fbc 392 }
bdb56bac 393 while (++c != 0);
15a7d175 394 }
bdb56bac
UD
395
396 else
397 {
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
400 {
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 {
9dd6b779 408 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
bdb56bac
UD
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 }
aa732e2b
UD
412 }
413 }
15a7d175 414 }
c0a0f9a3 415#endif /* RE_ENABLE_I18N */
c0d5034e
UD
416 else if (type == OP_PERIOD
417#ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419#endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
15a7d175
UD
421 {
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
426 }
3b0bdc72
UD
427 }
428}
429\f
430/* Entry point for POSIX code. */
431/* regcomp takes a regular expression as a string and compiles it.
432
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
435
d3821ab0
RM
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
3b0bdc72
UD
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
d3821ab0
RM
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
3b0bdc72
UD
445
446 PATTERN is the address of the pattern string.
447
448 CFLAGS is a series of bits which affect compilation.
449
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
452
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
455
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
458
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
462
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
465
466int
eb04c213 467regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
3b0bdc72
UD
468{
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 471 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
472
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
476
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 479 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
480 return REG_ESPACE;
481
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
491 }
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
496
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
503
a9388965 504 /* We have already checked preg->fastmap != NULL. */
bc15410e 505 if (BE (ret == REG_NOERROR, 1))
71ccd330 506 /* Compute the fastmap now, since regexec cannot modify the pattern
86576b62 507 buffer. This function never fails in this implementation. */
83b038f2 508 (void) re_compile_fastmap (preg);
71ccd330 509 else
3b0bdc72 510 {
71ccd330
UD
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
3b0bdc72
UD
514 }
515
516 return (int) ret;
517}
518#ifdef _LIBC
b68f8620 519libc_hidden_def (__regcomp)
3b0bdc72
UD
520weak_alias (__regcomp, regcomp)
521#endif
522
523/* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
525
526size_t
eb04c213 527regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
9dd346ff 528 size_t errbuf_size)
3b0bdc72
UD
529{
530 const char *msg;
531 size_t msg_size;
532
bc15410e 533 if (BE (errcode < 0
15a7d175
UD
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
541
6455d255 542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
543
544 msg_size = strlen (msg) + 1; /* Includes the null. */
545
bc15410e 546 if (BE (errbuf_size != 0, 1))
3b0bdc72 547 {
eb04c213 548 size_t cpy_size = msg_size;
bc15410e 549 if (BE (msg_size > errbuf_size, 0))
15a7d175 550 {
eb04c213
AZ
551 cpy_size = errbuf_size - 1;
552 errbuf[cpy_size] = '\0';
15a7d175 553 }
eb04c213 554 memcpy (errbuf, msg, cpy_size);
3b0bdc72
UD
555 }
556
557 return msg_size;
558}
559#ifdef _LIBC
560weak_alias (__regerror, regerror)
561#endif
562
3b0bdc72 563
e40a38b3
UD
564#ifdef RE_ENABLE_I18N
565/* This static array is used for the map to single-byte characters when
566 UTF-8 is used. Otherwise we would allocate memory just to initialize
567 it the same all the time. UTF-8 is the preferred encoding so this is
568 a worthwhile optimization. */
2c05d33f 569static const bitset_t utf8_sb_map =
e40a38b3
UD
570{
571 /* Set the first 128 bits. */
eb04c213 572# if defined __GNUC__ && !defined __STRICT_ANSI__
2c05d33f 573 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
eb04c213
AZ
574# else
575# if 4 * BITSET_WORD_BITS < ASCII_CHARS
576# error "bitset_word_t is narrower than 32 bits"
577# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
578 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
579# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
580 BITSET_WORD_MAX, BITSET_WORD_MAX,
581# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
582 BITSET_WORD_MAX,
583# endif
584 (BITSET_WORD_MAX
585 >> (SBC_MAX % BITSET_WORD_BITS == 0
586 ? 0
587 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
588# endif
e40a38b3
UD
589};
590#endif
591
592
71ccd330
UD
593static void
594free_dfa_content (re_dfa_t *dfa)
3b0bdc72 595{
eb04c213 596 Idx i, j;
3b0bdc72 597
ee70274a
UD
598 if (dfa->nodes)
599 for (i = 0; i < dfa->nodes_len; ++i)
02f3550c 600 free_token (dfa->nodes + i);
71ccd330
UD
601 re_free (dfa->nexts);
602 for (i = 0; i < dfa->nodes_len; ++i)
603 {
604 if (dfa->eclosures != NULL)
605 re_node_set_free (dfa->eclosures + i);
606 if (dfa->inveclosures != NULL)
607 re_node_set_free (dfa->inveclosures + i);
608 if (dfa->edests != NULL)
609 re_node_set_free (dfa->edests + i);
610 }
611 re_free (dfa->edests);
612 re_free (dfa->eclosures);
613 re_free (dfa->inveclosures);
614 re_free (dfa->nodes);
3b0bdc72 615
ee70274a
UD
616 if (dfa->state_table)
617 for (i = 0; i <= dfa->state_hash_mask; ++i)
618 {
619 struct re_state_table_entry *entry = dfa->state_table + i;
620 for (j = 0; j < entry->num; ++j)
621 {
622 re_dfastate_t *state = entry->array[j];
623 free_state (state);
624 }
21f5de55 625 re_free (entry->array);
ee70274a 626 }
71ccd330 627 re_free (dfa->state_table);
65e6becf 628#ifdef RE_ENABLE_I18N
e40a38b3
UD
629 if (dfa->sb_char != utf8_sb_map)
630 re_free (dfa->sb_char);
a96c63ed 631#endif
c06a6956 632 re_free (dfa->subexp_map);
0742e48e 633#ifdef DEBUG
71ccd330 634 re_free (dfa->re_str);
0742e48e 635#endif
71ccd330
UD
636
637 re_free (dfa);
638}
639
640
641/* Free dynamically allocated space used by PREG. */
642
643void
9dd346ff 644regfree (regex_t *preg)
71ccd330 645{
eb04c213 646 re_dfa_t *dfa = preg->buffer;
71ccd330 647 if (BE (dfa != NULL, 1))
eb04c213
AZ
648 {
649 lock_fini (dfa->lock);
650 free_dfa_content (dfa);
651 }
86576b62
UD
652 preg->buffer = NULL;
653 preg->allocated = 0;
71ccd330 654
3b0bdc72 655 re_free (preg->fastmap);
86576b62
UD
656 preg->fastmap = NULL;
657
658 re_free (preg->translate);
659 preg->translate = NULL;
3b0bdc72
UD
660}
661#ifdef _LIBC
b68f8620 662libc_hidden_def (__regfree)
3b0bdc72
UD
663weak_alias (__regfree, regfree)
664#endif
665\f
666/* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
668
669#if defined _REGEX_RE_COMP || defined _LIBC
670
671/* BSD has one and only one pattern buffer. */
672static struct re_pattern_buffer re_comp_buf;
673
674char *
675# ifdef _LIBC
676/* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679weak_function
680# endif
80d9be81 681re_comp (const char *s)
3b0bdc72
UD
682{
683 reg_errcode_t ret;
240e87c2 684 char *fastmap;
3b0bdc72
UD
685
686 if (!s)
687 {
688 if (!re_comp_buf.buffer)
689 return gettext ("No previous regular expression");
690 return 0;
691 }
692
240e87c2
RM
693 if (re_comp_buf.buffer)
694 {
695 fastmap = re_comp_buf.fastmap;
696 re_comp_buf.fastmap = NULL;
697 __regfree (&re_comp_buf);
1b2c2628 698 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
699 re_comp_buf.fastmap = fastmap;
700 }
701
702 if (re_comp_buf.fastmap == NULL)
3b0bdc72 703 {
eb04c213 704 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
3b0bdc72 705 if (re_comp_buf.fastmap == NULL)
6455d255
UD
706 return (char *) gettext (__re_error_msgid
707 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
708 }
709
d3821ab0 710 /* Since 're_exec' always passes NULL for the 'regs' argument, we
3b0bdc72
UD
711 don't need to initialize the pattern buffer fields which affect it. */
712
713 /* Match anchors at newlines. */
714 re_comp_buf.newline_anchor = 1;
715
716 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
717
718 if (!ret)
719 return NULL;
720
eb04c213 721 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
6455d255 722 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 723}
240e87c2
RM
724
725#ifdef _LIBC
c877418f 726libc_freeres_fn (free_mem)
240e87c2
RM
727{
728 __regfree (&re_comp_buf);
729}
240e87c2
RM
730#endif
731
3b0bdc72
UD
732#endif /* _REGEX_RE_COMP */
733\f
734/* Internal entry point.
735 Compile the regular expression PATTERN, whose length is LENGTH.
736 SYNTAX indicate regular expression's syntax. */
737
738static reg_errcode_t
0fd8ae9c
UD
739re_compile_internal (regex_t *preg, const char * pattern, size_t length,
740 reg_syntax_t syntax)
3b0bdc72
UD
741{
742 reg_errcode_t err = REG_NOERROR;
743 re_dfa_t *dfa;
744 re_string_t regexp;
745
746 /* Initialize the pattern buffer. */
747 preg->fastmap_accurate = 0;
748 preg->syntax = syntax;
749 preg->not_bol = preg->not_eol = 0;
750 preg->used = 0;
751 preg->re_nsub = 0;
1b2c2628
UD
752 preg->can_be_null = 0;
753 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
754
755 /* Initialize the dfa. */
eb04c213 756 dfa = preg->buffer;
951d6408 757 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3b0bdc72
UD
758 {
759 /* If zero allocated, but buffer is non-null, try to realloc
760 enough space. This loses if buffer's address is bogus, but
761 that is the user's responsibility. If ->buffer is NULL this
762 is a simple allocation. */
763 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
764 if (dfa == NULL)
765 return REG_ESPACE;
3b0bdc72 766 preg->allocated = sizeof (re_dfa_t);
eb04c213 767 preg->buffer = dfa;
3b0bdc72 768 }
3b0bdc72
UD
769 preg->used = sizeof (re_dfa_t);
770
771 err = init_dfa (dfa, length);
eb04c213
AZ
772 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
773 err = REG_ESPACE;
bc15410e 774 if (BE (err != REG_NOERROR, 0))
3b0bdc72 775 {
ee70274a 776 free_dfa_content (dfa);
3b0bdc72 777 preg->buffer = NULL;
ca3c505e 778 preg->allocated = 0;
3b0bdc72
UD
779 return err;
780 }
0742e48e 781#ifdef DEBUG
01ed6ceb 782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
0742e48e
UD
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
785#endif
3b0bdc72 786
612546c6 787 err = re_string_construct (&regexp, pattern, length, preg->translate,
eb04c213 788 (syntax & RE_ICASE) != 0, dfa);
bc15410e 789 if (BE (err != REG_NOERROR, 0))
3b0bdc72 790 {
ee70274a
UD
791 re_compile_internal_free_return:
792 free_workarea_compile (preg);
793 re_string_destruct (&regexp);
eb04c213 794 lock_fini (dfa->lock);
ee70274a 795 free_dfa_content (dfa);
3b0bdc72 796 preg->buffer = NULL;
ca3c505e 797 preg->allocated = 0;
3b0bdc72
UD
798 return err;
799 }
800
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 804 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
805 goto re_compile_internal_free_return;
806
02f3550c
UD
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
811
14744156
UD
812#ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
ad7f28c2 814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
14744156
UD
815 optimize_utf8 (dfa);
816#endif
817
3b0bdc72
UD
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
3b0bdc72 820
3b0bdc72
UD
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (&regexp);
824
71ccd330
UD
825 if (BE (err != REG_NOERROR, 0))
826 {
eb04c213 827 lock_fini (dfa->lock);
71ccd330
UD
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
ca3c505e 830 preg->allocated = 0;
71ccd330
UD
831 }
832
3b0bdc72
UD
833 return err;
834}
835
836/* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
838
839static reg_errcode_t
0fd8ae9c 840init_dfa (re_dfa_t *dfa, size_t pat_len)
3b0bdc72 841{
eb04c213 842 __re_size_t table_size;
e40a38b3 843#ifndef _LIBC
eb04c213
AZ
844 const char *codeset_name;
845#endif
846#ifdef RE_ENABLE_I18N
847 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
848#else
849 size_t max_i18n_object_size = 0;
e40a38b3 850#endif
eb04c213
AZ
851 size_t max_object_size =
852 MAX (sizeof (struct re_state_table_entry),
853 MAX (sizeof (re_token_t),
854 MAX (sizeof (re_node_set),
855 MAX (sizeof (regmatch_t),
856 max_i18n_object_size))));
81c64d40
UD
857
858 memset (dfa, '\0', sizeof (re_dfa_t));
859
ee70274a
UD
860 /* Force allocation of str_tree_storage the first time. */
861 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
862
eb04c213
AZ
863 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
864 calculation below, and for similar doubling calculations
865 elsewhere. And it's <= rather than <, because some of the
866 doubling calculations add 1 afterwards. */
867 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
01ed6ceb
UD
868 return REG_ESPACE;
869
3b0bdc72
UD
870 dfa->nodes_alloc = pat_len + 1;
871 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
872
3b0bdc72 873 /* table_size = 2 ^ ceil(log pat_len) */
01ed6ceb 874 for (table_size = 1; ; table_size <<= 1)
3b0bdc72
UD
875 if (table_size > pat_len)
876 break;
877
878 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
879 dfa->state_hash_mask = table_size - 1;
880
3c0fb574
UD
881 dfa->mb_cur_max = MB_CUR_MAX;
882#ifdef _LIBC
bb3f4825 883 if (dfa->mb_cur_max == 6
3c0fb574
UD
884 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
885 dfa->is_utf8 = 1;
f0c7c524
UD
886 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
887 != 0);
e40a38b3 888#else
e40a38b3 889 codeset_name = nl_langinfo (CODESET);
eb04c213
AZ
890 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
891 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
892 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
893 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
e40a38b3
UD
894 dfa->is_utf8 = 1;
895
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
3c0fb574 899#endif
e40a38b3 900
65e6becf
UD
901#ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
903 {
65e6becf 904 if (dfa->is_utf8)
e40a38b3 905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
65e6becf 906 else
e40a38b3
UD
907 {
908 int i, j, ch;
909
2c05d33f 910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
e40a38b3
UD
911 if (BE (dfa->sb_char == NULL, 0))
912 return REG_ESPACE;
913
2c05d33f
UD
914 /* Set the bits corresponding to single byte chars. */
915 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
916 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
e40a38b3 917 {
2d87db5b 918 wint_t wch = __btowc (ch);
e40a38b3 919 if (wch != WEOF)
2c05d33f 920 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
e40a38b3 921# ifndef _LIBC
2d87db5b 922 if (isascii (ch) && wch != ch)
e40a38b3
UD
923 dfa->map_notascii = 1;
924# endif
925 }
926 }
65e6becf
UD
927 }
928#endif
3c0fb574 929
ae73c6c1 930 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
ee70274a 931 return REG_ESPACE;
3b0bdc72
UD
932 return REG_NOERROR;
933}
934
935/* Initialize WORD_CHAR table, which indicate which character is
936 "word". In this case "word" means that it is the word construction
937 character used by some operators like "\<", "\>", etc. */
938
56b168be 939static void
0fd8ae9c 940init_word_char (re_dfa_t *dfa)
3b0bdc72 941{
9f115170 942 int i = 0;
eb04c213 943 int j;
9f115170 944 int ch = 0;
eb04c213 945 dfa->word_ops_used = 1;
9f115170
UD
946 if (BE (dfa->map_notascii == 0, 1))
947 {
0285e6bd
PE
948 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
949 them, an issue when this code is used in Gnulib. */
567d8c1f
PE
950 bitset_word_t bits0 = 0x00000000;
951 bitset_word_t bits1 = 0x03ff0000;
952 bitset_word_t bits2 = 0x87fffffe;
953 bitset_word_t bits3 = 0x07fffffe;
954 if (BITSET_WORD_BITS == 64)
9f115170 955 {
0285e6bd 956 /* Pacify gcc -Woverflow on 32-bit platformns. */
567d8c1f
PE
957 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
958 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
9f115170
UD
959 i = 2;
960 }
567d8c1f 961 else if (BITSET_WORD_BITS == 32)
9f115170 962 {
567d8c1f
PE
963 dfa->word_char[0] = bits0;
964 dfa->word_char[1] = bits1;
965 dfa->word_char[2] = bits2;
966 dfa->word_char[3] = bits3;
9f115170
UD
967 i = 4;
968 }
969 else
567d8c1f 970 goto general_case;
9f115170
UD
971 ch = 128;
972
973 if (BE (dfa->is_utf8, 1))
974 {
975 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
976 return;
977 }
978 }
979
567d8c1f 980 general_case:
9f115170 981 for (; i < BITSET_WORDS; ++i)
eb04c213 982 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3b0bdc72 983 if (isalnum (ch) || ch == '_')
2c05d33f 984 dfa->word_char[i] |= (bitset_word_t) 1 << j;
3b0bdc72
UD
985}
986
987/* Free the work area which are only used while compiling. */
988
989static void
0fd8ae9c 990free_workarea_compile (regex_t *preg)
3b0bdc72 991{
eb04c213 992 re_dfa_t *dfa = preg->buffer;
ee70274a
UD
993 bin_tree_storage_t *storage, *next;
994 for (storage = dfa->str_tree_storage; storage; storage = next)
995 {
996 next = storage->next;
997 re_free (storage);
998 }
999 dfa->str_tree_storage = NULL;
1000 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3b0bdc72 1001 dfa->str_tree = NULL;
a7d5c291
UD
1002 re_free (dfa->org_indices);
1003 dfa->org_indices = NULL;
3b0bdc72
UD
1004}
1005
1006/* Create initial states for all contexts. */
1007
1008static reg_errcode_t
0fd8ae9c 1009create_initial_state (re_dfa_t *dfa)
3b0bdc72 1010{
eb04c213 1011 Idx first, i;
3b0bdc72
UD
1012 reg_errcode_t err;
1013 re_node_set init_nodes;
1014
1015 /* Initial states have the epsilon closure of the node which is
1016 the first node of the regular expression. */
02f3550c 1017 first = dfa->str_tree->first->node_idx;
3b0bdc72
UD
1018 dfa->init_node = first;
1019 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 1020 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
1021 return err;
1022
1023 /* The back-references which are in initial states can epsilon transit,
1024 since in this case all of the subexpressions can be null.
1025 Then we add epsilon closures of the nodes which are the next nodes of
1026 the back-references. */
1027 if (dfa->nbackref > 0)
1028 for (i = 0; i < init_nodes.nelem; ++i)
1029 {
eb04c213 1030 Idx node_idx = init_nodes.elems[i];
15a7d175
UD
1031 re_token_type_t type = dfa->nodes[node_idx].type;
1032
eb04c213 1033 Idx clexp_idx;
15a7d175
UD
1034 if (type != OP_BACK_REF)
1035 continue;
1036 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1037 {
1038 re_token_t *clexp_node;
1039 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1040 if (clexp_node->type == OP_CLOSE_SUBEXP
ae73c6c1 1041 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
15a7d175
UD
1042 break;
1043 }
1044 if (clexp_idx == init_nodes.nelem)
1045 continue;
1046
1047 if (type == OP_BACK_REF)
1048 {
eb04c213 1049 Idx dest_idx = dfa->edests[node_idx].elems[0];
15a7d175
UD
1050 if (!re_node_set_contains (&init_nodes, dest_idx))
1051 {
eb04c213
AZ
1052 reg_errcode_t merge_err
1053 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1054 if (merge_err != REG_NOERROR)
1055 return merge_err;
15a7d175
UD
1056 i = 0;
1057 }
1058 }
3b0bdc72
UD
1059 }
1060
1061 /* It must be the first time to invoke acquire_state. */
a9388965
UD
1062 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1063 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 1064 if (BE (dfa->init_state == NULL, 0))
a9388965 1065 return err;
3b0bdc72
UD
1066 if (dfa->init_state->has_constraint)
1067 {
a9388965 1068 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1069 CONTEXT_WORD);
a9388965 1070 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1071 CONTEXT_NEWLINE);
a9388965 1072 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
1073 &init_nodes,
1074 CONTEXT_NEWLINE
1075 | CONTEXT_BEGBUF);
bc15410e 1076 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
15a7d175
UD
1077 || dfa->init_state_begbuf == NULL, 0))
1078 return err;
3b0bdc72
UD
1079 }
1080 else
1081 dfa->init_state_word = dfa->init_state_nl
1082 = dfa->init_state_begbuf = dfa->init_state;
1083
3b0bdc72
UD
1084 re_node_set_free (&init_nodes);
1085 return REG_NOERROR;
1086}
1087\f
14744156
UD
1088#ifdef RE_ENABLE_I18N
1089/* If it is possible to do searching in single byte encoding instead of UTF-8
1090 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1091 DFA nodes where needed. */
1092
1093static void
0fd8ae9c 1094optimize_utf8 (re_dfa_t *dfa)
14744156 1095{
eb04c213
AZ
1096 Idx node;
1097 int i;
1098 bool mb_chars = false;
1099 bool has_period = false;
14744156
UD
1100
1101 for (node = 0; node < dfa->nodes_len; ++node)
1102 switch (dfa->nodes[node].type)
1103 {
1104 case CHARACTER:
eb04c213
AZ
1105 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1106 mb_chars = true;
14744156
UD
1107 break;
1108 case ANCHOR:
bc3e1c12 1109 switch (dfa->nodes[node].opr.ctx_type)
14744156
UD
1110 {
1111 case LINE_FIRST:
1112 case LINE_LAST:
1113 case BUF_FIRST:
1114 case BUF_LAST:
1115 break;
1116 default:
0caca71a
UD
1117 /* Word anchors etc. cannot be handled. It's okay to test
1118 opr.ctx_type since constraints (for all DFA nodes) are
1119 created by ORing one or more opr.ctx_type values. */
14744156
UD
1120 return;
1121 }
1122 break;
ad7f28c2 1123 case OP_PERIOD:
eb04c213 1124 has_period = true;
21f5de55 1125 break;
14744156
UD
1126 case OP_BACK_REF:
1127 case OP_ALT:
1128 case END_OF_RE:
14744156 1129 case OP_DUP_ASTERISK:
14744156
UD
1130 case OP_OPEN_SUBEXP:
1131 case OP_CLOSE_SUBEXP:
1132 break;
02f3550c
UD
1133 case COMPLEX_BRACKET:
1134 return;
a8067e8f 1135 case SIMPLE_BRACKET:
eb04c213
AZ
1136 /* Just double check. */
1137 {
1138 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1139 ? 0
1140 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1141 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1142 {
1143 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1144 return;
1145 rshift = 0;
1146 }
1147 }
a8067e8f 1148 break;
14744156 1149 default:
02f3550c 1150 abort ();
14744156
UD
1151 }
1152
ad7f28c2 1153 if (mb_chars || has_period)
5f93cd52 1154 for (node = 0; node < dfa->nodes_len; ++node)
ad7f28c2
UD
1155 {
1156 if (dfa->nodes[node].type == CHARACTER
eb04c213 1157 && dfa->nodes[node].opr.c >= ASCII_CHARS)
ad7f28c2
UD
1158 dfa->nodes[node].mb_partial = 0;
1159 else if (dfa->nodes[node].type == OP_PERIOD)
1160 dfa->nodes[node].type = OP_UTF8_PERIOD;
1161 }
5f93cd52 1162
14744156
UD
1163 /* The search can be in single byte locale. */
1164 dfa->mb_cur_max = 1;
1165 dfa->is_utf8 = 0;
ad7f28c2 1166 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
14744156
UD
1167}
1168#endif
1169\f
3b0bdc72
UD
1170/* Analyze the structure tree, and calculate "first", "next", "edest",
1171 "eclosure", and "inveclosure". */
1172
1173static reg_errcode_t
0fd8ae9c 1174analyze (regex_t *preg)
3b0bdc72 1175{
eb04c213 1176 re_dfa_t *dfa = preg->buffer;
3b0bdc72
UD
1177 reg_errcode_t ret;
1178
1179 /* Allocate arrays. */
eb04c213
AZ
1180 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1181 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
3b0bdc72
UD
1182 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1183 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
a7d5c291 1184 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
963d8d78 1185 || dfa->eclosures == NULL, 0))
3b0bdc72 1186 return REG_ESPACE;
02f3550c 1187
eb04c213 1188 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
02f3550c 1189 if (dfa->subexp_map != NULL)
3b0bdc72 1190 {
eb04c213 1191 Idx i;
02f3550c
UD
1192 for (i = 0; i < preg->re_nsub; i++)
1193 dfa->subexp_map[i] = i;
1194 preorder (dfa->str_tree, optimize_subexps, dfa);
1195 for (i = 0; i < preg->re_nsub; i++)
1196 if (dfa->subexp_map[i] != i)
1197 break;
1198 if (i == preg->re_nsub)
1199 {
eb04c213 1200 re_free (dfa->subexp_map);
02f3550c
UD
1201 dfa->subexp_map = NULL;
1202 }
3b0bdc72
UD
1203 }
1204
02f3550c
UD
1205 ret = postorder (dfa->str_tree, lower_subexps, preg);
1206 if (BE (ret != REG_NOERROR, 0))
1207 return ret;
1208 ret = postorder (dfa->str_tree, calc_first, dfa);
1209 if (BE (ret != REG_NOERROR, 0))
1210 return ret;
1211 preorder (dfa->str_tree, calc_next, dfa);
1212 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1213 if (BE (ret != REG_NOERROR, 0))
1214 return ret;
1215 ret = calc_eclosure (dfa);
1216 if (BE (ret != REG_NOERROR, 0))
1217 return ret;
963d8d78
UD
1218
1219 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1220 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1221 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1222 || dfa->nbackref)
1223 {
1224 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1225 if (BE (dfa->inveclosures == NULL, 0))
21f5de55 1226 return REG_ESPACE;
963d8d78
UD
1227 ret = calc_inveclosure (dfa);
1228 }
1229
02f3550c
UD
1230 return ret;
1231}
1232
1233/* Our parse trees are very unbalanced, so we cannot use a stack to
1234 implement parse tree visits. Instead, we use parent pointers and
1235 some hairy code in these two functions. */
1236static reg_errcode_t
0fd8ae9c
UD
1237postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238 void *extra)
02f3550c
UD
1239{
1240 bin_tree_t *node, *prev;
1241
1242 for (node = root; ; )
3b0bdc72 1243 {
02f3550c
UD
1244 /* Descend down the tree, preferably to the left (or to the right
1245 if that's the only child). */
1246 while (node->left || node->right)
1247 if (node->left)
21f5de55
PE
1248 node = node->left;
1249 else
1250 node = node->right;
02f3550c
UD
1251
1252 do
1253 {
1254 reg_errcode_t err = fn (extra, node);
1255 if (BE (err != REG_NOERROR, 0))
1256 return err;
21f5de55 1257 if (node->parent == NULL)
02f3550c
UD
1258 return REG_NOERROR;
1259 prev = node;
1260 node = node->parent;
1261 }
1262 /* Go up while we have a node that is reached from the right. */
1263 while (node->right == prev || node->right == NULL);
1264 node = node->right;
3b0bdc72 1265 }
3b0bdc72
UD
1266}
1267
02f3550c 1268static reg_errcode_t
0fd8ae9c
UD
1269preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1270 void *extra)
02f3550c
UD
1271{
1272 bin_tree_t *node;
1273
1274 for (node = root; ; )
1275 {
1276 reg_errcode_t err = fn (extra, node);
1277 if (BE (err != REG_NOERROR, 0))
1278 return err;
0ecb606c 1279
02f3550c
UD
1280 /* Go to the left node, or up and to the right. */
1281 if (node->left)
1282 node = node->left;
1283 else
1284 {
1285 bin_tree_t *prev = NULL;
1286 while (node->right == prev || node->right == NULL)
1287 {
1288 prev = node;
1289 node = node->parent;
1290 if (!node)
21f5de55 1291 return REG_NOERROR;
02f3550c
UD
1292 }
1293 node = node->right;
1294 }
1295 }
1296}
1297
1298/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1299 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1300 backreferences as well. Requires a preorder visit. */
0ecb606c 1301static reg_errcode_t
0fd8ae9c 1302optimize_subexps (void *extra, bin_tree_t *node)
0ecb606c 1303{
02f3550c 1304 re_dfa_t *dfa = (re_dfa_t *) extra;
0ecb606c 1305
02f3550c 1306 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3b0bdc72 1307 {
02f3550c
UD
1308 int idx = node->token.opr.idx;
1309 node->token.opr.idx = dfa->subexp_map[idx];
1310 dfa->used_bkref_map |= 1 << node->token.opr.idx;
0ecb606c 1311 }
02f3550c
UD
1312
1313 else if (node->token.type == SUBEXP
21f5de55 1314 && node->left && node->left->token.type == SUBEXP)
0ecb606c 1315 {
eb04c213 1316 Idx other_idx = node->left->token.opr.idx;
02f3550c
UD
1317
1318 node->left = node->left->left;
1319 if (node->left)
21f5de55 1320 node->left->parent = node;
02f3550c
UD
1321
1322 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
2c05d33f 1323 if (other_idx < BITSET_WORD_BITS)
eb04c213 1324 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3b0bdc72 1325 }
02f3550c 1326
3b0bdc72
UD
1327 return REG_NOERROR;
1328}
1329
02f3550c
UD
1330/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1331 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1332static reg_errcode_t
0fd8ae9c 1333lower_subexps (void *extra, bin_tree_t *node)
3b0bdc72 1334{
02f3550c
UD
1335 regex_t *preg = (regex_t *) extra;
1336 reg_errcode_t err = REG_NOERROR;
3b0bdc72 1337
02f3550c 1338 if (node->left && node->left->token.type == SUBEXP)
0ecb606c 1339 {
02f3550c
UD
1340 node->left = lower_subexp (&err, preg, node->left);
1341 if (node->left)
1342 node->left->parent = node;
1343 }
1344 if (node->right && node->right->token.type == SUBEXP)
1345 {
1346 node->right = lower_subexp (&err, preg, node->right);
1347 if (node->right)
1348 node->right->parent = node;
3b0bdc72 1349 }
3b0bdc72 1350
02f3550c
UD
1351 return err;
1352}
3b0bdc72 1353
02f3550c 1354static bin_tree_t *
0fd8ae9c 1355lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
0ecb606c 1356{
eb04c213 1357 re_dfa_t *dfa = preg->buffer;
02f3550c
UD
1358 bin_tree_t *body = node->left;
1359 bin_tree_t *op, *cls, *tree1, *tree;
1360
1361 if (preg->no_sub
744eb12b
UD
1362 /* We do not optimize empty subexpressions, because otherwise we may
1363 have bad CONCAT nodes with NULL children. This is obviously not
1364 very common, so we do not lose much. An example that triggers
1365 this case is the sed "script" /\(\)/x. */
1366 && node->left != NULL
2c05d33f
UD
1367 && (node->token.opr.idx >= BITSET_WORD_BITS
1368 || !(dfa->used_bkref_map
1369 & ((bitset_word_t) 1 << node->token.opr.idx))))
02f3550c
UD
1370 return node->left;
1371
1372 /* Convert the SUBEXP node to the concatenation of an
1373 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1374 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1375 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1376 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1377 tree = create_tree (dfa, op, tree1, CONCAT);
1378 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
0ecb606c 1379 {
02f3550c
UD
1380 *err = REG_ESPACE;
1381 return NULL;
0ecb606c 1382 }
3b0bdc72 1383
02f3550c
UD
1384 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1385 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1386 return tree;
1387}
a334319f 1388
02f3550c
UD
1389/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1390 nodes. Requires a postorder visit. */
1391static reg_errcode_t
0fd8ae9c 1392calc_first (void *extra, bin_tree_t *node)
02f3550c
UD
1393{
1394 re_dfa_t *dfa = (re_dfa_t *) extra;
1395 if (node->token.type == CONCAT)
1396 {
1397 node->first = node->left->first;
1398 node->node_idx = node->left->node_idx;
1399 }
1400 else
1401 {
1402 node->first = node;
1403 node->node_idx = re_dfa_add_node (dfa, node->token);
1404 if (BE (node->node_idx == -1, 0))
21f5de55 1405 return REG_ESPACE;
0caca71a 1406 if (node->token.type == ANCHOR)
21f5de55 1407 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
02f3550c
UD
1408 }
1409 return REG_NOERROR;
1410}
1411
1412/* Pass 2: compute NEXT on the tree. Preorder visit. */
1413static reg_errcode_t
0fd8ae9c 1414calc_next (void *extra, bin_tree_t *node)
02f3550c
UD
1415{
1416 switch (node->token.type)
3b0bdc72
UD
1417 {
1418 case OP_DUP_ASTERISK:
02f3550c 1419 node->left->next = node;
3b0bdc72
UD
1420 break;
1421 case CONCAT:
02f3550c
UD
1422 node->left->next = node->right->first;
1423 node->right->next = node->next;
1424 break;
3b0bdc72 1425 default:
02f3550c
UD
1426 if (node->left)
1427 node->left->next = node->next;
1428 if (node->right)
21f5de55 1429 node->right->next = node->next;
3b0bdc72
UD
1430 break;
1431 }
02f3550c 1432 return REG_NOERROR;
3b0bdc72
UD
1433}
1434
02f3550c
UD
1435/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1436static reg_errcode_t
0fd8ae9c 1437link_nfa_nodes (void *extra, bin_tree_t *node)
a334319f 1438{
02f3550c 1439 re_dfa_t *dfa = (re_dfa_t *) extra;
eb04c213 1440 Idx idx = node->node_idx;
02f3550c
UD
1441 reg_errcode_t err = REG_NOERROR;
1442
1443 switch (node->token.type)
3b0bdc72 1444 {
02f3550c
UD
1445 case CONCAT:
1446 break;
1447
1448 case END_OF_RE:
1449 assert (node->next == NULL);
1450 break;
1451
1452 case OP_DUP_ASTERISK:
1453 case OP_ALT:
1454 {
eb04c213 1455 Idx left, right;
02f3550c
UD
1456 dfa->has_plural_match = 1;
1457 if (node->left != NULL)
1458 left = node->left->first->node_idx;
1459 else
1460 left = node->next->node_idx;
1461 if (node->right != NULL)
1462 right = node->right->first->node_idx;
1463 else
1464 right = node->next->node_idx;
1465 assert (left > -1);
1466 assert (right > -1);
1467 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1468 }
1469 break;
1470
1471 case ANCHOR:
1472 case OP_OPEN_SUBEXP:
1473 case OP_CLOSE_SUBEXP:
1474 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1475 break;
1476
1477 case OP_BACK_REF:
1478 dfa->nexts[idx] = node->next->node_idx;
1479 if (node->token.type == OP_BACK_REF)
2da42bc0 1480 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
02f3550c
UD
1481 break;
1482
1483 default:
1484 assert (!IS_EPSILON_NODE (node->token.type));
1485 dfa->nexts[idx] = node->next->node_idx;
1486 break;
3b0bdc72 1487 }
02f3550c
UD
1488
1489 return err;
3b0bdc72
UD
1490}
1491
485d775d
UD
1492/* Duplicate the epsilon closure of the node ROOT_NODE.
1493 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1494 to their own constraint. */
1495
1496static reg_errcode_t
eb04c213
AZ
1497duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1498 Idx root_node, unsigned int init_constraint)
485d775d 1499{
eb04c213
AZ
1500 Idx org_node, clone_node;
1501 bool ok;
485d775d
UD
1502 unsigned int constraint = init_constraint;
1503 for (org_node = top_org_node, clone_node = top_clone_node;;)
1504 {
eb04c213 1505 Idx org_dest, clone_dest;
485d775d 1506 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1507 {
485d775d
UD
1508 /* If the back reference epsilon-transit, its destination must
1509 also have the constraint. Then duplicate the epsilon closure
1510 of the destination of the back reference, and store it in
1511 edests of the back reference. */
15a7d175
UD
1512 org_dest = dfa->nexts[org_node];
1513 re_node_set_empty (dfa->edests + clone_node);
2d87db5b
UD
1514 clone_dest = duplicate_node (dfa, org_dest, constraint);
1515 if (BE (clone_dest == -1, 0))
1516 return REG_ESPACE;
15a7d175 1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
eb04c213
AZ
1518 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (! ok, 0))
15a7d175
UD
1520 return REG_ESPACE;
1521 }
485d775d 1522 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1523 {
485d775d
UD
1524 /* In case of the node can't epsilon-transit, don't duplicate the
1525 destination and store the original destination as the
1526 destination of the node. */
15a7d175
UD
1527 dfa->nexts[clone_node] = dfa->nexts[org_node];
1528 break;
1529 }
485d775d 1530 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1531 {
485d775d
UD
1532 /* In case of the node can epsilon-transit, and it has only one
1533 destination. */
15a7d175
UD
1534 org_dest = dfa->edests[org_node].elems[0];
1535 re_node_set_empty (dfa->edests + clone_node);
d3821ab0 1536 /* If the node is root_node itself, it means the epsilon closure
eb04c213 1537 has a loop. Then tie it to the destination of the root_node. */
0caca71a 1538 if (org_node == root_node && clone_node != org_node)
15a7d175 1539 {
eb04c213
AZ
1540 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1541 if (BE (! ok, 0))
1542 return REG_ESPACE;
0caca71a 1543 break;
15a7d175 1544 }
d3821ab0 1545 /* In case the node has another constraint, append it. */
0caca71a 1546 constraint |= dfa->nodes[org_node].constraint;
2d87db5b
UD
1547 clone_dest = duplicate_node (dfa, org_dest, constraint);
1548 if (BE (clone_dest == -1, 0))
1549 return REG_ESPACE;
eb04c213
AZ
1550 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1551 if (BE (! ok, 0))
15a7d175
UD
1552 return REG_ESPACE;
1553 }
485d775d 1554 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1555 {
485d775d 1556 /* In case of the node can epsilon-transit, and it has two
02f3550c 1557 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
15a7d175
UD
1558 org_dest = dfa->edests[org_node].elems[0];
1559 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1560 /* Search for a duplicated node which satisfies the constraint. */
1561 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1562 if (clone_dest == -1)
1563 {
0caca71a 1564 /* There is no such duplicated node, create a new one. */
2d87db5b
UD
1565 reg_errcode_t err;
1566 clone_dest = duplicate_node (dfa, org_dest, constraint);
1567 if (BE (clone_dest == -1, 0))
1568 return REG_ESPACE;
eb04c213
AZ
1569 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570 if (BE (! ok, 0))
7de66108
UD
1571 return REG_ESPACE;
1572 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1573 root_node, constraint);
1574 if (BE (err != REG_NOERROR, 0))
1575 return err;
a7d5c291
UD
1576 }
1577 else
1578 {
0caca71a 1579 /* There is a duplicated node which satisfies the constraint,
a7d5c291 1580 use it to avoid infinite loop. */
eb04c213
AZ
1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 if (BE (! ok, 0))
a7d5c291
UD
1583 return REG_ESPACE;
1584 }
15a7d175
UD
1585
1586 org_dest = dfa->edests[org_node].elems[1];
2d87db5b
UD
1587 clone_dest = duplicate_node (dfa, org_dest, constraint);
1588 if (BE (clone_dest == -1, 0))
1589 return REG_ESPACE;
eb04c213
AZ
1590 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 if (BE (! ok, 0))
15a7d175
UD
1592 return REG_ESPACE;
1593 }
485d775d
UD
1594 org_node = org_dest;
1595 clone_node = clone_dest;
1596 }
1597 return REG_NOERROR;
1598}
1599
a7d5c291
UD
1600/* Search for a node which is duplicated from the node ORG_NODE, and
1601 satisfies the constraint CONSTRAINT. */
1602
eb04c213
AZ
1603static Idx
1604search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
0fd8ae9c 1605 unsigned int constraint)
a7d5c291 1606{
eb04c213 1607 Idx idx;
a7d5c291
UD
1608 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1609 {
1610 if (org_node == dfa->org_indices[idx]
1611 && constraint == dfa->nodes[idx].constraint)
1612 return idx; /* Found. */
1613 }
1614 return -1; /* Not found. */
1615}
1616
a9388965 1617/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
2d87db5b
UD
1618 Return the index of the new node, or -1 if insufficient storage is
1619 available. */
a9388965 1620
eb04c213
AZ
1621static Idx
1622duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
3b0bdc72 1623{
eb04c213 1624 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
2d87db5b
UD
1625 if (BE (dup_idx != -1, 1))
1626 {
1627 dfa->nodes[dup_idx].constraint = constraint;
0caca71a 1628 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
2d87db5b
UD
1629 dfa->nodes[dup_idx].duplicated = 1;
1630
1631 /* Store the index of the original node. */
1632 dfa->org_indices[dup_idx] = org_idx;
1633 }
1634 return dup_idx;
3b0bdc72
UD
1635}
1636
963d8d78 1637static reg_errcode_t
0fd8ae9c 1638calc_inveclosure (re_dfa_t *dfa)
3b0bdc72 1639{
eb04c213
AZ
1640 Idx src, idx;
1641 bool ok;
963d8d78
UD
1642 for (idx = 0; idx < dfa->nodes_len; ++idx)
1643 re_node_set_init_empty (dfa->inveclosures + idx);
1644
3b0bdc72
UD
1645 for (src = 0; src < dfa->nodes_len; ++src)
1646 {
eb04c213 1647 Idx *elems = dfa->eclosures[src].elems;
3b0bdc72 1648 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175 1649 {
eb04c213
AZ
1650 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1651 if (BE (! ok, 0))
963d8d78 1652 return REG_ESPACE;
15a7d175 1653 }
3b0bdc72 1654 }
963d8d78
UD
1655
1656 return REG_NOERROR;
3b0bdc72
UD
1657}
1658
1659/* Calculate "eclosure" for all the node in DFA. */
1660
1661static reg_errcode_t
0fd8ae9c 1662calc_eclosure (re_dfa_t *dfa)
3b0bdc72 1663{
eb04c213
AZ
1664 Idx node_idx;
1665 bool incomplete;
3b0bdc72
UD
1666#ifdef DEBUG
1667 assert (dfa->nodes_len > 0);
1668#endif
eb04c213 1669 incomplete = false;
3b0bdc72 1670 /* For each nodes, calculate epsilon closure. */
485d775d 1671 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1672 {
a9388965 1673 reg_errcode_t err;
3b0bdc72 1674 re_node_set eclosure_elem;
485d775d 1675 if (node_idx == dfa->nodes_len)
15a7d175
UD
1676 {
1677 if (!incomplete)
1678 break;
eb04c213 1679 incomplete = false;
15a7d175
UD
1680 node_idx = 0;
1681 }
3b0bdc72
UD
1682
1683#ifdef DEBUG
3b0bdc72
UD
1684 assert (dfa->eclosures[node_idx].nelem != -1);
1685#endif
c06a6956 1686
3b0bdc72
UD
1687 /* If we have already calculated, skip it. */
1688 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1689 continue;
d3821ab0 1690 /* Calculate epsilon closure of 'node_idx'. */
eb04c213 1691 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
bc15410e 1692 if (BE (err != REG_NOERROR, 0))
15a7d175 1693 return err;
3b0bdc72
UD
1694
1695 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175 1696 {
eb04c213 1697 incomplete = true;
15a7d175
UD
1698 re_node_set_free (&eclosure_elem);
1699 }
3b0bdc72 1700 }
3b0bdc72
UD
1701 return REG_NOERROR;
1702}
1703
1704/* Calculate epsilon closure of NODE. */
1705
a9388965 1706static reg_errcode_t
eb04c213 1707calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
3b0bdc72 1708{
a9388965 1709 reg_errcode_t err;
eb04c213 1710 Idx i;
3b0bdc72 1711 re_node_set eclosure;
eb04c213
AZ
1712 bool ok;
1713 bool incomplete = false;
a9388965 1714 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1715 if (BE (err != REG_NOERROR, 0))
a9388965 1716 return err;
3b0bdc72
UD
1717
1718 /* This indicates that we are calculating this node now.
1719 We reference this value to avoid infinite loop. */
1720 dfa->eclosures[node].nelem = -1;
1721
0caca71a
UD
1722 /* If the current node has constraints, duplicate all nodes
1723 since they must inherit the constraints. */
1724 if (dfa->nodes[node].constraint
b4ae56bd
UD
1725 && dfa->edests[node].nelem
1726 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
485d775d 1727 {
0caca71a
UD
1728 err = duplicate_node_closure (dfa, node, node, node,
1729 dfa->nodes[node].constraint);
485d775d 1730 if (BE (err != REG_NOERROR, 0))
15a7d175 1731 return err;
485d775d 1732 }
3b0bdc72
UD
1733
1734 /* Expand each epsilon destination nodes. */
485d775d 1735 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1736 for (i = 0; i < dfa->edests[node].nelem; ++i)
1737 {
15a7d175 1738 re_node_set eclosure_elem;
eb04c213
AZ
1739 Idx edest = dfa->edests[node].elems[i];
1740 /* If calculating the epsilon closure of 'edest' is in progress,
15a7d175
UD
1741 return intermediate result. */
1742 if (dfa->eclosures[edest].nelem == -1)
1743 {
eb04c213 1744 incomplete = true;
15a7d175
UD
1745 continue;
1746 }
eb04c213 1747 /* If we haven't calculated the epsilon closure of 'edest' yet,
15a7d175
UD
1748 calculate now. Otherwise use calculated epsilon closure. */
1749 if (dfa->eclosures[edest].nelem == 0)
1750 {
eb04c213 1751 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
15a7d175
UD
1752 if (BE (err != REG_NOERROR, 0))
1753 return err;
1754 }
1755 else
1756 eclosure_elem = dfa->eclosures[edest];
d3821ab0 1757 /* Merge the epsilon closure of 'edest'. */
2da42bc0
UD
1758 err = re_node_set_merge (&eclosure, &eclosure_elem);
1759 if (BE (err != REG_NOERROR, 0))
1760 return err;
d3821ab0 1761 /* If the epsilon closure of 'edest' is incomplete,
15a7d175
UD
1762 the epsilon closure of this node is also incomplete. */
1763 if (dfa->eclosures[edest].nelem == 0)
1764 {
eb04c213 1765 incomplete = true;
15a7d175
UD
1766 re_node_set_free (&eclosure_elem);
1767 }
3b0bdc72
UD
1768 }
1769
21f5de55 1770 /* An epsilon closure includes itself. */
eb04c213
AZ
1771 ok = re_node_set_insert (&eclosure, node);
1772 if (BE (! ok, 0))
21f5de55 1773 return REG_ESPACE;
3b0bdc72
UD
1774 if (incomplete && !root)
1775 dfa->eclosures[node].nelem = 0;
1776 else
1777 dfa->eclosures[node] = eclosure;
a9388965
UD
1778 *new_set = eclosure;
1779 return REG_NOERROR;
3b0bdc72
UD
1780}
1781\f
1782/* Functions for token which are used in the parser. */
1783
1784/* Fetch a token from INPUT.
1785 We must not use this function inside bracket expressions. */
1786
f0d77aa8 1787static void
0fd8ae9c 1788fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
3b0bdc72 1789{
f0d77aa8 1790 re_string_skip_bytes (input, peek_token (result, input, syntax));
3b0bdc72
UD
1791}
1792
1793/* Peek a token from INPUT, and return the length of the token.
1794 We must not use this function inside bracket expressions. */
1795
1796static int
0fd8ae9c 1797peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
1798{
1799 unsigned char c;
1800
1801 if (re_string_eoi (input))
1802 {
1803 token->type = END_OF_RE;
1804 return 0;
1805 }
1806
1807 c = re_string_peek_byte (input, 0);
1808 token->opr.c = c;
1809
65e6becf 1810 token->word_char = 0;
3b0bdc72
UD
1811#ifdef RE_ENABLE_I18N
1812 token->mb_partial = 0;
3c0fb574 1813 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1814 !re_string_first_byte (input, re_string_cur_idx (input)))
1815 {
1816 token->type = CHARACTER;
1817 token->mb_partial = 1;
1818 return 1;
1819 }
1820#endif
1821 if (c == '\\')
1822 {
1823 unsigned char c2;
1824 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1825 {
1826 token->type = BACK_SLASH;
1827 return 1;
1828 }
3b0bdc72
UD
1829
1830 c2 = re_string_peek_byte_case (input, 1);
1831 token->opr.c = c2;
1832 token->type = CHARACTER;
65e6becf
UD
1833#ifdef RE_ENABLE_I18N
1834 if (input->mb_cur_max > 1)
1835 {
1836 wint_t wc = re_string_wchar_at (input,
1837 re_string_cur_idx (input) + 1);
1838 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1839 }
1840 else
1841#endif
1842 token->word_char = IS_WORD_CHAR (c2) != 0;
1843
3b0bdc72 1844 switch (c2)
15a7d175
UD
1845 {
1846 case '|':
1847 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1848 token->type = OP_ALT;
1849 break;
1850 case '1': case '2': case '3': case '4': case '5':
1851 case '6': case '7': case '8': case '9':
1852 if (!(syntax & RE_NO_BK_REFS))
1853 {
1854 token->type = OP_BACK_REF;
ae73c6c1 1855 token->opr.idx = c2 - '1';
15a7d175
UD
1856 }
1857 break;
1858 case '<':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 {
1861 token->type = ANCHOR;
bb3f4825 1862 token->opr.ctx_type = WORD_FIRST;
15a7d175
UD
1863 }
1864 break;
1865 case '>':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 {
1868 token->type = ANCHOR;
bb3f4825 1869 token->opr.ctx_type = WORD_LAST;
15a7d175
UD
1870 }
1871 break;
1872 case 'b':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 {
1875 token->type = ANCHOR;
bb3f4825 1876 token->opr.ctx_type = WORD_DELIM;
15a7d175
UD
1877 }
1878 break;
1879 case 'B':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 {
1882 token->type = ANCHOR;
24992143 1883 token->opr.ctx_type = NOT_WORD_DELIM;
15a7d175
UD
1884 }
1885 break;
1886 case 'w':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 token->type = OP_WORD;
1889 break;
1890 case 'W':
1891 if (!(syntax & RE_NO_GNU_OPS))
1892 token->type = OP_NOTWORD;
1893 break;
e2b6bfa3
UD
1894 case 's':
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = OP_SPACE;
1897 break;
1898 case 'S':
1899 if (!(syntax & RE_NO_GNU_OPS))
1900 token->type = OP_NOTSPACE;
1901 break;
15a7d175
UD
1902 case '`':
1903 if (!(syntax & RE_NO_GNU_OPS))
1904 {
1905 token->type = ANCHOR;
bb3f4825 1906 token->opr.ctx_type = BUF_FIRST;
15a7d175
UD
1907 }
1908 break;
1909 case '\'':
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 {
1912 token->type = ANCHOR;
bb3f4825 1913 token->opr.ctx_type = BUF_LAST;
15a7d175
UD
1914 }
1915 break;
1916 case '(':
1917 if (!(syntax & RE_NO_BK_PARENS))
1918 token->type = OP_OPEN_SUBEXP;
1919 break;
1920 case ')':
1921 if (!(syntax & RE_NO_BK_PARENS))
1922 token->type = OP_CLOSE_SUBEXP;
1923 break;
1924 case '+':
1925 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1926 token->type = OP_DUP_PLUS;
1927 break;
1928 case '?':
1929 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1930 token->type = OP_DUP_QUESTION;
1931 break;
1932 case '{':
1933 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1934 token->type = OP_OPEN_DUP_NUM;
1935 break;
1936 case '}':
1937 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1938 token->type = OP_CLOSE_DUP_NUM;
1939 break;
1940 default:
1941 break;
1942 }
3b0bdc72
UD
1943 return 2;
1944 }
1945
1946 token->type = CHARACTER;
65e6becf
UD
1947#ifdef RE_ENABLE_I18N
1948 if (input->mb_cur_max > 1)
1949 {
1950 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1951 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1952 }
1953 else
1954#endif
1955 token->word_char = IS_WORD_CHAR (token->opr.c);
1956
3b0bdc72
UD
1957 switch (c)
1958 {
1959 case '\n':
1960 if (syntax & RE_NEWLINE_ALT)
15a7d175 1961 token->type = OP_ALT;
3b0bdc72
UD
1962 break;
1963 case '|':
1964 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1965 token->type = OP_ALT;
3b0bdc72
UD
1966 break;
1967 case '*':
1968 token->type = OP_DUP_ASTERISK;
1969 break;
1970 case '+':
1971 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1972 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1973 break;
1974 case '?':
1975 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1976 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1977 break;
1978 case '{':
1979 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1980 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1981 break;
1982 case '}':
1983 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1984 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1985 break;
1986 case '(':
1987 if (syntax & RE_NO_BK_PARENS)
15a7d175 1988 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1989 break;
1990 case ')':
1991 if (syntax & RE_NO_BK_PARENS)
15a7d175 1992 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1993 break;
1994 case '[':
1995 token->type = OP_OPEN_BRACKET;
1996 break;
1997 case '.':
1998 token->type = OP_PERIOD;
1999 break;
2000 case '^':
134abcb5 2001 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
15a7d175
UD
2002 re_string_cur_idx (input) != 0)
2003 {
2004 char prev = re_string_peek_byte (input, -1);
134abcb5 2005 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
2006 break;
2007 }
3b0bdc72 2008 token->type = ANCHOR;
bb3f4825 2009 token->opr.ctx_type = LINE_FIRST;
3b0bdc72
UD
2010 break;
2011 case '$':
2012 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
15a7d175
UD
2013 re_string_cur_idx (input) + 1 != re_string_length (input))
2014 {
2015 re_token_t next;
2016 re_string_skip_bytes (input, 1);
2017 peek_token (&next, input, syntax);
2018 re_string_skip_bytes (input, -1);
2019 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2020 break;
2021 }
3b0bdc72 2022 token->type = ANCHOR;
bb3f4825 2023 token->opr.ctx_type = LINE_LAST;
3b0bdc72
UD
2024 break;
2025 default:
2026 break;
2027 }
2028 return 1;
2029}
2030
2031/* Peek a token from INPUT, and return the length of the token.
2032 We must not use this function out of bracket expressions. */
2033
2034static int
0fd8ae9c 2035peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
2036{
2037 unsigned char c;
2038 if (re_string_eoi (input))
2039 {
2040 token->type = END_OF_RE;
2041 return 0;
2042 }
2043 c = re_string_peek_byte (input, 0);
2044 token->opr.c = c;
2045
2046#ifdef RE_ENABLE_I18N
3c0fb574 2047 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
2048 !re_string_first_byte (input, re_string_cur_idx (input)))
2049 {
2050 token->type = CHARACTER;
2051 return 1;
2052 }
2053#endif /* RE_ENABLE_I18N */
2054
294b6bcc
UD
2055 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056 && re_string_cur_idx (input) + 1 < re_string_length (input))
3b0bdc72
UD
2057 {
2058 /* In this case, '\' escape a character. */
2059 unsigned char c2;
82bbb29e
UD
2060 re_string_skip_bytes (input, 1);
2061 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
2062 token->opr.c = c2;
2063 token->type = CHARACTER;
2064 return 1;
2065 }
2066 if (c == '[') /* '[' is a special char in a bracket exps. */
2067 {
2068 unsigned char c2;
2069 int token_len;
294b6bcc
UD
2070 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071 c2 = re_string_peek_byte (input, 1);
2072 else
2073 c2 = 0;
3b0bdc72
UD
2074 token->opr.c = c2;
2075 token_len = 2;
2076 switch (c2)
15a7d175
UD
2077 {
2078 case '.':
2079 token->type = OP_OPEN_COLL_ELEM;
2080 break;
eb04c213 2081
15a7d175
UD
2082 case '=':
2083 token->type = OP_OPEN_EQUIV_CLASS;
2084 break;
eb04c213 2085
15a7d175
UD
2086 case ':':
2087 if (syntax & RE_CHAR_CLASSES)
2088 {
2089 token->type = OP_OPEN_CHAR_CLASS;
2090 break;
2091 }
eb04c213 2092 FALLTHROUGH;
15a7d175
UD
2093 default:
2094 token->type = CHARACTER;
2095 token->opr.c = c;
2096 token_len = 1;
2097 break;
2098 }
3b0bdc72
UD
2099 return token_len;
2100 }
2101 switch (c)
2102 {
2103 case '-':
2104 token->type = OP_CHARSET_RANGE;
2105 break;
2106 case ']':
2107 token->type = OP_CLOSE_BRACKET;
2108 break;
2109 case '^':
2110 token->type = OP_NON_MATCH_LIST;
2111 break;
2112 default:
2113 token->type = CHARACTER;
2114 }
2115 return 1;
2116}
2117\f
2118/* Functions for parser. */
2119
2120/* Entry point of the parser.
2121 Parse the regular expression REGEXP and return the structure tree.
d3821ab0 2122 If an error occurs, ERR is set by error code, and return NULL.
3b0bdc72 2123 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
2124 CAT
2125 / \
2126 / \
3b0bdc72
UD
2127 <reg_exp> EOR
2128
2129 CAT means concatenation.
2130 EOR means end of regular expression. */
2131
2132static bin_tree_t *
0fd8ae9c
UD
2133parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2134 reg_errcode_t *err)
3b0bdc72 2135{
eb04c213 2136 re_dfa_t *dfa = preg->buffer;
3b0bdc72
UD
2137 bin_tree_t *tree, *eor, *root;
2138 re_token_t current_token;
56b168be 2139 dfa->syntax = syntax;
f0d77aa8 2140 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2141 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 2142 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72 2143 return NULL;
02f3550c 2144 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
3b0bdc72 2145 if (tree != NULL)
02f3550c 2146 root = create_tree (dfa, tree, eor, CONCAT);
3b0bdc72
UD
2147 else
2148 root = eor;
ee70274a 2149 if (BE (eor == NULL || root == NULL, 0))
485d775d
UD
2150 {
2151 *err = REG_ESPACE;
2152 return NULL;
2153 }
3b0bdc72
UD
2154 return root;
2155}
2156
2157/* This function build the following tree, from regular expression
2158 <branch1>|<branch2>:
15a7d175
UD
2159 ALT
2160 / \
2161 / \
3b0bdc72
UD
2162 <branch1> <branch2>
2163
d3821ab0 2164 ALT means alternative, which represents the operator '|'. */
3b0bdc72
UD
2165
2166static bin_tree_t *
0fd8ae9c 2167parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2168 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2169{
eb04c213 2170 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2171 bin_tree_t *tree, *branch = NULL;
eb04c213 2172 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
3b0bdc72 2173 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 2174 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2175 return NULL;
2176
2177 while (token->type == OP_ALT)
2178 {
f0d77aa8 2179 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2180 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
2181 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2182 {
eb04c213
AZ
2183 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2184 dfa->completed_bkref_map = initial_bkref_map;
15a7d175
UD
2185 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2186 if (BE (*err != REG_NOERROR && branch == NULL, 0))
aa6ec754
AS
2187 {
2188 if (tree != NULL)
2189 postorder (tree, free_tree, NULL);
2190 return NULL;
2191 }
eb04c213 2192 dfa->completed_bkref_map |= accumulated_bkref_map;
15a7d175 2193 }
9b88fc16
UD
2194 else
2195 branch = NULL;
02f3550c 2196 tree = create_tree (dfa, tree, branch, OP_ALT);
ee70274a 2197 if (BE (tree == NULL, 0))
15a7d175
UD
2198 {
2199 *err = REG_ESPACE;
2200 return NULL;
2201 }
3b0bdc72
UD
2202 }
2203 return tree;
2204}
2205
2206/* This function build the following tree, from regular expression
2207 <exp1><exp2>:
15a7d175
UD
2208 CAT
2209 / \
3b0bdc72
UD
2210 / \
2211 <exp1> <exp2>
2212
2213 CAT means concatenation. */
2214
2215static bin_tree_t *
0fd8ae9c 2216parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2217 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2218{
eb04c213
AZ
2219 bin_tree_t *tree, *expr;
2220 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2221 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2222 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2223 return NULL;
2224
2225 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 2226 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72 2227 {
eb04c213
AZ
2228 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2229 if (BE (*err != REG_NOERROR && expr == NULL, 0))
15a7d175 2230 {
b833d51f
UD
2231 if (tree != NULL)
2232 postorder (tree, free_tree, NULL);
15a7d175
UD
2233 return NULL;
2234 }
eb04c213 2235 if (tree != NULL && expr != NULL)
15a7d175 2236 {
eb04c213 2237 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
e9b9cbf5 2238 if (newtree == NULL)
15a7d175 2239 {
eb04c213 2240 postorder (expr, free_tree, NULL);
e9b9cbf5 2241 postorder (tree, free_tree, NULL);
15a7d175
UD
2242 *err = REG_ESPACE;
2243 return NULL;
2244 }
e9b9cbf5 2245 tree = newtree;
15a7d175 2246 }
3b0bdc72 2247 else if (tree == NULL)
eb04c213
AZ
2248 tree = expr;
2249 /* Otherwise expr == NULL, we don't need to create new tree. */
3b0bdc72
UD
2250 }
2251 return tree;
2252}
2253
2254/* This function build the following tree, from regular expression a*:
15a7d175
UD
2255 *
2256 |
2257 a
3b0bdc72
UD
2258*/
2259
2260static bin_tree_t *
0fd8ae9c 2261parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2262 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2263{
eb04c213 2264 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2265 bin_tree_t *tree;
3b0bdc72
UD
2266 switch (token->type)
2267 {
2268 case CHARACTER:
02f3550c 2269 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2270 if (BE (tree == NULL, 0))
15a7d175
UD
2271 {
2272 *err = REG_ESPACE;
2273 return NULL;
2274 }
3b0bdc72 2275#ifdef RE_ENABLE_I18N
3c0fb574 2276 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2277 {
2278 while (!re_string_eoi (regexp)
2279 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2280 {
15a7d175 2281 bin_tree_t *mbc_remain;
f0d77aa8 2282 fetch_token (token, regexp, syntax);
02f3550c
UD
2283 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2284 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
ee70274a 2285 if (BE (mbc_remain == NULL || tree == NULL, 0))
54e1cabc
UD
2286 {
2287 *err = REG_ESPACE;
2288 return NULL;
2289 }
15a7d175 2290 }
3b0bdc72
UD
2291 }
2292#endif
2293 break;
eb04c213 2294
3b0bdc72
UD
2295 case OP_OPEN_SUBEXP:
2296 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 2297 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2298 return NULL;
3b0bdc72 2299 break;
eb04c213 2300
3b0bdc72
UD
2301 case OP_OPEN_BRACKET:
2302 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 2303 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2304 return NULL;
3b0bdc72 2305 break;
eb04c213 2306
3b0bdc72 2307 case OP_BACK_REF:
ae73c6c1 2308 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
15a7d175
UD
2309 {
2310 *err = REG_ESUBREG;
2311 return NULL;
2312 }
ae73c6c1 2313 dfa->used_bkref_map |= 1 << token->opr.idx;
02f3550c 2314 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2315 if (BE (tree == NULL, 0))
15a7d175
UD
2316 {
2317 *err = REG_ESPACE;
2318 return NULL;
2319 }
3b0bdc72
UD
2320 ++dfa->nbackref;
2321 dfa->has_mb_node = 1;
2322 break;
eb04c213 2323
06e8303a
UD
2324 case OP_OPEN_DUP_NUM:
2325 if (syntax & RE_CONTEXT_INVALID_DUP)
2326 {
2327 *err = REG_BADRPT;
2328 return NULL;
2329 }
eb04c213 2330 FALLTHROUGH;
3b0bdc72
UD
2331 case OP_DUP_ASTERISK:
2332 case OP_DUP_PLUS:
2333 case OP_DUP_QUESTION:
3b0bdc72 2334 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2335 {
2336 *err = REG_BADRPT;
2337 return NULL;
2338 }
3b0bdc72 2339 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175 2340 {
f0d77aa8 2341 fetch_token (token, regexp, syntax);
15a7d175
UD
2342 return parse_expression (regexp, preg, token, syntax, nest, err);
2343 }
eb04c213 2344 FALLTHROUGH;
3b0bdc72
UD
2345 case OP_CLOSE_SUBEXP:
2346 if ((token->type == OP_CLOSE_SUBEXP) &&
15a7d175
UD
2347 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348 {
2349 *err = REG_ERPAREN;
2350 return NULL;
2351 }
eb04c213 2352 FALLTHROUGH;
3b0bdc72
UD
2353 case OP_CLOSE_DUP_NUM:
2354 /* We treat it as a normal character. */
2355
2356 /* Then we can these characters as normal characters. */
2357 token->type = CHARACTER;
65e6becf
UD
2358 /* mb_partial and word_char bits should be initialized already
2359 by peek_token. */
02f3550c 2360 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2361 if (BE (tree == NULL, 0))
15a7d175
UD
2362 {
2363 *err = REG_ESPACE;
2364 return NULL;
2365 }
3b0bdc72 2366 break;
eb04c213 2367
3b0bdc72 2368 case ANCHOR:
bb3f4825 2369 if ((token->opr.ctx_type
24992143 2370 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
56b168be
UD
2371 && dfa->word_ops_used == 0)
2372 init_word_char (dfa);
24992143 2373 if (token->opr.ctx_type == WORD_DELIM
21f5de55 2374 || token->opr.ctx_type == NOT_WORD_DELIM)
15a7d175
UD
2375 {
2376 bin_tree_t *tree_first, *tree_last;
24992143
UD
2377 if (token->opr.ctx_type == WORD_DELIM)
2378 {
2379 token->opr.ctx_type = WORD_FIRST;
02f3550c 2380 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2381 token->opr.ctx_type = WORD_LAST;
21f5de55
PE
2382 }
2383 else
2384 {
24992143 2385 token->opr.ctx_type = INSIDE_WORD;
02f3550c 2386 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2387 token->opr.ctx_type = INSIDE_NOTWORD;
21f5de55 2388 }
02f3550c
UD
2389 tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
ee70274a 2391 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
15a7d175
UD
2392 {
2393 *err = REG_ESPACE;
2394 return NULL;
2395 }
2396 }
3b0bdc72 2397 else
15a7d175 2398 {
02f3550c 2399 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2400 if (BE (tree == NULL, 0))
54e1cabc
UD
2401 {
2402 *err = REG_ESPACE;
2403 return NULL;
2404 }
15a7d175 2405 }
3b0bdc72 2406 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2407 by repetition operators.
2408 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 it must not be "<ANCHOR(^)><REPEAT(*)>". */
f0d77aa8 2410 fetch_token (token, regexp, syntax);
3b0bdc72 2411 return tree;
eb04c213 2412
3b0bdc72 2413 case OP_PERIOD:
02f3550c 2414 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2415 if (BE (tree == NULL, 0))
15a7d175
UD
2416 {
2417 *err = REG_ESPACE;
2418 return NULL;
2419 }
3c0fb574 2420 if (dfa->mb_cur_max > 1)
15a7d175 2421 dfa->has_mb_node = 1;
3b0bdc72 2422 break;
eb04c213 2423
3b0bdc72 2424 case OP_WORD:
3b0bdc72 2425 case OP_NOTWORD:
266c1f50 2426 tree = build_charclass_op (dfa, regexp->trans,
eb04c213
AZ
2427 "alnum",
2428 "_",
266c1f50 2429 token->type == OP_NOTWORD, err);
e2b6bfa3
UD
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 return NULL;
2432 break;
eb04c213 2433
e2b6bfa3 2434 case OP_SPACE:
e2b6bfa3 2435 case OP_NOTSPACE:
266c1f50 2436 tree = build_charclass_op (dfa, regexp->trans,
eb04c213
AZ
2437 "space",
2438 "",
266c1f50 2439 token->type == OP_NOTSPACE, err);
bc15410e 2440 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2441 return NULL;
3b0bdc72 2442 break;
eb04c213 2443
3b0bdc72
UD
2444 case OP_ALT:
2445 case END_OF_RE:
2446 return NULL;
eb04c213 2447
3b0bdc72
UD
2448 case BACK_SLASH:
2449 *err = REG_EESCAPE;
2450 return NULL;
eb04c213 2451
3b0bdc72
UD
2452 default:
2453 /* Must not happen? */
2454#ifdef DEBUG
2455 assert (0);
2456#endif
2457 return NULL;
2458 }
f0d77aa8 2459 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2460
2461 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2462 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72 2463 {
eb04c213
AZ
2464 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2465 syntax, err);
4d43ef1e
AS
2466 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2467 {
2468 if (tree != NULL)
2469 postorder (tree, free_tree, NULL);
2470 return NULL;
2471 }
2472 tree = dup_tree;
c34bfc8d
UD
2473 /* In BRE consecutive duplications are not allowed. */
2474 if ((syntax & RE_CONTEXT_INVALID_DUP)
2475 && (token->type == OP_DUP_ASTERISK
2476 || token->type == OP_OPEN_DUP_NUM))
2477 {
4d43ef1e
AS
2478 if (tree != NULL)
2479 postorder (tree, free_tree, NULL);
c34bfc8d
UD
2480 *err = REG_BADRPT;
2481 return NULL;
2482 }
3b0bdc72
UD
2483 }
2484
2485 return tree;
2486}
2487
2488/* This function build the following tree, from regular expression
2489 (<reg_exp>):
15a7d175
UD
2490 SUBEXP
2491 |
2492 <reg_exp>
3b0bdc72
UD
2493*/
2494
2495static bin_tree_t *
0fd8ae9c 2496parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2497 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2498{
eb04c213 2499 re_dfa_t *dfa = preg->buffer;
02f3550c 2500 bin_tree_t *tree;
3b0bdc72
UD
2501 size_t cur_nsub;
2502 cur_nsub = preg->re_nsub++;
81c64d40 2503
f0d77aa8 2504 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2505
2506 /* The subexpression may be a null string. */
2507 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2508 tree = NULL;
3b0bdc72
UD
2509 else
2510 {
2511 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
02f3550c 2512 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
a129c80d
UD
2513 {
2514 if (tree != NULL)
e9b9cbf5 2515 postorder (tree, free_tree, NULL);
a129c80d
UD
2516 *err = REG_EPAREN;
2517 }
02f3550c 2518 if (BE (*err != REG_NOERROR, 0))
15a7d175 2519 return NULL;
3b0bdc72 2520 }
01ed6ceb
UD
2521
2522 if (cur_nsub <= '9' - '1')
2523 dfa->completed_bkref_map |= 1 << cur_nsub;
02f3550c
UD
2524
2525 tree = create_tree (dfa, tree, NULL, SUBEXP);
2526 if (BE (tree == NULL, 0))
485d775d
UD
2527 {
2528 *err = REG_ESPACE;
2529 return NULL;
2530 }
02f3550c 2531 tree->token.opr.idx = cur_nsub;
3b0bdc72
UD
2532 return tree;
2533}
2534
2535/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2536
2537static bin_tree_t *
0fd8ae9c
UD
2538parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2539 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72 2540{
20dc2f79 2541 bin_tree_t *tree = NULL, *old_tree = NULL;
eb04c213 2542 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
3b0bdc72 2543 re_token_t start_token = *token;
6b6557e8 2544
3b0bdc72
UD
2545 if (token->type == OP_OPEN_DUP_NUM)
2546 {
6b6557e8
UD
2547 end = 0;
2548 start = fetch_number (regexp, token, syntax);
3b0bdc72 2549 if (start == -1)
15a7d175
UD
2550 {
2551 if (token->type == CHARACTER && token->opr.c == ',')
2552 start = 0; /* We treat "{,m}" as "{0,m}". */
2553 else
2554 {
2555 *err = REG_BADBR; /* <re>{} is invalid. */
2556 return NULL;
2557 }
2558 }
602c2f9d 2559 if (BE (start != -2, 1))
15a7d175
UD
2560 {
2561 /* We treat "{n}" as "{n,n}". */
2562 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2563 : ((token->type == CHARACTER && token->opr.c == ',')
2564 ? fetch_number (regexp, token, syntax) : -2));
2565 }
602c2f9d 2566 if (BE (start == -2 || end == -2, 0))
15a7d175
UD
2567 {
2568 /* Invalid sequence. */
6b6557e8
UD
2569 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2570 {
2571 if (token->type == END_OF_RE)
2572 *err = REG_EBRACE;
2573 else
2574 *err = REG_BADBR;
2575
2576 return NULL;
2577 }
2578
2579 /* If the syntax bit is set, rollback. */
2580 re_string_set_index (regexp, start_idx);
2581 *token = start_token;
2582 token->type = CHARACTER;
2583 /* mb_partial and word_char bits should be already initialized by
2584 peek_token. */
2585 return elem;
15a7d175 2586 }
6b6557e8 2587
eb04c213
AZ
2588 if (BE ((end != -1 && start > end)
2589 || token->type != OP_CLOSE_DUP_NUM, 0))
15a7d175 2590 {
6b6557e8
UD
2591 /* First number greater than second. */
2592 *err = REG_BADBR;
15a7d175
UD
2593 return NULL;
2594 }
eb04c213
AZ
2595
2596 if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2597 {
2598 *err = REG_ESIZE;
2599 return NULL;
2600 }
6b6557e8
UD
2601 }
2602 else
2603 {
2604 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2606 }
2607
20dc2f79
UD
2608 fetch_token (token, regexp, syntax);
2609
02f3550c 2610 if (BE (elem == NULL, 0))
20dc2f79 2611 return NULL;
02f3550c
UD
2612 if (BE (start == 0 && end == 0, 0))
2613 {
2614 postorder (elem, free_tree, NULL);
2615 return NULL;
2616 }
602c2f9d 2617
6b6557e8 2618 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
20dc2f79 2619 if (BE (start > 0, 0))
6b6557e8
UD
2620 {
2621 tree = elem;
2622 for (i = 2; i <= start; ++i)
7de66108 2623 {
6b6557e8 2624 elem = duplicate_tree (elem, dfa);
02f3550c 2625 tree = create_tree (dfa, tree, elem, CONCAT);
6b6557e8 2626 if (BE (elem == NULL || tree == NULL, 0))
7de66108
UD
2627 goto parse_dup_op_espace;
2628 }
6b6557e8 2629
20dc2f79
UD
2630 if (start == end)
2631 return tree;
a96c63ed 2632
20dc2f79
UD
2633 /* Duplicate ELEM before it is marked optional. */
2634 elem = duplicate_tree (elem, dfa);
7ee03f00
JM
2635 if (BE (elem == NULL, 0))
2636 goto parse_dup_op_espace;
20dc2f79
UD
2637 old_tree = tree;
2638 }
2639 else
2640 old_tree = NULL;
2641
02f3550c 2642 if (elem->token.type == SUBEXP)
eb04c213
AZ
2643 {
2644 uintptr_t subidx = elem->token.opr.idx;
2645 postorder (elem, mark_opt_subexp, (void *) subidx);
2646 }
02f3550c 2647
eb04c213
AZ
2648 tree = create_tree (dfa, elem, NULL,
2649 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
20dc2f79
UD
2650 if (BE (tree == NULL, 0))
2651 goto parse_dup_op_espace;
2652
2653 /* This loop is actually executed only when end != -1,
2654 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2655 already created the start+1-th copy. */
eb04c213
AZ
2656 if (TYPE_SIGNED (Idx) || end != -1)
2657 for (i = start + 2; i <= end; ++i)
2658 {
2659 elem = duplicate_tree (elem, dfa);
2660 tree = create_tree (dfa, tree, elem, CONCAT);
2661 if (BE (elem == NULL || tree == NULL, 0))
2662 goto parse_dup_op_espace;
2663
2664 tree = create_tree (dfa, tree, NULL, OP_ALT);
2665 if (BE (tree == NULL, 0))
2666 goto parse_dup_op_espace;
2667 }
6b6557e8 2668
20dc2f79 2669 if (old_tree)
02f3550c 2670 tree = create_tree (dfa, old_tree, tree, CONCAT);
20dc2f79 2671
3b0bdc72
UD
2672 return tree;
2673
2674 parse_dup_op_espace:
3b0bdc72
UD
2675 *err = REG_ESPACE;
2676 return NULL;
3b0bdc72
UD
2677}
2678
2679/* Size of the names for collating symbol/equivalence_class/character_class.
2680 I'm not sure, but maybe enough. */
2681#define BRACKET_NAME_BUF_SIZE 32
2682
434d3784 2683#ifndef _LIBC
eb04c213
AZ
2684
2685# ifdef RE_ENABLE_I18N
2686/* Convert the byte B to the corresponding wide character. In a
2687 unibyte locale, treat B as itself if it is an encoding error.
2688 In a multibyte locale, return WEOF if B is an encoding error. */
2689static wint_t
2690parse_byte (unsigned char b, re_charset_t *mbcset)
2691{
2692 wint_t wc = __btowc (b);
2693 return wc == WEOF && !mbcset ? b : wc;
2694}
2695#endif
2696
434d3784
UD
2697 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2698 Build the range expression which starts from START_ELEM, and ends
2699 at END_ELEM. The result are written to MBCSET and SBCSET.
2700 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
d3821ab0 2701 mbcset->range_ends, is a pointer argument since we may
434d3784
UD
2702 update it. */
2703
2704static reg_errcode_t
c0a0f9a3 2705# ifdef RE_ENABLE_I18N
eb04c213
AZ
2706build_range_exp (const reg_syntax_t syntax,
2707 bitset_t sbcset,
2708 re_charset_t *mbcset,
2709 Idx *range_alloc,
2710 const bracket_elem_t *start_elem,
2711 const bracket_elem_t *end_elem)
c0a0f9a3 2712# else /* not RE_ENABLE_I18N */
eb04c213
AZ
2713build_range_exp (const reg_syntax_t syntax,
2714 bitset_t sbcset,
2715 const bracket_elem_t *start_elem,
2716 const bracket_elem_t *end_elem)
c0a0f9a3 2717# endif /* not RE_ENABLE_I18N */
434d3784
UD
2718{
2719 unsigned int start_ch, end_ch;
2720 /* Equivalence Classes and Character Classes can't be a range start/end. */
2721 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2722 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2723 0))
434d3784
UD
2724 return REG_ERANGE;
2725
2726 /* We can handle no multi character collating elements without libc
2727 support. */
62439eac 2728 if (BE ((start_elem->type == COLL_SYM
15a7d175
UD
2729 && strlen ((char *) start_elem->opr.name) > 1)
2730 || (end_elem->type == COLL_SYM
2731 && strlen ((char *) end_elem->opr.name) > 1), 0))
434d3784
UD
2732 return REG_ECOLLATE;
2733
2734# ifdef RE_ENABLE_I18N
2735 {
2d87db5b
UD
2736 wchar_t wc;
2737 wint_t start_wc;
2738 wint_t end_wc;
434d3784
UD
2739
2740 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2741 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2742 : 0));
434d3784 2743 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2744 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2745 : 0));
434d3784 2746 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
eb04c213 2747 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
434d3784 2748 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
eb04c213 2749 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
4bb333cd
UD
2750 if (start_wc == WEOF || end_wc == WEOF)
2751 return REG_ECOLLATE;
eb04c213 2752 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
434d3784
UD
2753 return REG_ERANGE;
2754
10677727
UD
2755 /* Got valid collation sequence values, add them as a new entry.
2756 However, for !_LIBC we have no collation elements: if the
2757 character set is single byte, the single byte character set
2758 that we build below suffices. parse_bracket_exp passes
2759 no MBCSET if dfa->mb_cur_max == 1. */
2760 if (mbcset)
434d3784 2761 {
21f5de55
PE
2762 /* Check the space of the arrays. */
2763 if (BE (*range_alloc == mbcset->nranges, 0))
2764 {
10677727
UD
2765 /* There is not enough space, need realloc. */
2766 wchar_t *new_array_start, *new_array_end;
eb04c213 2767 Idx new_nranges;
10677727
UD
2768
2769 /* +1 in case of mbcset->nranges is 0. */
2770 new_nranges = 2 * mbcset->nranges + 1;
2771 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2772 are NULL if *range_alloc == 0. */
2773 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
21f5de55 2774 new_nranges);
10677727 2775 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
21f5de55 2776 new_nranges);
10677727
UD
2777
2778 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
eb04c213
AZ
2779 {
2780 re_free (new_array_start);
2781 re_free (new_array_end);
2782 return REG_ESPACE;
2783 }
10677727
UD
2784
2785 mbcset->range_starts = new_array_start;
2786 mbcset->range_ends = new_array_end;
2787 *range_alloc = new_nranges;
21f5de55 2788 }
10677727 2789
21f5de55
PE
2790 mbcset->range_starts[mbcset->nranges] = start_wc;
2791 mbcset->range_ends[mbcset->nranges++] = end_wc;
434d3784
UD
2792 }
2793
434d3784 2794 /* Build the table for single byte characters. */
1e7947dc 2795 for (wc = 0; wc < SBC_MAX; ++wc)
434d3784 2796 {
eb04c213 2797 if (start_wc <= wc && wc <= end_wc)
15a7d175 2798 bitset_set (sbcset, wc);
434d3784
UD
2799 }
2800 }
2801# else /* not RE_ENABLE_I18N */
2802 {
2803 unsigned int ch;
2804 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2805 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2806 : 0));
434d3784 2807 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2808 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2809 : 0));
434d3784
UD
2810 if (start_ch > end_ch)
2811 return REG_ERANGE;
2812 /* Build the table for single byte characters. */
1e7947dc 2813 for (ch = 0; ch < SBC_MAX; ++ch)
434d3784 2814 if (start_ch <= ch && ch <= end_ch)
15a7d175 2815 bitset_set (sbcset, ch);
434d3784
UD
2816 }
2817# endif /* not RE_ENABLE_I18N */
2818 return REG_NOERROR;
2819}
2820#endif /* not _LIBC */
2821
2822#ifndef _LIBC
2823/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2824 Build the collating element which is represented by NAME.
2825 The result are written to MBCSET and SBCSET.
2826 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2827 pointer argument since we may update it. */
2828
2829static reg_errcode_t
c0a0f9a3 2830# ifdef RE_ENABLE_I18N
0fd8ae9c 2831build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
eb04c213 2832 Idx *coll_sym_alloc, const unsigned char *name)
c0a0f9a3 2833# else /* not RE_ENABLE_I18N */
0fd8ae9c 2834build_collating_symbol (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 2835# endif /* not RE_ENABLE_I18N */
434d3784 2836{
62439eac
UD
2837 size_t name_len = strlen ((const char *) name);
2838 if (BE (name_len != 1, 0))
434d3784
UD
2839 return REG_ECOLLATE;
2840 else
2841 {
62439eac 2842 bitset_set (sbcset, name[0]);
434d3784
UD
2843 return REG_NOERROR;
2844 }
2845}
2846#endif /* not _LIBC */
2847
3b0bdc72
UD
2848/* This function parse bracket expression like "[abc]", "[a-c]",
2849 "[[.a-a.]]" etc. */
2850
2851static bin_tree_t *
0fd8ae9c
UD
2852parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2853 reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72
UD
2854{
2855#ifdef _LIBC
62439eac
UD
2856 const unsigned char *collseqmb;
2857 const char *collseqwc;
3b0bdc72
UD
2858 uint32_t nrules;
2859 int32_t table_size;
2860 const int32_t *symb_table;
2861 const unsigned char *extra;
2862
d3821ab0
RM
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864 Seek the collating symbol entry corresponding to NAME.
7e2f0d2d
AS
2865 Return the index of the symbol in the SYMB_TABLE,
2866 or -1 if not found. */
3b0bdc72 2867
7090d3ca 2868 auto inline int32_t
d3821ab0 2869 __attribute__ ((always_inline))
7e2f0d2d 2870 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
3b0bdc72 2871 {
7e2f0d2d 2872 int32_t elem;
a334319f 2873
7e2f0d2d
AS
2874 for (elem = 0; elem < table_size; elem++)
2875 if (symb_table[2 * elem] != 0)
2876 {
2877 int32_t idx = symb_table[2 * elem + 1];
2878 /* Skip the name of collating element name. */
2879 idx += 1 + extra[idx];
2880 if (/* Compare the length of the name. */
2881 name_len == extra[idx]
2882 /* Compare the name. */
2883 && memcmp (name, &extra[idx + 1], name_len) == 0)
2884 /* Yep, this is the entry. */
2885 return elem;
2886 }
2887 return -1;
3b0bdc72
UD
2888 }
2889
a532a41d 2890 /* Local function for parse_bracket_exp used in _LIBC environment.
3b0bdc72
UD
2891 Look up the collation sequence value of BR_ELEM.
2892 Return the value if succeeded, UINT_MAX otherwise. */
2893
7090d3ca 2894 auto inline unsigned int
d3821ab0 2895 __attribute__ ((always_inline))
7e2f0d2d 2896 lookup_collation_sequence_value (bracket_elem_t *br_elem)
3b0bdc72
UD
2897 {
2898 if (br_elem->type == SB_CHAR)
15a7d175
UD
2899 {
2900 /*
2901 if (MB_CUR_MAX == 1)
2902 */
2903 if (nrules == 0)
2904 return collseqmb[br_elem->opr.ch];
2905 else
2906 {
2907 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2908 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2909 }
2910 }
3b0bdc72 2911 else if (br_elem->type == MB_CHAR)
15a7d175 2912 {
a532a41d
UD
2913 if (nrules != 0)
2914 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2915 }
3b0bdc72 2916 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2917 {
2918 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2919 if (nrules != 0)
2920 {
2921 int32_t elem, idx;
2922 elem = seek_collating_symbol_entry (br_elem->opr.name,
2923 sym_name_len);
7e2f0d2d 2924 if (elem != -1)
15a7d175
UD
2925 {
2926 /* We found the entry. */
2927 idx = symb_table[2 * elem + 1];
2928 /* Skip the name of collating element name. */
2929 idx += 1 + extra[idx];
2930 /* Skip the byte sequence of the collating element. */
2931 idx += 1 + extra[idx];
2932 /* Adjust for the alignment. */
2933 idx = (idx + 3) & ~3;
2934 /* Skip the multibyte collation sequence value. */
2935 idx += sizeof (unsigned int);
2936 /* Skip the wide char sequence of the collating element. */
2937 idx += sizeof (unsigned int) *
2938 (1 + *(unsigned int *) (extra + idx));
2939 /* Return the collation sequence value. */
2940 return *(unsigned int *) (extra + idx);
2941 }
7e2f0d2d 2942 else if (sym_name_len == 1)
15a7d175
UD
2943 {
2944 /* No valid character. Match it as a single byte
2945 character. */
2946 return collseqmb[br_elem->opr.name[0]];
2947 }
2948 }
2949 else if (sym_name_len == 1)
2950 return collseqmb[br_elem->opr.name[0]];
2951 }
3b0bdc72
UD
2952 return UINT_MAX;
2953 }
2954
d3821ab0 2955 /* Local function for parse_bracket_exp used in _LIBC environment.
3b0bdc72
UD
2956 Build the range expression which starts from START_ELEM, and ends
2957 at END_ELEM. The result are written to MBCSET and SBCSET.
2958 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
d3821ab0 2959 mbcset->range_ends, is a pointer argument since we may
3b0bdc72
UD
2960 update it. */
2961
7090d3ca 2962 auto inline reg_errcode_t
d3821ab0 2963 __attribute__ ((always_inline))
7e2f0d2d
AS
2964 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2965 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
3b0bdc72
UD
2966 {
2967 unsigned int ch;
2968 uint32_t start_collseq;
2969 uint32_t end_collseq;
2970
434d3784 2971 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2972 start/end. */
bc15410e 2973 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2974 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2975 0))
2976 return REG_ERANGE;
3b0bdc72 2977
eb04c213 2978 /* FIXME: Implement rational ranges here, too. */
3b0bdc72
UD
2979 start_collseq = lookup_collation_sequence_value (start_elem);
2980 end_collseq = lookup_collation_sequence_value (end_elem);
2981 /* Check start/end collation sequence values. */
bc15410e 2982 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
15a7d175 2983 return REG_ECOLLATE;
bc15410e 2984 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
15a7d175 2985 return REG_ERANGE;
3b0bdc72 2986
10677727
UD
2987 /* Got valid collation sequence values, add them as a new entry.
2988 However, if we have no collation elements, and the character set
2989 is single byte, the single byte character set that we
2990 build below suffices. */
2991 if (nrules > 0 || dfa->mb_cur_max > 1)
2992 {
21f5de55
PE
2993 /* Check the space of the arrays. */
2994 if (BE (*range_alloc == mbcset->nranges, 0))
10677727
UD
2995 {
2996 /* There is not enough space, need realloc. */
2997 uint32_t *new_array_start;
2998 uint32_t *new_array_end;
eb04c213 2999 Idx new_nranges;
10677727
UD
3000
3001 /* +1 in case of mbcset->nranges is 0. */
3002 new_nranges = 2 * mbcset->nranges + 1;
3003 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3004 new_nranges);
3005 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
21f5de55 3006 new_nranges);
10677727
UD
3007
3008 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
21f5de55 3009 return REG_ESPACE;
10677727
UD
3010
3011 mbcset->range_starts = new_array_start;
3012 mbcset->range_ends = new_array_end;
3013 *range_alloc = new_nranges;
3014 }
3015
21f5de55
PE
3016 mbcset->range_starts[mbcset->nranges] = start_collseq;
3017 mbcset->range_ends[mbcset->nranges++] = end_collseq;
10677727 3018 }
3b0bdc72
UD
3019
3020 /* Build the table for single byte characters. */
1e7947dc 3021 for (ch = 0; ch < SBC_MAX; ch++)
15a7d175
UD
3022 {
3023 uint32_t ch_collseq;
3024 /*
3025 if (MB_CUR_MAX == 1)
3026 */
3027 if (nrules == 0)
3028 ch_collseq = collseqmb[ch];
3029 else
25337753 3030 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
3031 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3032 bitset_set (sbcset, ch);
3033 }
3b0bdc72
UD
3034 return REG_NOERROR;
3035 }
3b0bdc72 3036
d3821ab0 3037 /* Local function for parse_bracket_exp used in _LIBC environment.
3b0bdc72
UD
3038 Build the collating element which is represented by NAME.
3039 The result are written to MBCSET and SBCSET.
3040 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
d3821ab0 3041 pointer argument since we may update it. */
3b0bdc72 3042
7090d3ca 3043 auto inline reg_errcode_t
d3821ab0 3044 __attribute__ ((always_inline))
7e2f0d2d 3045 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
eb04c213 3046 Idx *coll_sym_alloc, const unsigned char *name)
3b0bdc72 3047 {
3b0bdc72 3048 int32_t elem, idx;
62439eac 3049 size_t name_len = strlen ((const char *) name);
3b0bdc72 3050 if (nrules != 0)
15a7d175
UD
3051 {
3052 elem = seek_collating_symbol_entry (name, name_len);
7e2f0d2d 3053 if (elem != -1)
15a7d175
UD
3054 {
3055 /* We found the entry. */
3056 idx = symb_table[2 * elem + 1];
3057 /* Skip the name of collating element name. */
3058 idx += 1 + extra[idx];
3059 }
7e2f0d2d 3060 else if (name_len == 1)
15a7d175
UD
3061 {
3062 /* No valid character, treat it as a normal
3063 character. */
3064 bitset_set (sbcset, name[0]);
3065 return REG_NOERROR;
3066 }
3067 else
3068 return REG_ECOLLATE;
3b0bdc72 3069
15a7d175
UD
3070 /* Got valid collation sequence, add it as a new entry. */
3071 /* Check the space of the arrays. */
951d6408 3072 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
15a7d175
UD
3073 {
3074 /* Not enough, realloc it. */
3075 /* +1 in case of mbcset->ncoll_syms is 0. */
eb04c213 3076 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
15a7d175
UD
3077 /* Use realloc since mbcset->coll_syms is NULL
3078 if *alloc == 0. */
951d6408
UD
3079 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3080 new_coll_sym_alloc);
3081 if (BE (new_coll_syms == NULL, 0))
15a7d175 3082 return REG_ESPACE;
951d6408
UD
3083 mbcset->coll_syms = new_coll_syms;
3084 *coll_sym_alloc = new_coll_sym_alloc;
15a7d175
UD
3085 }
3086 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
15a7d175
UD
3087 return REG_NOERROR;
3088 }
3b0bdc72 3089 else
15a7d175
UD
3090 {
3091 if (BE (name_len != 1, 0))
3092 return REG_ECOLLATE;
3093 else
3094 {
3095 bitset_set (sbcset, name[0]);
3096 return REG_NOERROR;
3097 }
3098 }
3b0bdc72 3099 }
434d3784
UD
3100#endif
3101
3b0bdc72
UD
3102 re_token_t br_token;
3103 re_bitset_ptr_t sbcset;
c0a0f9a3 3104#ifdef RE_ENABLE_I18N
3b0bdc72 3105 re_charset_t *mbcset;
eb04c213
AZ
3106 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3107 Idx equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3 3108#endif /* not RE_ENABLE_I18N */
eb04c213 3109 bool non_match = false;
c0a0f9a3 3110 bin_tree_t *work_tree;
ee70274a 3111 int token_len;
eb04c213 3112 bool first_round = true;
3b0bdc72 3113#ifdef _LIBC
62439eac
UD
3114 collseqmb = (const unsigned char *)
3115 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
3116 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3117 if (nrules)
3118 {
3119 /*
3120 if (MB_CUR_MAX > 1)
3121 */
7d4722e3 3122 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
3123 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3124 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3125 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 3126 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3127 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
3128 }
3129#endif
2c05d33f 3130 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
c0a0f9a3 3131#ifdef RE_ENABLE_I18N
3b0bdc72 3132 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3133#endif /* RE_ENABLE_I18N */
3134#ifdef RE_ENABLE_I18N
bc15410e 3135 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3136#else
3137 if (BE (sbcset == NULL, 0))
3138#endif /* RE_ENABLE_I18N */
3b0bdc72 3139 {
a129c80d
UD
3140 re_free (sbcset);
3141#ifdef RE_ENABLE_I18N
3142 re_free (mbcset);
3143#endif
3b0bdc72
UD
3144 *err = REG_ESPACE;
3145 return NULL;
3146 }
3147
3148 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3149 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 3150 {
3b0bdc72 3151 *err = REG_BADPAT;
434d3784 3152 goto parse_bracket_exp_free_return;
3b0bdc72
UD
3153 }
3154 if (token->type == OP_NON_MATCH_LIST)
3155 {
c0a0f9a3 3156#ifdef RE_ENABLE_I18N
3b0bdc72 3157 mbcset->non_match = 1;
c0a0f9a3 3158#endif /* not RE_ENABLE_I18N */
eb04c213 3159 non_match = true;
3b0bdc72 3160 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
784aacea 3161 bitset_set (sbcset, '\n');
3b0bdc72
UD
3162 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3163 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3164 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
3165 {
3166 *err = REG_BADPAT;
3167 goto parse_bracket_exp_free_return;
3168 }
3b0bdc72
UD
3169 }
3170
3171 /* We treat the first ']' as a normal character. */
3172 if (token->type == OP_CLOSE_BRACKET)
3173 token->type = CHARACTER;
3174
3175 while (1)
3176 {
3177 bracket_elem_t start_elem, end_elem;
62439eac
UD
3178 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3179 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72 3180 reg_errcode_t ret;
eb04c213
AZ
3181 int token_len2 = 0;
3182 bool is_range_exp = false;
3b0bdc72
UD
3183 re_token_t token2;
3184
3185 start_elem.opr.name = start_name_buf;
39a12f8d 3186 start_elem.type = COLL_SYM;
3b0bdc72 3187 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
78d8b07a 3188 syntax, first_round);
bc15410e 3189 if (BE (ret != REG_NOERROR, 0))
15a7d175
UD
3190 {
3191 *err = ret;
3192 goto parse_bracket_exp_free_return;
3193 }
eb04c213 3194 first_round = false;
3b0bdc72 3195
78d8b07a 3196 /* Get information about the next token. We need it in any case. */
3b0bdc72 3197 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a
UD
3198
3199 /* Do not check for ranges if we know they are not allowed. */
3200 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
15a7d175 3201 {
15a7d175
UD
3202 if (BE (token->type == END_OF_RE, 0))
3203 {
78d8b07a 3204 *err = REG_EBRACK;
15a7d175
UD
3205 goto parse_bracket_exp_free_return;
3206 }
78d8b07a 3207 if (token->type == OP_CHARSET_RANGE)
15a7d175 3208 {
78d8b07a
UD
3209 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3210 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3211 if (BE (token2.type == END_OF_RE, 0))
3212 {
3213 *err = REG_EBRACK;
3214 goto parse_bracket_exp_free_return;
3215 }
3216 if (token2.type == OP_CLOSE_BRACKET)
3217 {
3218 /* We treat the last '-' as a normal character. */
3219 re_string_skip_bytes (regexp, -token_len);
3220 token->type = CHARACTER;
3221 }
3222 else
eb04c213 3223 is_range_exp = true;
15a7d175 3224 }
15a7d175 3225 }
3b0bdc72 3226
eb04c213 3227 if (is_range_exp == true)
15a7d175
UD
3228 {
3229 end_elem.opr.name = end_name_buf;
39a12f8d 3230 end_elem.type = COLL_SYM;
15a7d175 3231 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
eb04c213 3232 dfa, syntax, true);
15a7d175
UD
3233 if (BE (ret != REG_NOERROR, 0))
3234 {
3235 *err = ret;
3236 goto parse_bracket_exp_free_return;
3237 }
3238
3239 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a 3240
10677727
UD
3241#ifdef _LIBC
3242 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3243 &start_elem, &end_elem);
3244#else
3245# ifdef RE_ENABLE_I18N
eb04c213 3246 *err = build_range_exp (syntax, sbcset,
10677727
UD
3247 dfa->mb_cur_max > 1 ? mbcset : NULL,
3248 &range_alloc, &start_elem, &end_elem);
3249# else
eb04c213 3250 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
10677727 3251# endif
c0a0f9a3 3252#endif /* RE_ENABLE_I18N */
15a7d175
UD
3253 if (BE (*err != REG_NOERROR, 0))
3254 goto parse_bracket_exp_free_return;
3255 }
3b0bdc72 3256 else
15a7d175
UD
3257 {
3258 switch (start_elem.type)
3259 {
3260 case SB_CHAR:
3261 bitset_set (sbcset, start_elem.opr.ch);
3262 break;
c0a0f9a3 3263#ifdef RE_ENABLE_I18N
15a7d175
UD
3264 case MB_CHAR:
3265 /* Check whether the array has enough space. */
951d6408 3266 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
15a7d175 3267 {
951d6408 3268 wchar_t *new_mbchars;
15a7d175
UD
3269 /* Not enough, realloc it. */
3270 /* +1 in case of mbcset->nmbchars is 0. */
3271 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3272 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3273 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3274 mbchar_alloc);
3275 if (BE (new_mbchars == NULL, 0))
15a7d175 3276 goto parse_bracket_exp_espace;
951d6408 3277 mbcset->mbchars = new_mbchars;
15a7d175
UD
3278 }
3279 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3280 break;
c0a0f9a3 3281#endif /* RE_ENABLE_I18N */
15a7d175
UD
3282 case EQUIV_CLASS:
3283 *err = build_equiv_class (sbcset,
c0a0f9a3 3284#ifdef RE_ENABLE_I18N
15a7d175 3285 mbcset, &equiv_class_alloc,
c0a0f9a3 3286#endif /* RE_ENABLE_I18N */
3b0bdc72 3287 start_elem.opr.name);
15a7d175
UD
3288 if (BE (*err != REG_NOERROR, 0))
3289 goto parse_bracket_exp_free_return;
3290 break;
3291 case COLL_SYM:
3292 *err = build_collating_symbol (sbcset,
c0a0f9a3 3293#ifdef RE_ENABLE_I18N
15a7d175 3294 mbcset, &coll_sym_alloc,
c0a0f9a3 3295#endif /* RE_ENABLE_I18N */
3b0bdc72 3296 start_elem.opr.name);
15a7d175
UD
3297 if (BE (*err != REG_NOERROR, 0))
3298 goto parse_bracket_exp_free_return;
3299 break;
3300 case CHAR_CLASS:
66b110e8 3301 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3302#ifdef RE_ENABLE_I18N
7b7b9e70 3303 mbcset, &char_class_alloc,
c0a0f9a3 3304#endif /* RE_ENABLE_I18N */
eb04c213
AZ
3305 (const char *) start_elem.opr.name,
3306 syntax);
7b7b9e70
UD
3307 if (BE (*err != REG_NOERROR, 0))
3308 goto parse_bracket_exp_free_return;
15a7d175
UD
3309 break;
3310 default:
3311 assert (0);
3312 break;
3313 }
3314 }
78d8b07a
UD
3315 if (BE (token->type == END_OF_RE, 0))
3316 {
3317 *err = REG_EBRACK;
3318 goto parse_bracket_exp_free_return;
3319 }
3b0bdc72 3320 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3321 break;
3b0bdc72
UD
3322 }
3323
3324 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3325
3326 /* If it is non-matching list. */
c0a0f9a3 3327 if (non_match)
3b0bdc72 3328 bitset_not (sbcset);
10677727 3329
65e6becf
UD
3330#ifdef RE_ENABLE_I18N
3331 /* Ensure only single byte characters are set. */
3332 if (dfa->mb_cur_max > 1)
3333 bitset_mask (sbcset, dfa->sb_char);
3b0bdc72
UD
3334
3335 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3336 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3337 || mbcset->non_match)))
3b0bdc72 3338 {
3b0bdc72 3339 bin_tree_t *mbc_tree;
ad7f28c2 3340 int sbc_idx;
3b0bdc72 3341 /* Build a tree for complex bracket. */
ad7f28c2 3342 dfa->has_mb_node = 1;
02f3550c
UD
3343 br_token.type = COMPLEX_BRACKET;
3344 br_token.opr.mbcset = mbcset;
3345 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3346 if (BE (mbc_tree == NULL, 0))
3347 goto parse_bracket_exp_espace;
2c05d33f 3348 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
ad7f28c2
UD
3349 if (sbcset[sbc_idx])
3350 break;
3351 /* If there are no bits set in sbcset, there is no point
3352 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
2c05d33f 3353 if (sbc_idx < BITSET_WORDS)
02f3550c 3354 {
21f5de55
PE
3355 /* Build a tree for simple bracket. */
3356 br_token.type = SIMPLE_BRACKET;
3357 br_token.opr.sbcset = sbcset;
3358 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359 if (BE (work_tree == NULL, 0))
3360 goto parse_bracket_exp_espace;
02f3550c 3361
21f5de55
PE
3362 /* Then join them by ALT node. */
3363 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3364 if (BE (work_tree == NULL, 0))
3365 goto parse_bracket_exp_espace;
02f3550c
UD
3366 }
3367 else
ad7f28c2
UD
3368 {
3369 re_free (sbcset);
02f3550c 3370 work_tree = mbc_tree;
ad7f28c2 3371 }
3b0bdc72
UD
3372 }
3373 else
963d8d78 3374#endif /* not RE_ENABLE_I18N */
3b0bdc72 3375 {
963d8d78
UD
3376#ifdef RE_ENABLE_I18N
3377 free_charset (mbcset);
3378#endif
02f3550c
UD
3379 /* Build a tree for simple bracket. */
3380 br_token.type = SIMPLE_BRACKET;
3381 br_token.opr.sbcset = sbcset;
3382 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3383 if (BE (work_tree == NULL, 0))
21f5de55 3384 goto parse_bracket_exp_espace;
3b0bdc72 3385 }
02f3550c 3386 return work_tree;
3b0bdc72
UD
3387
3388 parse_bracket_exp_espace:
3b0bdc72 3389 *err = REG_ESPACE;
434d3784
UD
3390 parse_bracket_exp_free_return:
3391 re_free (sbcset);
c0a0f9a3 3392#ifdef RE_ENABLE_I18N
434d3784 3393 free_charset (mbcset);
c0a0f9a3 3394#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3395 return NULL;
3396}
3397
434d3784
UD
3398/* Parse an element in the bracket expression. */
3399
3b0bdc72 3400static reg_errcode_t
0fd8ae9c
UD
3401parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3402 re_token_t *token, int token_len, re_dfa_t *dfa,
eb04c213 3403 reg_syntax_t syntax, bool accept_hyphen)
3b0bdc72
UD
3404{
3405#ifdef RE_ENABLE_I18N
3406 int cur_char_size;
3407 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3408 if (cur_char_size > 1)
3409 {
3410 elem->type = MB_CHAR;
3411 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3412 re_string_skip_bytes (regexp, cur_char_size);
3413 return REG_NOERROR;
3414 }
3415#endif /* RE_ENABLE_I18N */
3416 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3417 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3418 || token->type == OP_OPEN_EQUIV_CLASS)
3419 return parse_bracket_symbol (elem, regexp, token);
78d8b07a
UD
3420 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3421 {
3422 /* A '-' must only appear as anything but a range indicator before
3423 the closing bracket. Everything else is an error. */
3424 re_token_t token2;
3425 (void) peek_token_bracket (&token2, regexp, syntax);
3426 if (token2.type != OP_CLOSE_BRACKET)
3427 /* The actual error value is not standardized since this whole
3428 case is undefined. But ERANGE makes good sense. */
3429 return REG_ERANGE;
3430 }
3b0bdc72
UD
3431 elem->type = SB_CHAR;
3432 elem->opr.ch = token->opr.c;
3433 return REG_NOERROR;
3434}
3435
434d3784
UD
3436/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3437 such as [:<character_class>:], [.<collating_element>.], and
3438 [=<equivalent_class>=]. */
3439
3b0bdc72 3440static reg_errcode_t
0fd8ae9c
UD
3441parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3442 re_token_t *token)
3b0bdc72
UD
3443{
3444 unsigned char ch, delim = token->opr.c;
3445 int i = 0;
294b6bcc
UD
3446 if (re_string_eoi(regexp))
3447 return REG_EBRACK;
602c2f9d 3448 for (;; ++i)
3b0bdc72 3449 {
294b6bcc 3450 if (i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3451 return REG_EBRACK;
3b0bdc72 3452 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3453 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3454 else
15a7d175 3455 ch = re_string_fetch_byte (regexp);
294b6bcc
UD
3456 if (re_string_eoi(regexp))
3457 return REG_EBRACK;
3b0bdc72 3458 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3459 break;
3b0bdc72
UD
3460 elem->opr.name[i] = ch;
3461 }
3462 re_string_skip_bytes (regexp, 1);
3463 elem->opr.name[i] = '\0';
3464 switch (token->type)
3465 {
3466 case OP_OPEN_COLL_ELEM:
3467 elem->type = COLL_SYM;
3468 break;
3469 case OP_OPEN_EQUIV_CLASS:
3470 elem->type = EQUIV_CLASS;
3471 break;
3472 case OP_OPEN_CHAR_CLASS:
3473 elem->type = CHAR_CLASS;
3474 break;
3475 default:
3476 break;
3477 }
3478 return REG_NOERROR;
3479}
3480
3481 /* Helper function for parse_bracket_exp.
3482 Build the equivalence class which is represented by NAME.
3483 The result are written to MBCSET and SBCSET.
3484 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
d3821ab0 3485 is a pointer argument since we may update it. */
3b0bdc72
UD
3486
3487static reg_errcode_t
c0a0f9a3 3488#ifdef RE_ENABLE_I18N
0fd8ae9c 3489build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
eb04c213 3490 Idx *equiv_class_alloc, const unsigned char *name)
c0a0f9a3 3491#else /* not RE_ENABLE_I18N */
0fd8ae9c 3492build_equiv_class (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 3493#endif /* not RE_ENABLE_I18N */
3b0bdc72 3494{
0fd8ae9c 3495#ifdef _LIBC
3b0bdc72
UD
3496 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3497 if (nrules != 0)
3498 {
3499 const int32_t *table, *indirect;
3500 const unsigned char *weights, *extra, *cp;
62439eac 3501 unsigned char char_buf[2];
3b0bdc72
UD
3502 int32_t idx1, idx2;
3503 unsigned int ch;
3504 size_t len;
3b0bdc72 3505 /* Calculate the index for equivalence class. */
62439eac 3506 cp = name;
3b0bdc72
UD
3507 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 _NL_COLLATE_WEIGHTMB);
3510 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3511 _NL_COLLATE_EXTRAMB);
3b0bdc72 3512 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3513 _NL_COLLATE_INDIRECTMB);
8c0ab919 3514 idx1 = findidx (table, indirect, extra, &cp, -1);
f3a6cc0a 3515 if (BE (idx1 == 0 || *cp != '\0', 0))
15a7d175
UD
3516 /* This isn't a valid character. */
3517 return REG_ECOLLATE;
3b0bdc72 3518
5069ff32 3519 /* Build single byte matching table for this equivalence class. */
b7d1c5fa 3520 len = weights[idx1 & 0xffffff];
3b0bdc72 3521 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3522 {
3523 char_buf[0] = ch;
3524 cp = char_buf;
8c0ab919 3525 idx2 = findidx (table, indirect, extra, &cp, 1);
3b0bdc72 3526/*
15a7d175 3527 idx2 = table[ch];
3b0bdc72 3528*/
15a7d175
UD
3529 if (idx2 == 0)
3530 /* This isn't a valid character. */
3531 continue;
b7d1c5fa
UD
3532 /* Compare only if the length matches and the collation rule
3533 index is the same. */
786658a0
FW
3534 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3535 && memcmp (weights + (idx1 & 0xffffff) + 1,
3536 weights + (idx2 & 0xffffff) + 1, len) == 0)
3537 bitset_set (sbcset, ch);
15a7d175 3538 }
a9388965 3539 /* Check whether the array has enough space. */
951d6408 3540 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
15a7d175
UD
3541 {
3542 /* Not enough, realloc it. */
3543 /* +1 in case of mbcset->nequiv_classes is 0. */
eb04c213 3544 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
15a7d175 3545 /* Use realloc since the array is NULL if *alloc == 0. */
951d6408
UD
3546 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3547 int32_t,
3548 new_equiv_class_alloc);
3549 if (BE (new_equiv_classes == NULL, 0))
15a7d175 3550 return REG_ESPACE;
951d6408
UD
3551 mbcset->equiv_classes = new_equiv_classes;
3552 *equiv_class_alloc = new_equiv_class_alloc;
15a7d175 3553 }
3b0bdc72
UD
3554 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3555 }
3556 else
10677727 3557#endif /* _LIBC */
3b0bdc72 3558 {
62439eac 3559 if (BE (strlen ((const char *) name) != 1, 0))
15a7d175 3560 return REG_ECOLLATE;
62439eac 3561 bitset_set (sbcset, *name);
3b0bdc72
UD
3562 }
3563 return REG_NOERROR;
3564}
3565
3566 /* Helper function for parse_bracket_exp.
3567 Build the character class which is represented by NAME.
3568 The result are written to MBCSET and SBCSET.
3569 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
d3821ab0 3570 is a pointer argument since we may update it. */
3b0bdc72
UD
3571
3572static reg_errcode_t
c0a0f9a3 3573#ifdef RE_ENABLE_I18N
0fd8ae9c 3574build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
eb04c213
AZ
3575 re_charset_t *mbcset, Idx *char_class_alloc,
3576 const char *class_name, reg_syntax_t syntax)
c0a0f9a3 3577#else /* not RE_ENABLE_I18N */
0fd8ae9c 3578build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
eb04c213 3579 const char *class_name, reg_syntax_t syntax)
c0a0f9a3 3580#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3581{
3582 int i;
eb04c213 3583 const char *name = class_name;
c0a0f9a3
UD
3584
3585 /* In case of REG_ICASE "upper" and "lower" match the both of
3586 upper and lower cases. */
3587 if ((syntax & RE_ICASE)
3588 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3589 name = "alpha";
3590
3591#ifdef RE_ENABLE_I18N
3b0bdc72 3592 /* Check the space of the arrays. */
951d6408 3593 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
a9388965
UD
3594 {
3595 /* Not enough, realloc it. */
3596 /* +1 in case of mbcset->nchar_classes is 0. */
eb04c213 3597 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
a9388965 3598 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3599 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3600 new_char_class_alloc);
3601 if (BE (new_char_classes == NULL, 0))
15a7d175 3602 return REG_ESPACE;
951d6408
UD
3603 mbcset->char_classes = new_char_classes;
3604 *char_class_alloc = new_char_class_alloc;
a9388965 3605 }
3b0bdc72 3606 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3607#endif /* RE_ENABLE_I18N */
3b0bdc72 3608
66b110e8 3609#define BUILD_CHARCLASS_LOOP(ctype_func) \
513bbb25
UD
3610 do { \
3611 if (BE (trans != NULL, 0)) \
0ecb606c 3612 { \
513bbb25 3613 for (i = 0; i < SBC_MAX; ++i) \
eb04c213 3614 if (ctype_func (i)) \
513bbb25
UD
3615 bitset_set (sbcset, trans[i]); \
3616 } \
3617 else \
3618 { \
3619 for (i = 0; i < SBC_MAX; ++i) \
eb04c213 3620 if (ctype_func (i)) \
513bbb25
UD
3621 bitset_set (sbcset, i); \
3622 } \
3623 } while (0)
3b0bdc72
UD
3624
3625 if (strcmp (name, "alnum") == 0)
513bbb25 3626 BUILD_CHARCLASS_LOOP (isalnum);
3b0bdc72 3627 else if (strcmp (name, "cntrl") == 0)
513bbb25 3628 BUILD_CHARCLASS_LOOP (iscntrl);
3b0bdc72 3629 else if (strcmp (name, "lower") == 0)
513bbb25 3630 BUILD_CHARCLASS_LOOP (islower);
3b0bdc72 3631 else if (strcmp (name, "space") == 0)
513bbb25 3632 BUILD_CHARCLASS_LOOP (isspace);
3b0bdc72 3633 else if (strcmp (name, "alpha") == 0)
513bbb25 3634 BUILD_CHARCLASS_LOOP (isalpha);
3b0bdc72 3635 else if (strcmp (name, "digit") == 0)
513bbb25 3636 BUILD_CHARCLASS_LOOP (isdigit);
3b0bdc72 3637 else if (strcmp (name, "print") == 0)
513bbb25 3638 BUILD_CHARCLASS_LOOP (isprint);
3b0bdc72 3639 else if (strcmp (name, "upper") == 0)
513bbb25 3640 BUILD_CHARCLASS_LOOP (isupper);
3b0bdc72 3641 else if (strcmp (name, "blank") == 0)
513bbb25 3642 BUILD_CHARCLASS_LOOP (isblank);
3b0bdc72 3643 else if (strcmp (name, "graph") == 0)
513bbb25 3644 BUILD_CHARCLASS_LOOP (isgraph);
3b0bdc72 3645 else if (strcmp (name, "punct") == 0)
513bbb25 3646 BUILD_CHARCLASS_LOOP (ispunct);
3b0bdc72 3647 else if (strcmp (name, "xdigit") == 0)
513bbb25 3648 BUILD_CHARCLASS_LOOP (isxdigit);
3b0bdc72
UD
3649 else
3650 return REG_ECTYPE;
3651
3652 return REG_NOERROR;
3653}
3654
3655static bin_tree_t *
0fd8ae9c 3656build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
eb04c213
AZ
3657 const char *class_name,
3658 const char *extra, bool non_match,
0fd8ae9c 3659 reg_errcode_t *err)
3b0bdc72
UD
3660{
3661 re_bitset_ptr_t sbcset;
c0a0f9a3 3662#ifdef RE_ENABLE_I18N
3b0bdc72 3663 re_charset_t *mbcset;
eb04c213 3664 Idx alloc = 0;
c0a0f9a3 3665#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3666 reg_errcode_t ret;
3667 re_token_t br_token;
3668 bin_tree_t *tree;
3b0bdc72 3669
2c05d33f 3670 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
c0a0f9a3 3671 if (BE (sbcset == NULL, 0))
3b0bdc72
UD
3672 {
3673 *err = REG_ESPACE;
3674 return NULL;
3675 }
c0a0f9a3 3676#ifdef RE_ENABLE_I18N
eb04c213
AZ
3677 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3678 if (BE (mbcset == NULL, 0))
3679 {
3680 re_free (sbcset);
3681 *err = REG_ESPACE;
3682 return NULL;
3b0bdc72 3683 }
eb04c213
AZ
3684 mbcset->non_match = non_match;
3685#endif /* RE_ENABLE_I18N */
3b0bdc72 3686
602c2f9d 3687 /* We don't care the syntax in this case. */
66b110e8 3688 ret = build_charclass (trans, sbcset,
c0a0f9a3 3689#ifdef RE_ENABLE_I18N
15a7d175 3690 mbcset, &alloc,
c0a0f9a3 3691#endif /* RE_ENABLE_I18N */
e2b6bfa3 3692 class_name, 0);
c0a0f9a3 3693
bc15410e 3694 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3695 {
3696 re_free (sbcset);
c0a0f9a3 3697#ifdef RE_ENABLE_I18N
3b0bdc72 3698 free_charset (mbcset);
c0a0f9a3 3699#endif /* RE_ENABLE_I18N */
7b7b9e70 3700 *err = ret;
3b0bdc72
UD
3701 return NULL;
3702 }
434d3784 3703 /* \w match '_' also. */
e2b6bfa3
UD
3704 for (; *extra; extra++)
3705 bitset_set (sbcset, *extra);
3b0bdc72
UD
3706
3707 /* If it is non-matching list. */
c0a0f9a3 3708 if (non_match)
3b0bdc72
UD
3709 bitset_not (sbcset);
3710
65e6becf
UD
3711#ifdef RE_ENABLE_I18N
3712 /* Ensure only single byte characters are set. */
3713 if (dfa->mb_cur_max > 1)
3714 bitset_mask (sbcset, dfa->sb_char);
3715#endif
3716
3b0bdc72 3717 /* Build a tree for simple bracket. */
eb04c213
AZ
3718#if defined GCC_LINT || defined lint
3719 memset (&br_token, 0, sizeof br_token);
3720#endif
3b0bdc72
UD
3721 br_token.type = SIMPLE_BRACKET;
3722 br_token.opr.sbcset = sbcset;
02f3550c 3723 tree = create_token_tree (dfa, NULL, NULL, &br_token);
ee70274a 3724 if (BE (tree == NULL, 0))
3b0bdc72
UD
3725 goto build_word_op_espace;
3726
c0a0f9a3 3727#ifdef RE_ENABLE_I18N
3c0fb574 3728 if (dfa->mb_cur_max > 1)
3b0bdc72 3729 {
3b0bdc72
UD
3730 bin_tree_t *mbc_tree;
3731 /* Build a tree for complex bracket. */
3732 br_token.type = COMPLEX_BRACKET;
3733 br_token.opr.mbcset = mbcset;
3734 dfa->has_mb_node = 1;
02f3550c 3735 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
ee70274a 3736 if (BE (mbc_tree == NULL, 0))
15a7d175 3737 goto build_word_op_espace;
3b0bdc72 3738 /* Then join them by ALT node. */
02f3550c 3739 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
ee70274a 3740 if (BE (mbc_tree != NULL, 1))
15a7d175 3741 return tree;
3b0bdc72
UD
3742 }
3743 else
3744 {
3745 free_charset (mbcset);
3746 return tree;
3747 }
c0a0f9a3
UD
3748#else /* not RE_ENABLE_I18N */
3749 return tree;
3750#endif /* not RE_ENABLE_I18N */
3751
3b0bdc72
UD
3752 build_word_op_espace:
3753 re_free (sbcset);
c0a0f9a3 3754#ifdef RE_ENABLE_I18N
3b0bdc72 3755 free_charset (mbcset);
c0a0f9a3 3756#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3757 *err = REG_ESPACE;
3758 return NULL;
3759}
3760
3761/* This is intended for the expressions like "a{1,3}".
eb04c213
AZ
3762 Fetch a number from 'input', and return the number.
3763 Return -1 if the number field is empty like "{,1}".
3764 Return RE_DUP_MAX + 1 if the number field is too large.
3765 Return -2 if an error occurred. */
3b0bdc72 3766
eb04c213 3767static Idx
0fd8ae9c 3768fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3b0bdc72 3769{
eb04c213 3770 Idx num = -1;
3b0bdc72
UD
3771 unsigned char c;
3772 while (1)
3773 {
f0d77aa8 3774 fetch_token (token, input, syntax);
3b0bdc72 3775 c = token->opr.c;
602c2f9d 3776 if (BE (token->type == END_OF_RE, 0))
15a7d175 3777 return -2;
3b0bdc72 3778 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3779 break;
602c2f9d 3780 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
eb04c213
AZ
3781 ? -2
3782 : num == -1
3783 ? c - '0'
3784 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3b0bdc72 3785 }
3b0bdc72
UD
3786 return num;
3787}
3788\f
c0a0f9a3 3789#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3790static void
3791free_charset (re_charset_t *cset)
3792{
3793 re_free (cset->mbchars);
c0a0f9a3 3794# ifdef _LIBC
3b0bdc72
UD
3795 re_free (cset->coll_syms);
3796 re_free (cset->equiv_classes);
3797 re_free (cset->range_starts);
3798 re_free (cset->range_ends);
c0a0f9a3 3799# endif
3b0bdc72
UD
3800 re_free (cset->char_classes);
3801 re_free (cset);
3802}
c0a0f9a3 3803#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3804\f
3805/* Functions for binary tree operation. */
3806
ee70274a 3807/* Create a tree node. */
3b0bdc72
UD
3808
3809static bin_tree_t *
0fd8ae9c
UD
3810create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3811 re_token_type_t type)
02f3550c
UD
3812{
3813 re_token_t t;
eb04c213
AZ
3814#if defined GCC_LINT || defined lint
3815 memset (&t, 0, sizeof t);
3816#endif
02f3550c
UD
3817 t.type = type;
3818 return create_token_tree (dfa, left, right, &t);
3819}
3820
3821static bin_tree_t *
0fd8ae9c
UD
3822create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3823 const re_token_t *token)
3b0bdc72
UD
3824{
3825 bin_tree_t *tree;
ee70274a 3826 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3b0bdc72 3827 {
ee70274a
UD
3828 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829
3830 if (storage == NULL)
3831 return NULL;
3832 storage->next = dfa->str_tree_storage;
3833 dfa->str_tree_storage = storage;
3834 dfa->str_tree_storage_idx = 0;
3b0bdc72 3835 }
ee70274a
UD
3836 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837
3b0bdc72
UD
3838 tree->parent = NULL;
3839 tree->left = left;
3840 tree->right = right;
02f3550c
UD
3841 tree->token = *token;
3842 tree->token.duplicated = 0;
3843 tree->token.opt_subexp = 0;
3844 tree->first = NULL;
3845 tree->next = NULL;
3846 tree->node_idx = -1;
3b0bdc72
UD
3847
3848 if (left != NULL)
3849 left->parent = tree;
3850 if (right != NULL)
3851 right->parent = tree;
3852 return tree;
3853}
3854
02f3550c
UD
3855/* Mark the tree SRC as an optional subexpression.
3856 To be called from preorder or postorder. */
3b0bdc72 3857
02f3550c 3858static reg_errcode_t
0fd8ae9c 3859mark_opt_subexp (void *extra, bin_tree_t *node)
f0d77aa8 3860{
eb04c213 3861 Idx idx = (uintptr_t) extra;
02f3550c
UD
3862 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863 node->token.opt_subexp = 1;
a334319f 3864
02f3550c 3865 return REG_NOERROR;
3b0bdc72
UD
3866}
3867
02f3550c 3868/* Free the allocated memory inside NODE. */
6b6557e8
UD
3869
3870static void
02f3550c 3871free_token (re_token_t *node)
6b6557e8 3872{
02f3550c
UD
3873#ifdef RE_ENABLE_I18N
3874 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3875 free_charset (node->opr.mbcset);
3876 else
3877#endif /* RE_ENABLE_I18N */
3878 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3879 re_free (node->opr.sbcset);
6b6557e8
UD
3880}
3881
02f3550c
UD
3882/* Worker function for tree walking. Free the allocated memory inside NODE
3883 and its children. */
6b6557e8 3884
02f3550c
UD
3885static reg_errcode_t
3886free_tree (void *extra, bin_tree_t *node)
6b6557e8 3887{
02f3550c
UD
3888 free_token (&node->token);
3889 return REG_NOERROR;
6b6557e8
UD
3890}
3891
ee70274a 3892
02f3550c
UD
3893/* Duplicate the node SRC, and return new node. This is a preorder
3894 visit similar to the one implemented by the generic visitor, but
3895 we need more infrastructure to maintain two parallel trees --- so,
3896 it's easier to duplicate. */
3b0bdc72
UD
3897
3898static bin_tree_t *
0fd8ae9c 3899duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3b0bdc72 3900{
02f3550c
UD
3901 const bin_tree_t *node;
3902 bin_tree_t *dup_root;
3903 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3b0bdc72 3904
02f3550c 3905 for (node = root; ; )
3b0bdc72 3906 {
02f3550c
UD
3907 /* Create a new tree and link it back to the current parent. */
3908 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909 if (*p_new == NULL)
ee70274a 3910 return NULL;
02f3550c
UD
3911 (*p_new)->parent = dup_node;
3912 (*p_new)->token.duplicated = 1;
3913 dup_node = *p_new;
3b0bdc72 3914
02f3550c
UD
3915 /* Go to the left node, or up and to the right. */
3916 if (node->left)
3917 {
3918 node = node->left;
3919 p_new = &dup_node->left;
3920 }
3921 else
3922 {
3923 const bin_tree_t *prev = NULL;
3924 while (node->right == prev || node->right == NULL)
3925 {
3926 prev = node;
3927 node = node->parent;
3928 dup_node = dup_node->parent;
3929 if (!node)
21f5de55 3930 return dup_root;
02f3550c
UD
3931 }
3932 node = node->right;
3933 p_new = &dup_node->right;
3934 }
3b0bdc72 3935 }
3b0bdc72 3936}