]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
Implement waitpid in terms of wait4
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
04277e02 2 Copyright (C) 2002-2019 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6 17 License along with the GNU C Library; if not, see
eb04c213 18 <https://www.gnu.org/licenses/>. */
e054f494 19
8c0ab919
RM
20#ifdef _LIBC
21# include <locale/weight.h>
22#endif
23
3b0bdc72 24static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
01ed6ceb 25 size_t length, reg_syntax_t syntax);
3b0bdc72 26static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
27 const re_dfastate_t *init_state,
28 char *fastmap);
01ed6ceb 29static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
c0a0f9a3 30#ifdef RE_ENABLE_I18N
3b0bdc72 31static void free_charset (re_charset_t *cset);
c0a0f9a3 32#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
33static void free_workarea_compile (regex_t *preg);
34static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
35#ifdef RE_ENABLE_I18N
36static void optimize_utf8 (re_dfa_t *dfa);
37#endif
02f3550c 38static reg_errcode_t analyze (regex_t *preg);
02f3550c
UD
39static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
eb04c213
AZ
52static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
a7d5c291 54 unsigned int constraint);
3b0bdc72 55static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 56static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
eb04c213 57 Idx node, bool root);
963d8d78 58static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
eb04c213 59static Idx fetch_number (re_string_t *input, re_token_t *token,
15a7d175 60 reg_syntax_t syntax);
3b0bdc72 61static int peek_token (re_token_t *token, re_string_t *input,
b41bd5bc 62 reg_syntax_t syntax);
3b0bdc72 63static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 64 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 65static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175 66 re_token_t *token, reg_syntax_t syntax,
eb04c213 67 Idx nest, reg_errcode_t *err);
3b0bdc72 68static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175 69 re_token_t *token, reg_syntax_t syntax,
eb04c213 70 Idx nest, reg_errcode_t *err);
3b0bdc72 71static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175 72 re_token_t *token, reg_syntax_t syntax,
eb04c213 73 Idx nest, reg_errcode_t *err);
3b0bdc72 74static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175 75 re_token_t *token, reg_syntax_t syntax,
eb04c213 76 Idx nest, reg_errcode_t *err);
3b0bdc72 77static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 80static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
3b0bdc72 83static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
78d8b07a 87 reg_syntax_t syntax,
eb04c213 88 bool accept_hyphen);
3b0bdc72 89static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
90 re_string_t *regexp,
91 re_token_t *token);
c0a0f9a3 92#ifdef RE_ENABLE_I18N
2c05d33f 93static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175 94 re_charset_t *mbcset,
eb04c213 95 Idx *equiv_class_alloc,
15a7d175 96 const unsigned char *name);
997470b3 97static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 98 bitset_t sbcset,
15a7d175 99 re_charset_t *mbcset,
eb04c213
AZ
100 Idx *char_class_alloc,
101 const char *class_name,
15a7d175 102 reg_syntax_t syntax);
c0a0f9a3 103#else /* not RE_ENABLE_I18N */
2c05d33f 104static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175 105 const unsigned char *name);
997470b3 106static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 107 bitset_t sbcset,
eb04c213 108 const char *class_name,
15a7d175 109 reg_syntax_t syntax);
c0a0f9a3 110#endif /* not RE_ENABLE_I18N */
86576b62 111static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
997470b3 112 RE_TRANSLATE_TYPE trans,
eb04c213
AZ
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
ee70274a
UD
116static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
02f3550c
UD
118 re_token_type_t type);
119static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
3b0bdc72 122static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
02f3550c
UD
123static void free_token (re_token_t *node);
124static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
3b0bdc72
UD
126\f
127/* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be the same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
131
eb04c213 132static const char __re_error_msgid[] =
3b0bdc72
UD
133 {
134#define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
eb04c213 156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
3b0bdc72 157 "\0"
eb04c213 158#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
3b0bdc72
UD
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 };
185
eb04c213 186static const size_t __re_error_msgid_idx[] =
3b0bdc72
UD
187 {
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
205 };
206\f
207/* Entry points for GNU code. */
208
209/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
211 Returns 0 if the pattern was valid, otherwise an error string.
212
d3821ab0 213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
3b0bdc72
UD
214 are set in BUFP on entry. */
215
216const char *
9dd346ff
JM
217re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
3b0bdc72
UD
219{
220 reg_errcode_t ret;
221
3b0bdc72
UD
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
c06a6956
UD
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
3b0bdc72
UD
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
75e4a282 230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
231
232 if (!ret)
233 return NULL;
6455d255 234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 235}
3b0bdc72 236weak_alias (__re_compile_pattern, re_compile_pattern)
3b0bdc72 237
d3821ab0 238/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
3b0bdc72
UD
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241/* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243reg_syntax_t re_syntax_options;
244
245
246/* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
249
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
252
253reg_syntax_t
9dd346ff 254re_set_syntax (reg_syntax_t syntax)
3b0bdc72
UD
255{
256 reg_syntax_t ret = re_syntax_options;
257
258 re_syntax_options = syntax;
259 return ret;
260}
3b0bdc72 261weak_alias (__re_set_syntax, re_set_syntax)
3b0bdc72
UD
262
263int
9dd346ff 264re_compile_fastmap (struct re_pattern_buffer *bufp)
3b0bdc72 265{
eb04c213 266 re_dfa_t *dfa = bufp->buffer;
3b0bdc72
UD
267 char *fastmap = bufp->fastmap;
268
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
278 return 0;
279}
3b0bdc72 280weak_alias (__re_compile_fastmap, re_compile_fastmap)
3b0bdc72 281
/* Mark byte CH in FASTMAP.  When ICASE is true, the entry for
   tolower (CH) is marked as well.  */
static inline void
__attribute__ ((always_inline))
re_set_fastmap (char *fastmap, bool icase, int ch)
{
  /* Without case folding the second store simply repeats the first.  */
  const int folded = icase ? tolower (ch) : ch;

  fastmap[ch] = 1;
  fastmap[folded] = 1;
}
290
3b0bdc72
UD
291/* Helper function for re_compile_fastmap.
292 Compile fastmap for the initial_state INIT_STATE. */
293
294static void
0fd8ae9c
UD
295re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
296 char *fastmap)
3b0bdc72 297{
eb04c213
AZ
298 re_dfa_t *dfa = bufp->buffer;
299 Idx node_cnt;
300 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
301 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
302 {
eb04c213 303 Idx node = init_state->nodes.elems[node_cnt];
3b0bdc72 304 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
305
306 if (type == CHARACTER)
74e12fbc
UD
307 {
308 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309#ifdef RE_ENABLE_I18N
14744156 310 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 311 {
eb04c213
AZ
312 unsigned char buf[MB_LEN_MAX];
313 unsigned char *p;
74e12fbc
UD
314 wchar_t wc;
315 mbstate_t state;
316
317 p = buf;
318 *p++ = dfa->nodes[node].opr.c;
319 while (++node < dfa->nodes_len
320 && dfa->nodes[node].type == CHARACTER
321 && dfa->nodes[node].mb_partial)
322 *p++ = dfa->nodes[node].opr.c;
2c05d33f 323 memset (&state, '\0', sizeof (state));
b3918c7d
UD
324 if (__mbrtowc (&wc, (const char *) buf, p - buf,
325 &state) == p - buf
9dd6b779 326 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
88764ae2 327 != (size_t) -1))
eb04c213 328 re_set_fastmap (fastmap, false, buf[0]);
74e12fbc
UD
329 }
330#endif
331 }
3b0bdc72 332 else if (type == SIMPLE_BRACKET)
15a7d175 333 {
2c05d33f
UD
334 int i, ch;
335 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
336 {
337 int j;
338 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 if (w & ((bitset_word_t) 1 << j))
341 re_set_fastmap (fastmap, icase, ch);
342 }
15a7d175 343 }
c0a0f9a3 344#ifdef RE_ENABLE_I18N
3b0bdc72 345 else if (type == COMPLEX_BRACKET)
15a7d175 346 {
15a7d175 347 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
eb04c213 348 Idx i;
bdb56bac 349
c0a0f9a3 350# ifdef _LIBC
bdb56bac
UD
351 /* See if we have to try all bytes which start multiple collation
352 elements.
353 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 collation element, and don't catch 'b' since 'b' is
355 the only collation element which starts from 'b' (and
356 it is caught by SIMPLE_BRACKET). */
357 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 && (cset->ncoll_syms || cset->nranges))
15a7d175 359 {
15a7d175
UD
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2c05d33f
UD
362 for (i = 0; i < SBC_MAX; ++i)
363 if (table[i] < 0)
364 re_set_fastmap (fastmap, icase, i);
15a7d175 365 }
bdb56bac
UD
366# endif /* _LIBC */
367
368 /* See if we have to start the match at all multibyte characters,
369 i.e. where we would not find an invalid sequence. This only
370 applies to multibyte character sets; for single byte character
371 sets, the SIMPLE_BRACKET again suffices. */
372 if (dfa->mb_cur_max > 1
815d8147 373 && (cset->nchar_classes || cset->non_match || cset->nranges
bdb56bac
UD
374# ifdef _LIBC
375 || cset->nequiv_classes
376# endif /* _LIBC */
377 ))
15a7d175 378 {
bdb56bac
UD
379 unsigned char c = 0;
380 do
74e12fbc 381 {
bdb56bac
UD
382 mbstate_t mbs;
383 memset (&mbs, 0, sizeof (mbs));
384 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 re_set_fastmap (fastmap, false, (int) c);
74e12fbc 386 }
bdb56bac 387 while (++c != 0);
15a7d175 388 }
bdb56bac
UD
389
390 else
391 {
392 /* ... Else catch all bytes which can start the mbchars. */
393 for (i = 0; i < cset->nmbchars; ++i)
394 {
395 char buf[256];
396 mbstate_t state;
397 memset (&state, '\0', sizeof (state));
398 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
401 {
9dd6b779 402 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
bdb56bac
UD
403 != (size_t) -1)
404 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
405 }
aa732e2b
UD
406 }
407 }
15a7d175 408 }
c0a0f9a3 409#endif /* RE_ENABLE_I18N */
c0d5034e
UD
410 else if (type == OP_PERIOD
411#ifdef RE_ENABLE_I18N
412 || type == OP_UTF8_PERIOD
413#endif /* RE_ENABLE_I18N */
414 || type == END_OF_RE)
15a7d175
UD
415 {
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
420 }
3b0bdc72
UD
421 }
422}
423\f
424/* Entry point for POSIX code. */
425/* regcomp takes a regular expression as a string and compiles it.
426
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
429
d3821ab0
RM
430 'buffer' to the compiled pattern;
431 'used' to the length of the compiled pattern;
432 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
3b0bdc72
UD
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
d3821ab0
RM
435 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 'fastmap' to an allocated space for the fastmap;
437 'fastmap_accurate' to zero;
438 're_nsub' to the number of subexpressions in PATTERN.
3b0bdc72
UD
439
440 PATTERN is the address of the pattern string.
441
442 CFLAGS is a series of bits which affect compilation.
443
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
446
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
449
450 If REG_ICASE is set, then we consider upper- and lowercase
451 versions of letters to be equivalent when matching.
452
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
455 registers.
456
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
459
460int
c0feb731 461regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
3b0bdc72
UD
462{
463 reg_errcode_t ret;
464 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 465 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
466
467 preg->buffer = NULL;
468 preg->allocated = 0;
469 preg->used = 0;
470
471 /* Try to allocate space for the fastmap. */
472 preg->fastmap = re_malloc (char, SBC_MAX);
f4efbdfb 473 if (__glibc_unlikely (preg->fastmap == NULL))
3b0bdc72
UD
474 return REG_ESPACE;
475
476 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
477
478 /* If REG_NEWLINE is set, newlines are treated differently. */
479 if (cflags & REG_NEWLINE)
480 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
481 syntax &= ~RE_DOT_NEWLINE;
482 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483 /* It also changes the matching behavior. */
484 preg->newline_anchor = 1;
485 }
486 else
487 preg->newline_anchor = 0;
488 preg->no_sub = !!(cflags & REG_NOSUB);
489 preg->translate = NULL;
490
491 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492
493 /* POSIX doesn't distinguish between an unmatched open-group and an
494 unmatched close-group: both are REG_EPAREN. */
495 if (ret == REG_ERPAREN)
496 ret = REG_EPAREN;
497
a9388965 498 /* We have already checked preg->fastmap != NULL. */
f4efbdfb 499 if (__glibc_likely (ret == REG_NOERROR))
71ccd330 500 /* Compute the fastmap now, since regexec cannot modify the pattern
86576b62 501 buffer. This function never fails in this implementation. */
83b038f2 502 (void) re_compile_fastmap (preg);
71ccd330 503 else
3b0bdc72 504 {
71ccd330
UD
505 /* Some error occurred while compiling the expression. */
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
3b0bdc72
UD
508 }
509
510 return (int) ret;
511}
b68f8620 512libc_hidden_def (__regcomp)
3b0bdc72 513weak_alias (__regcomp, regcomp)
3b0bdc72
UD
514
515/* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
517
518size_t
c0feb731 519regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
9dd346ff 520 size_t errbuf_size)
3b0bdc72
UD
521{
522 const char *msg;
523 size_t msg_size;
f4efbdfb 524 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
3b0bdc72 525
f4efbdfb 526 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
3b0bdc72
UD
527 /* Only error codes returned by the rest of the code should be passed
528 to this routine. If we are given anything else, or if other regex
529 code generates an invalid error code, then the program has a bug.
530 Dump core so we can fix it. */
531 abort ();
532
6455d255 533 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
534
535 msg_size = strlen (msg) + 1; /* Includes the null. */
536
f4efbdfb 537 if (__glibc_likely (errbuf_size != 0))
3b0bdc72 538 {
eb04c213 539 size_t cpy_size = msg_size;
f4efbdfb 540 if (__glibc_unlikely (msg_size > errbuf_size))
15a7d175 541 {
eb04c213
AZ
542 cpy_size = errbuf_size - 1;
543 errbuf[cpy_size] = '\0';
15a7d175 544 }
eb04c213 545 memcpy (errbuf, msg, cpy_size);
3b0bdc72
UD
546 }
547
548 return msg_size;
549}
3b0bdc72 550weak_alias (__regerror, regerror)
3b0bdc72 551
3b0bdc72 552
e40a38b3
UD
553#ifdef RE_ENABLE_I18N
554/* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
2c05d33f 558static const bitset_t utf8_sb_map =
e40a38b3
UD
559{
560 /* Set the first 128 bits. */
eb04c213 561# if defined __GNUC__ && !defined __STRICT_ANSI__
2c05d33f 562 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
eb04c213
AZ
563# else
564# if 4 * BITSET_WORD_BITS < ASCII_CHARS
565# error "bitset_word_t is narrower than 32 bits"
566# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX, BITSET_WORD_MAX,
570# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571 BITSET_WORD_MAX,
572# endif
573 (BITSET_WORD_MAX
574 >> (SBC_MAX % BITSET_WORD_BITS == 0
575 ? 0
576 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577# endif
e40a38b3
UD
578};
579#endif
580
581
71ccd330
UD
582static void
583free_dfa_content (re_dfa_t *dfa)
3b0bdc72 584{
eb04c213 585 Idx i, j;
3b0bdc72 586
ee70274a
UD
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
02f3550c 589 free_token (dfa->nodes + i);
71ccd330
UD
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
592 {
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
599 }
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
3b0bdc72 604
ee70274a
UD
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
607 {
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
610 {
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
613 }
21f5de55 614 re_free (entry->array);
ee70274a 615 }
71ccd330 616 re_free (dfa->state_table);
65e6becf 617#ifdef RE_ENABLE_I18N
e40a38b3
UD
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
a96c63ed 620#endif
c06a6956 621 re_free (dfa->subexp_map);
0742e48e 622#ifdef DEBUG
71ccd330 623 re_free (dfa->re_str);
0742e48e 624#endif
71ccd330
UD
625
626 re_free (dfa);
627}
628
629
630/* Free dynamically allocated space used by PREG. */
631
632void
9dd346ff 633regfree (regex_t *preg)
71ccd330 634{
eb04c213 635 re_dfa_t *dfa = preg->buffer;
f4efbdfb 636 if (__glibc_likely (dfa != NULL))
eb04c213
AZ
637 {
638 lock_fini (dfa->lock);
639 free_dfa_content (dfa);
640 }
86576b62
UD
641 preg->buffer = NULL;
642 preg->allocated = 0;
71ccd330 643
3b0bdc72 644 re_free (preg->fastmap);
86576b62
UD
645 preg->fastmap = NULL;
646
647 re_free (preg->translate);
648 preg->translate = NULL;
3b0bdc72 649}
b68f8620 650libc_hidden_def (__regfree)
3b0bdc72 651weak_alias (__regfree, regfree)
3b0bdc72
UD
652\f
653/* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
655
656#if defined _REGEX_RE_COMP || defined _LIBC
657
658/* BSD has one and only one pattern buffer. */
659static struct re_pattern_buffer re_comp_buf;
660
661char *
662# ifdef _LIBC
663/* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
666weak_function
667# endif
80d9be81 668re_comp (const char *s)
3b0bdc72
UD
669{
670 reg_errcode_t ret;
240e87c2 671 char *fastmap;
3b0bdc72
UD
672
673 if (!s)
674 {
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
677 return 0;
678 }
679
240e87c2
RM
680 if (re_comp_buf.buffer)
681 {
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
1b2c2628 685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
686 re_comp_buf.fastmap = fastmap;
687 }
688
689 if (re_comp_buf.fastmap == NULL)
3b0bdc72 690 {
eb04c213 691 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
3b0bdc72 692 if (re_comp_buf.fastmap == NULL)
6455d255
UD
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
695 }
696
d3821ab0 697 /* Since 're_exec' always passes NULL for the 'regs' argument, we
3b0bdc72
UD
698 don't need to initialize the pattern buffer fields which affect it. */
699
700 /* Match anchors at newlines. */
701 re_comp_buf.newline_anchor = 1;
702
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704
705 if (!ret)
706 return NULL;
707
eb04c213 708 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
6455d255 709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 710}
240e87c2
RM
711
712#ifdef _LIBC
c877418f 713libc_freeres_fn (free_mem)
240e87c2
RM
714{
715 __regfree (&re_comp_buf);
716}
240e87c2
RM
717#endif
718
3b0bdc72
UD
719#endif /* _REGEX_RE_COMP */
720\f
721/* Internal entry point.
722 Compile the regular expression PATTERN, whose length is LENGTH.
723 SYNTAX indicates the regular expression's syntax. */
724
725static reg_errcode_t
0fd8ae9c
UD
726re_compile_internal (regex_t *preg, const char * pattern, size_t length,
727 reg_syntax_t syntax)
3b0bdc72
UD
728{
729 reg_errcode_t err = REG_NOERROR;
730 re_dfa_t *dfa;
731 re_string_t regexp;
732
733 /* Initialize the pattern buffer. */
734 preg->fastmap_accurate = 0;
735 preg->syntax = syntax;
736 preg->not_bol = preg->not_eol = 0;
737 preg->used = 0;
738 preg->re_nsub = 0;
1b2c2628
UD
739 preg->can_be_null = 0;
740 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
741
742 /* Initialize the dfa. */
eb04c213 743 dfa = preg->buffer;
f4efbdfb 744 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
3b0bdc72
UD
745 {
746 /* If zero allocated, but buffer is non-null, try to realloc
747 enough space. This loses if buffer's address is bogus, but
748 that is the user's responsibility. If ->buffer is NULL this
749 is a simple allocation. */
750 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
751 if (dfa == NULL)
752 return REG_ESPACE;
3b0bdc72 753 preg->allocated = sizeof (re_dfa_t);
eb04c213 754 preg->buffer = dfa;
3b0bdc72 755 }
3b0bdc72
UD
756 preg->used = sizeof (re_dfa_t);
757
758 err = init_dfa (dfa, length);
f4efbdfb 759 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
eb04c213 760 err = REG_ESPACE;
f4efbdfb 761 if (__glibc_unlikely (err != REG_NOERROR))
3b0bdc72 762 {
ee70274a 763 free_dfa_content (dfa);
3b0bdc72 764 preg->buffer = NULL;
ca3c505e 765 preg->allocated = 0;
3b0bdc72
UD
766 return err;
767 }
0742e48e 768#ifdef DEBUG
01ed6ceb 769 /* Note: length+1 will not overflow since it is checked in init_dfa. */
0742e48e
UD
770 dfa->re_str = re_malloc (char, length + 1);
771 strncpy (dfa->re_str, pattern, length + 1);
772#endif
3b0bdc72 773
612546c6 774 err = re_string_construct (&regexp, pattern, length, preg->translate,
eb04c213 775 (syntax & RE_ICASE) != 0, dfa);
f4efbdfb 776 if (__glibc_unlikely (err != REG_NOERROR))
3b0bdc72 777 {
ee70274a
UD
778 re_compile_internal_free_return:
779 free_workarea_compile (preg);
780 re_string_destruct (&regexp);
eb04c213 781 lock_fini (dfa->lock);
ee70274a 782 free_dfa_content (dfa);
3b0bdc72 783 preg->buffer = NULL;
ca3c505e 784 preg->allocated = 0;
3b0bdc72
UD
785 return err;
786 }
787
788 /* Parse the regular expression, and build a structure tree. */
789 preg->re_nsub = 0;
790 dfa->str_tree = parse (&regexp, preg, syntax, &err);
f4efbdfb 791 if (__glibc_unlikely (dfa->str_tree == NULL))
3b0bdc72
UD
792 goto re_compile_internal_free_return;
793
02f3550c
UD
794 /* Analyze the tree and create the nfa. */
795 err = analyze (preg);
f4efbdfb 796 if (__glibc_unlikely (err != REG_NOERROR))
02f3550c
UD
797 goto re_compile_internal_free_return;
798
14744156
UD
799#ifdef RE_ENABLE_I18N
800 /* If possible, do searching in single byte encoding to speed things up. */
ad7f28c2 801 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
14744156
UD
802 optimize_utf8 (dfa);
803#endif
804
3b0bdc72
UD
805 /* Then create the initial state of the dfa. */
806 err = create_initial_state (dfa);
3b0bdc72 807
3b0bdc72
UD
808 /* Release work areas. */
809 free_workarea_compile (preg);
810 re_string_destruct (&regexp);
811
f4efbdfb 812 if (__glibc_unlikely (err != REG_NOERROR))
71ccd330 813 {
eb04c213 814 lock_fini (dfa->lock);
71ccd330
UD
815 free_dfa_content (dfa);
816 preg->buffer = NULL;
ca3c505e 817 preg->allocated = 0;
71ccd330
UD
818 }
819
3b0bdc72
UD
820 return err;
821}
822
823/* Initialize DFA. We use the length of the regular expression PAT_LEN
824 as the initial length of some arrays. */
825
826static reg_errcode_t
0fd8ae9c 827init_dfa (re_dfa_t *dfa, size_t pat_len)
3b0bdc72 828{
eb04c213 829 __re_size_t table_size;
e40a38b3 830#ifndef _LIBC
eb04c213
AZ
831 const char *codeset_name;
832#endif
833#ifdef RE_ENABLE_I18N
834 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835#else
836 size_t max_i18n_object_size = 0;
e40a38b3 837#endif
eb04c213
AZ
838 size_t max_object_size =
839 MAX (sizeof (struct re_state_table_entry),
840 MAX (sizeof (re_token_t),
841 MAX (sizeof (re_node_set),
842 MAX (sizeof (regmatch_t),
843 max_i18n_object_size))));
81c64d40
UD
844
845 memset (dfa, '\0', sizeof (re_dfa_t));
846
ee70274a
UD
847 /* Force allocation of str_tree_storage the first time. */
848 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849
eb04c213
AZ
850 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
851 calculation below, and for similar doubling calculations
852 elsewhere. And it's <= rather than <, because some of the
853 doubling calculations add 1 afterwards. */
f4efbdfb
PE
854 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
855 <= pat_len))
01ed6ceb
UD
856 return REG_ESPACE;
857
3b0bdc72
UD
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860
3b0bdc72 861 /* table_size = 2 ^ ceil(log pat_len) */
01ed6ceb 862 for (table_size = 1; ; table_size <<= 1)
3b0bdc72
UD
863 if (table_size > pat_len)
864 break;
865
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
868
3c0fb574
UD
869 dfa->mb_cur_max = MB_CUR_MAX;
870#ifdef _LIBC
bb3f4825 871 if (dfa->mb_cur_max == 6
3c0fb574
UD
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->is_utf8 = 1;
f0c7c524
UD
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 != 0);
e40a38b3 876#else
e40a38b3 877 codeset_name = nl_langinfo (CODESET);
eb04c213
AZ
878 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
879 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
880 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
881 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
e40a38b3
UD
882 dfa->is_utf8 = 1;
883
884 /* We check exhaustively in the loop below if this charset is a
885 superset of ASCII. */
886 dfa->map_notascii = 0;
3c0fb574 887#endif
e40a38b3 888
65e6becf
UD
889#ifdef RE_ENABLE_I18N
890 if (dfa->mb_cur_max > 1)
891 {
65e6becf 892 if (dfa->is_utf8)
e40a38b3 893 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
65e6becf 894 else
e40a38b3
UD
895 {
896 int i, j, ch;
897
2c05d33f 898 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
f4efbdfb 899 if (__glibc_unlikely (dfa->sb_char == NULL))
e40a38b3
UD
900 return REG_ESPACE;
901
2c05d33f
UD
902 /* Set the bits corresponding to single byte chars. */
903 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
904 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
e40a38b3 905 {
2d87db5b 906 wint_t wch = __btowc (ch);
e40a38b3 907 if (wch != WEOF)
2c05d33f 908 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
e40a38b3 909# ifndef _LIBC
2d87db5b 910 if (isascii (ch) && wch != ch)
e40a38b3
UD
911 dfa->map_notascii = 1;
912# endif
913 }
914 }
65e6becf
UD
915 }
916#endif
3c0fb574 917
f4efbdfb 918 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
ee70274a 919 return REG_ESPACE;
3b0bdc72
UD
920 return REG_NOERROR;
921}
922
923/* Initialize WORD_CHAR table, which indicate which character is
924 "word". In this case "word" means that it is the word construction
925 character used by some operators like "\<", "\>", etc. */
926
56b168be 927static void
0fd8ae9c 928init_word_char (re_dfa_t *dfa)
3b0bdc72 929{
9f115170 930 int i = 0;
eb04c213 931 int j;
9f115170 932 int ch = 0;
eb04c213 933 dfa->word_ops_used = 1;
f4efbdfb 934 if (__glibc_likely (dfa->map_notascii == 0))
9f115170 935 {
0285e6bd
PE
936 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
937 them, an issue when this code is used in Gnulib. */
567d8c1f
PE
938 bitset_word_t bits0 = 0x00000000;
939 bitset_word_t bits1 = 0x03ff0000;
940 bitset_word_t bits2 = 0x87fffffe;
941 bitset_word_t bits3 = 0x07fffffe;
942 if (BITSET_WORD_BITS == 64)
9f115170 943 {
0285e6bd 944 /* Pacify gcc -Woverflow on 32-bit platformns. */
567d8c1f
PE
945 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
946 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
9f115170
UD
947 i = 2;
948 }
567d8c1f 949 else if (BITSET_WORD_BITS == 32)
9f115170 950 {
567d8c1f
PE
951 dfa->word_char[0] = bits0;
952 dfa->word_char[1] = bits1;
953 dfa->word_char[2] = bits2;
954 dfa->word_char[3] = bits3;
9f115170
UD
955 i = 4;
956 }
957 else
567d8c1f 958 goto general_case;
9f115170
UD
959 ch = 128;
960
f4efbdfb 961 if (__glibc_likely (dfa->is_utf8))
9f115170
UD
962 {
963 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964 return;
965 }
966 }
967
567d8c1f 968 general_case:
9f115170 969 for (; i < BITSET_WORDS; ++i)
eb04c213 970 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3b0bdc72 971 if (isalnum (ch) || ch == '_')
2c05d33f 972 dfa->word_char[i] |= (bitset_word_t) 1 << j;
3b0bdc72
UD
973}
974
975/* Free the work area which are only used while compiling. */
976
977static void
0fd8ae9c 978free_workarea_compile (regex_t *preg)
3b0bdc72 979{
eb04c213 980 re_dfa_t *dfa = preg->buffer;
ee70274a
UD
981 bin_tree_storage_t *storage, *next;
982 for (storage = dfa->str_tree_storage; storage; storage = next)
983 {
984 next = storage->next;
985 re_free (storage);
986 }
987 dfa->str_tree_storage = NULL;
988 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3b0bdc72 989 dfa->str_tree = NULL;
a7d5c291
UD
990 re_free (dfa->org_indices);
991 dfa->org_indices = NULL;
3b0bdc72
UD
992}
993
994/* Create initial states for all contexts. */
995
996static reg_errcode_t
0fd8ae9c 997create_initial_state (re_dfa_t *dfa)
3b0bdc72 998{
eb04c213 999 Idx first, i;
3b0bdc72
UD
1000 reg_errcode_t err;
1001 re_node_set init_nodes;
1002
1003 /* Initial states have the epsilon closure of the node which is
1004 the first node of the regular expression. */
02f3550c 1005 first = dfa->str_tree->first->node_idx;
3b0bdc72
UD
1006 dfa->init_node = first;
1007 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
f4efbdfb 1008 if (__glibc_unlikely (err != REG_NOERROR))
3b0bdc72
UD
1009 return err;
1010
1011 /* The back-references which are in initial states can epsilon transit,
1012 since in this case all of the subexpressions can be null.
1013 Then we add epsilon closures of the nodes which are the next nodes of
1014 the back-references. */
1015 if (dfa->nbackref > 0)
1016 for (i = 0; i < init_nodes.nelem; ++i)
1017 {
eb04c213 1018 Idx node_idx = init_nodes.elems[i];
15a7d175
UD
1019 re_token_type_t type = dfa->nodes[node_idx].type;
1020
eb04c213 1021 Idx clexp_idx;
15a7d175
UD
1022 if (type != OP_BACK_REF)
1023 continue;
1024 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025 {
1026 re_token_t *clexp_node;
1027 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028 if (clexp_node->type == OP_CLOSE_SUBEXP
ae73c6c1 1029 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
15a7d175
UD
1030 break;
1031 }
1032 if (clexp_idx == init_nodes.nelem)
1033 continue;
1034
1035 if (type == OP_BACK_REF)
1036 {
eb04c213 1037 Idx dest_idx = dfa->edests[node_idx].elems[0];
15a7d175
UD
1038 if (!re_node_set_contains (&init_nodes, dest_idx))
1039 {
eb04c213
AZ
1040 reg_errcode_t merge_err
1041 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042 if (merge_err != REG_NOERROR)
1043 return merge_err;
15a7d175
UD
1044 i = 0;
1045 }
1046 }
3b0bdc72
UD
1047 }
1048
1049 /* It must be the first time to invoke acquire_state. */
a9388965
UD
1050 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051 /* We don't check ERR here, since the initial state must not be NULL. */
f4efbdfb 1052 if (__glibc_unlikely (dfa->init_state == NULL))
a9388965 1053 return err;
3b0bdc72
UD
1054 if (dfa->init_state->has_constraint)
1055 {
a9388965 1056 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1057 CONTEXT_WORD);
a9388965 1058 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1059 CONTEXT_NEWLINE);
a9388965 1060 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
1061 &init_nodes,
1062 CONTEXT_NEWLINE
1063 | CONTEXT_BEGBUF);
f4efbdfb
PE
1064 if (__glibc_unlikely (dfa->init_state_word == NULL
1065 || dfa->init_state_nl == NULL
1066 || dfa->init_state_begbuf == NULL))
15a7d175 1067 return err;
3b0bdc72
UD
1068 }
1069 else
1070 dfa->init_state_word = dfa->init_state_nl
1071 = dfa->init_state_begbuf = dfa->init_state;
1072
3b0bdc72
UD
1073 re_node_set_free (&init_nodes);
1074 return REG_NOERROR;
1075}
1076\f
14744156
UD
#ifdef RE_ENABLE_I18N
/* If it is possible to do searching in single byte encoding instead of UTF-8
   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
   DFA nodes where needed.  The function returns without changing
   anything as soon as it meets a node it cannot handle.  */

static void
optimize_utf8 (re_dfa_t *dfa)
{
  Idx n;
  int w;
  bool saw_non_ascii_char = false;
  bool saw_period = false;

  /* First pass: verify that every node is compatible with byte-wise
     matching, and note what needs rewriting.  */
  for (n = 0; n < dfa->nodes_len; ++n)
    switch (dfa->nodes[n].type)
      {
      case CHARACTER:
	if (dfa->nodes[n].opr.c >= ASCII_CHARS)
	  saw_non_ascii_char = true;
	break;

      case ANCHOR:
	/* Word anchors etc. cannot be handled.  It's okay to test
	   opr.ctx_type since constraints (for all DFA nodes) are
	   created by ORing one or more opr.ctx_type values.  */
	if (dfa->nodes[n].opr.ctx_type != LINE_FIRST
	    && dfa->nodes[n].opr.ctx_type != LINE_LAST
	    && dfa->nodes[n].opr.ctx_type != BUF_FIRST
	    && dfa->nodes[n].opr.ctx_type != BUF_LAST)
	  return;
	break;

      case OP_PERIOD:
	saw_period = true;
	break;

      case OP_BACK_REF:
      case OP_ALT:
      case END_OF_RE:
      case OP_DUP_ASTERISK:
      case OP_OPEN_SUBEXP:
      case OP_CLOSE_SUBEXP:
	break;

      case COMPLEX_BRACKET:
	return;

      case SIMPLE_BRACKET:
	/* Just double check: give up if any bit at or above ASCII_CHARS
	   is set in the bracket's bitset.  */
	{
	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
			? 0
			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
	  for (w = ASCII_CHARS / BITSET_WORD_BITS; w < BITSET_WORDS; ++w)
	    {
	      if (dfa->nodes[n].opr.sbcset[w] >> rshift != 0)
		return;
	      /* Only the first word examined is shifted.  */
	      rshift = 0;
	    }
	}
	break;

      default:
	abort ();
      }

  /* Second pass: rewrite the nodes that depended on multibyte
     handling.  */
  if (saw_non_ascii_char || saw_period)
    for (n = 0; n < dfa->nodes_len; ++n)
      {
	re_token_t *tok = &dfa->nodes[n];
	if (tok->type == CHARACTER && tok->opr.c >= ASCII_CHARS)
	  tok->mb_partial = 0;
	else if (tok->type == OP_PERIOD)
	  tok->type = OP_UTF8_PERIOD;
      }

  /* The search can be in single byte locale.  */
  dfa->mb_cur_max = 1;
  dfa->is_utf8 = 0;
  dfa->has_mb_node = dfa->nbackref > 0 || saw_period;
}
#endif
1158\f
3b0bdc72
UD
1159/* Analyze the structure tree, and calculate "first", "next", "edest",
1160 "eclosure", and "inveclosure". */
1161
1162static reg_errcode_t
0fd8ae9c 1163analyze (regex_t *preg)
3b0bdc72 1164{
eb04c213 1165 re_dfa_t *dfa = preg->buffer;
3b0bdc72
UD
1166 reg_errcode_t ret;
1167
1168 /* Allocate arrays. */
eb04c213
AZ
1169 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
3b0bdc72
UD
1171 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
f4efbdfb
PE
1173 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174 || dfa->edests == NULL || dfa->eclosures == NULL))
3b0bdc72 1175 return REG_ESPACE;
02f3550c 1176
eb04c213 1177 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
02f3550c 1178 if (dfa->subexp_map != NULL)
3b0bdc72 1179 {
eb04c213 1180 Idx i;
02f3550c
UD
1181 for (i = 0; i < preg->re_nsub; i++)
1182 dfa->subexp_map[i] = i;
1183 preorder (dfa->str_tree, optimize_subexps, dfa);
1184 for (i = 0; i < preg->re_nsub; i++)
1185 if (dfa->subexp_map[i] != i)
1186 break;
1187 if (i == preg->re_nsub)
1188 {
eb04c213 1189 re_free (dfa->subexp_map);
02f3550c
UD
1190 dfa->subexp_map = NULL;
1191 }
3b0bdc72
UD
1192 }
1193
02f3550c 1194 ret = postorder (dfa->str_tree, lower_subexps, preg);
f4efbdfb 1195 if (__glibc_unlikely (ret != REG_NOERROR))
02f3550c
UD
1196 return ret;
1197 ret = postorder (dfa->str_tree, calc_first, dfa);
f4efbdfb 1198 if (__glibc_unlikely (ret != REG_NOERROR))
02f3550c
UD
1199 return ret;
1200 preorder (dfa->str_tree, calc_next, dfa);
1201 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
f4efbdfb 1202 if (__glibc_unlikely (ret != REG_NOERROR))
02f3550c
UD
1203 return ret;
1204 ret = calc_eclosure (dfa);
f4efbdfb 1205 if (__glibc_unlikely (ret != REG_NOERROR))
02f3550c 1206 return ret;
963d8d78
UD
1207
1208 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1209 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1210 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1211 || dfa->nbackref)
1212 {
1213 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
f4efbdfb 1214 if (__glibc_unlikely (dfa->inveclosures == NULL))
21f5de55 1215 return REG_ESPACE;
963d8d78
UD
1216 ret = calc_inveclosure (dfa);
1217 }
1218
02f3550c
UD
1219 return ret;
1220}
1221
1222/* Our parse trees are very unbalanced, so we cannot use a stack to
1223 implement parse tree visits. Instead, we use parent pointers and
1224 some hairy code in these two functions. */
1225static reg_errcode_t
0fd8ae9c
UD
1226postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1227 void *extra)
02f3550c
UD
1228{
1229 bin_tree_t *node, *prev;
1230
1231 for (node = root; ; )
3b0bdc72 1232 {
02f3550c
UD
1233 /* Descend down the tree, preferably to the left (or to the right
1234 if that's the only child). */
1235 while (node->left || node->right)
1236 if (node->left)
21f5de55
PE
1237 node = node->left;
1238 else
1239 node = node->right;
02f3550c
UD
1240
1241 do
1242 {
1243 reg_errcode_t err = fn (extra, node);
f4efbdfb 1244 if (__glibc_unlikely (err != REG_NOERROR))
02f3550c 1245 return err;
21f5de55 1246 if (node->parent == NULL)
02f3550c
UD
1247 return REG_NOERROR;
1248 prev = node;
1249 node = node->parent;
1250 }
1251 /* Go up while we have a node that is reached from the right. */
1252 while (node->right == prev || node->right == NULL);
1253 node = node->right;
3b0bdc72 1254 }
3b0bdc72
UD
1255}
1256
02f3550c 1257static reg_errcode_t
0fd8ae9c
UD
1258preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1259 void *extra)
02f3550c
UD
1260{
1261 bin_tree_t *node;
1262
1263 for (node = root; ; )
1264 {
1265 reg_errcode_t err = fn (extra, node);
f4efbdfb 1266 if (__glibc_unlikely (err != REG_NOERROR))
02f3550c 1267 return err;
0ecb606c 1268
02f3550c
UD
1269 /* Go to the left node, or up and to the right. */
1270 if (node->left)
1271 node = node->left;
1272 else
1273 {
1274 bin_tree_t *prev = NULL;
1275 while (node->right == prev || node->right == NULL)
1276 {
1277 prev = node;
1278 node = node->parent;
1279 if (!node)
21f5de55 1280 return REG_NOERROR;
02f3550c
UD
1281 }
1282 node = node->right;
1283 }
1284 }
1285}
1286
1287/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1288 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1289 backreferences as well. Requires a preorder visit. */
0ecb606c 1290static reg_errcode_t
0fd8ae9c 1291optimize_subexps (void *extra, bin_tree_t *node)
0ecb606c 1292{
02f3550c 1293 re_dfa_t *dfa = (re_dfa_t *) extra;
0ecb606c 1294
02f3550c 1295 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3b0bdc72 1296 {
02f3550c
UD
1297 int idx = node->token.opr.idx;
1298 node->token.opr.idx = dfa->subexp_map[idx];
1299 dfa->used_bkref_map |= 1 << node->token.opr.idx;
0ecb606c 1300 }
02f3550c
UD
1301
1302 else if (node->token.type == SUBEXP
21f5de55 1303 && node->left && node->left->token.type == SUBEXP)
0ecb606c 1304 {
eb04c213 1305 Idx other_idx = node->left->token.opr.idx;
02f3550c
UD
1306
1307 node->left = node->left->left;
1308 if (node->left)
21f5de55 1309 node->left->parent = node;
02f3550c
UD
1310
1311 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
2c05d33f 1312 if (other_idx < BITSET_WORD_BITS)
eb04c213 1313 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3b0bdc72 1314 }
02f3550c 1315
3b0bdc72
UD
1316 return REG_NOERROR;
1317}
1318
02f3550c
UD
1319/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1320 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1321static reg_errcode_t
0fd8ae9c 1322lower_subexps (void *extra, bin_tree_t *node)
3b0bdc72 1323{
02f3550c
UD
1324 regex_t *preg = (regex_t *) extra;
1325 reg_errcode_t err = REG_NOERROR;
3b0bdc72 1326
02f3550c 1327 if (node->left && node->left->token.type == SUBEXP)
0ecb606c 1328 {
02f3550c
UD
1329 node->left = lower_subexp (&err, preg, node->left);
1330 if (node->left)
1331 node->left->parent = node;
1332 }
1333 if (node->right && node->right->token.type == SUBEXP)
1334 {
1335 node->right = lower_subexp (&err, preg, node->right);
1336 if (node->right)
1337 node->right->parent = node;
3b0bdc72 1338 }
3b0bdc72 1339
02f3550c
UD
1340 return err;
1341}
3b0bdc72 1342
02f3550c 1343static bin_tree_t *
0fd8ae9c 1344lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
0ecb606c 1345{
eb04c213 1346 re_dfa_t *dfa = preg->buffer;
02f3550c
UD
1347 bin_tree_t *body = node->left;
1348 bin_tree_t *op, *cls, *tree1, *tree;
1349
1350 if (preg->no_sub
744eb12b
UD
1351 /* We do not optimize empty subexpressions, because otherwise we may
1352 have bad CONCAT nodes with NULL children. This is obviously not
1353 very common, so we do not lose much. An example that triggers
1354 this case is the sed "script" /\(\)/x. */
1355 && node->left != NULL
2c05d33f
UD
1356 && (node->token.opr.idx >= BITSET_WORD_BITS
1357 || !(dfa->used_bkref_map
1358 & ((bitset_word_t) 1 << node->token.opr.idx))))
02f3550c
UD
1359 return node->left;
1360
1361 /* Convert the SUBEXP node to the concatenation of an
1362 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1363 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366 tree = create_tree (dfa, op, tree1, CONCAT);
f4efbdfb
PE
1367 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368 || op == NULL || cls == NULL))
0ecb606c 1369 {
02f3550c
UD
1370 *err = REG_ESPACE;
1371 return NULL;
0ecb606c 1372 }
3b0bdc72 1373
02f3550c
UD
1374 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376 return tree;
1377}
a334319f 1378
02f3550c
UD
1379/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1380 nodes. Requires a postorder visit. */
1381static reg_errcode_t
0fd8ae9c 1382calc_first (void *extra, bin_tree_t *node)
02f3550c
UD
1383{
1384 re_dfa_t *dfa = (re_dfa_t *) extra;
1385 if (node->token.type == CONCAT)
1386 {
1387 node->first = node->left->first;
1388 node->node_idx = node->left->node_idx;
1389 }
1390 else
1391 {
1392 node->first = node;
1393 node->node_idx = re_dfa_add_node (dfa, node->token);
f4efbdfb 1394 if (__glibc_unlikely (node->node_idx == -1))
21f5de55 1395 return REG_ESPACE;
0caca71a 1396 if (node->token.type == ANCHOR)
21f5de55 1397 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
02f3550c
UD
1398 }
1399 return REG_NOERROR;
1400}
1401
1402/* Pass 2: compute NEXT on the tree. Preorder visit. */
1403static reg_errcode_t
0fd8ae9c 1404calc_next (void *extra, bin_tree_t *node)
02f3550c
UD
1405{
1406 switch (node->token.type)
3b0bdc72
UD
1407 {
1408 case OP_DUP_ASTERISK:
02f3550c 1409 node->left->next = node;
3b0bdc72
UD
1410 break;
1411 case CONCAT:
02f3550c
UD
1412 node->left->next = node->right->first;
1413 node->right->next = node->next;
1414 break;
3b0bdc72 1415 default:
02f3550c
UD
1416 if (node->left)
1417 node->left->next = node->next;
1418 if (node->right)
21f5de55 1419 node->right->next = node->next;
3b0bdc72
UD
1420 break;
1421 }
02f3550c 1422 return REG_NOERROR;
3b0bdc72
UD
1423}
1424
02f3550c
UD
1425/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1426static reg_errcode_t
0fd8ae9c 1427link_nfa_nodes (void *extra, bin_tree_t *node)
a334319f 1428{
02f3550c 1429 re_dfa_t *dfa = (re_dfa_t *) extra;
eb04c213 1430 Idx idx = node->node_idx;
02f3550c
UD
1431 reg_errcode_t err = REG_NOERROR;
1432
1433 switch (node->token.type)
3b0bdc72 1434 {
02f3550c
UD
1435 case CONCAT:
1436 break;
1437
1438 case END_OF_RE:
2a0356e1 1439 DEBUG_ASSERT (node->next == NULL);
02f3550c
UD
1440 break;
1441
1442 case OP_DUP_ASTERISK:
1443 case OP_ALT:
1444 {
eb04c213 1445 Idx left, right;
02f3550c
UD
1446 dfa->has_plural_match = 1;
1447 if (node->left != NULL)
1448 left = node->left->first->node_idx;
1449 else
1450 left = node->next->node_idx;
1451 if (node->right != NULL)
1452 right = node->right->first->node_idx;
1453 else
1454 right = node->next->node_idx;
2a0356e1
AZ
1455 DEBUG_ASSERT (left > -1);
1456 DEBUG_ASSERT (right > -1);
02f3550c
UD
1457 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1458 }
1459 break;
1460
1461 case ANCHOR:
1462 case OP_OPEN_SUBEXP:
1463 case OP_CLOSE_SUBEXP:
1464 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465 break;
1466
1467 case OP_BACK_REF:
1468 dfa->nexts[idx] = node->next->node_idx;
1469 if (node->token.type == OP_BACK_REF)
2da42bc0 1470 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
02f3550c
UD
1471 break;
1472
1473 default:
2a0356e1 1474 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
02f3550c
UD
1475 dfa->nexts[idx] = node->next->node_idx;
1476 break;
3b0bdc72 1477 }
02f3550c
UD
1478
1479 return err;
3b0bdc72
UD
1480}
1481
485d775d
UD
1482/* Duplicate the epsilon closure of the node ROOT_NODE.
1483 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484 to their own constraint. */
1485
1486static reg_errcode_t
eb04c213
AZ
1487duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488 Idx root_node, unsigned int init_constraint)
485d775d 1489{
eb04c213
AZ
1490 Idx org_node, clone_node;
1491 bool ok;
485d775d
UD
1492 unsigned int constraint = init_constraint;
1493 for (org_node = top_org_node, clone_node = top_clone_node;;)
1494 {
eb04c213 1495 Idx org_dest, clone_dest;
485d775d 1496 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1497 {
485d775d
UD
1498 /* If the back reference epsilon-transit, its destination must
1499 also have the constraint. Then duplicate the epsilon closure
1500 of the destination of the back reference, and store it in
1501 edests of the back reference. */
15a7d175
UD
1502 org_dest = dfa->nexts[org_node];
1503 re_node_set_empty (dfa->edests + clone_node);
2d87db5b 1504 clone_dest = duplicate_node (dfa, org_dest, constraint);
f4efbdfb 1505 if (__glibc_unlikely (clone_dest == -1))
2d87db5b 1506 return REG_ESPACE;
15a7d175 1507 dfa->nexts[clone_node] = dfa->nexts[org_node];
eb04c213 1508 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1509 if (__glibc_unlikely (! ok))
15a7d175
UD
1510 return REG_ESPACE;
1511 }
485d775d 1512 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1513 {
485d775d
UD
1514 /* In case of the node can't epsilon-transit, don't duplicate the
1515 destination and store the original destination as the
1516 destination of the node. */
15a7d175
UD
1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 break;
1519 }
485d775d 1520 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1521 {
485d775d
UD
1522 /* In case of the node can epsilon-transit, and it has only one
1523 destination. */
15a7d175
UD
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
d3821ab0 1526 /* If the node is root_node itself, it means the epsilon closure
eb04c213 1527 has a loop. Then tie it to the destination of the root_node. */
0caca71a 1528 if (org_node == root_node && clone_node != org_node)
15a7d175 1529 {
eb04c213 1530 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
f4efbdfb 1531 if (__glibc_unlikely (! ok))
eb04c213 1532 return REG_ESPACE;
0caca71a 1533 break;
15a7d175 1534 }
d3821ab0 1535 /* In case the node has another constraint, append it. */
0caca71a 1536 constraint |= dfa->nodes[org_node].constraint;
2d87db5b 1537 clone_dest = duplicate_node (dfa, org_dest, constraint);
f4efbdfb 1538 if (__glibc_unlikely (clone_dest == -1))
2d87db5b 1539 return REG_ESPACE;
eb04c213 1540 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1541 if (__glibc_unlikely (! ok))
15a7d175
UD
1542 return REG_ESPACE;
1543 }
485d775d 1544 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1545 {
485d775d 1546 /* In case of the node can epsilon-transit, and it has two
02f3550c 1547 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
15a7d175
UD
1548 org_dest = dfa->edests[org_node].elems[0];
1549 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1550 /* Search for a duplicated node which satisfies the constraint. */
1551 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552 if (clone_dest == -1)
1553 {
0caca71a 1554 /* There is no such duplicated node, create a new one. */
2d87db5b
UD
1555 reg_errcode_t err;
1556 clone_dest = duplicate_node (dfa, org_dest, constraint);
f4efbdfb 1557 if (__glibc_unlikely (clone_dest == -1))
2d87db5b 1558 return REG_ESPACE;
eb04c213 1559 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1560 if (__glibc_unlikely (! ok))
7de66108
UD
1561 return REG_ESPACE;
1562 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563 root_node, constraint);
f4efbdfb 1564 if (__glibc_unlikely (err != REG_NOERROR))
7de66108 1565 return err;
a7d5c291
UD
1566 }
1567 else
1568 {
0caca71a 1569 /* There is a duplicated node which satisfies the constraint,
a7d5c291 1570 use it to avoid infinite loop. */
eb04c213 1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1572 if (__glibc_unlikely (! ok))
a7d5c291
UD
1573 return REG_ESPACE;
1574 }
15a7d175
UD
1575
1576 org_dest = dfa->edests[org_node].elems[1];
2d87db5b 1577 clone_dest = duplicate_node (dfa, org_dest, constraint);
f4efbdfb 1578 if (__glibc_unlikely (clone_dest == -1))
2d87db5b 1579 return REG_ESPACE;
eb04c213 1580 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1581 if (__glibc_unlikely (! ok))
15a7d175
UD
1582 return REG_ESPACE;
1583 }
485d775d
UD
1584 org_node = org_dest;
1585 clone_node = clone_dest;
1586 }
1587 return REG_NOERROR;
1588}
1589
a7d5c291
UD
1590/* Search for a node which is duplicated from the node ORG_NODE, and
1591 satisfies the constraint CONSTRAINT. */
1592
eb04c213
AZ
1593static Idx
1594search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
0fd8ae9c 1595 unsigned int constraint)
a7d5c291 1596{
eb04c213 1597 Idx idx;
a7d5c291
UD
1598 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1599 {
1600 if (org_node == dfa->org_indices[idx]
1601 && constraint == dfa->nodes[idx].constraint)
1602 return idx; /* Found. */
1603 }
1604 return -1; /* Not found. */
1605}
1606
a9388965 1607/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
2d87db5b
UD
1608 Return the index of the new node, or -1 if insufficient storage is
1609 available. */
a9388965 1610
eb04c213
AZ
1611static Idx
1612duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
3b0bdc72 1613{
eb04c213 1614 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
f4efbdfb 1615 if (__glibc_likely (dup_idx != -1))
2d87db5b
UD
1616 {
1617 dfa->nodes[dup_idx].constraint = constraint;
0caca71a 1618 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
2d87db5b
UD
1619 dfa->nodes[dup_idx].duplicated = 1;
1620
1621 /* Store the index of the original node. */
1622 dfa->org_indices[dup_idx] = org_idx;
1623 }
1624 return dup_idx;
3b0bdc72
UD
1625}
1626
963d8d78 1627static reg_errcode_t
0fd8ae9c 1628calc_inveclosure (re_dfa_t *dfa)
3b0bdc72 1629{
eb04c213
AZ
1630 Idx src, idx;
1631 bool ok;
963d8d78
UD
1632 for (idx = 0; idx < dfa->nodes_len; ++idx)
1633 re_node_set_init_empty (dfa->inveclosures + idx);
1634
3b0bdc72
UD
1635 for (src = 0; src < dfa->nodes_len; ++src)
1636 {
eb04c213 1637 Idx *elems = dfa->eclosures[src].elems;
3b0bdc72 1638 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175 1639 {
eb04c213 1640 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
f4efbdfb 1641 if (__glibc_unlikely (! ok))
963d8d78 1642 return REG_ESPACE;
15a7d175 1643 }
3b0bdc72 1644 }
963d8d78
UD
1645
1646 return REG_NOERROR;
3b0bdc72
UD
1647}
1648
1649/* Calculate "eclosure" for all the node in DFA. */
1650
1651static reg_errcode_t
0fd8ae9c 1652calc_eclosure (re_dfa_t *dfa)
3b0bdc72 1653{
eb04c213
AZ
1654 Idx node_idx;
1655 bool incomplete;
2a0356e1 1656 DEBUG_ASSERT (dfa->nodes_len > 0);
eb04c213 1657 incomplete = false;
3b0bdc72 1658 /* For each nodes, calculate epsilon closure. */
485d775d 1659 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1660 {
a9388965 1661 reg_errcode_t err;
3b0bdc72 1662 re_node_set eclosure_elem;
485d775d 1663 if (node_idx == dfa->nodes_len)
15a7d175
UD
1664 {
1665 if (!incomplete)
1666 break;
eb04c213 1667 incomplete = false;
15a7d175
UD
1668 node_idx = 0;
1669 }
3b0bdc72 1670
2a0356e1 1671 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
c06a6956 1672
3b0bdc72
UD
1673 /* If we have already calculated, skip it. */
1674 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1675 continue;
d3821ab0 1676 /* Calculate epsilon closure of 'node_idx'. */
eb04c213 1677 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
f4efbdfb 1678 if (__glibc_unlikely (err != REG_NOERROR))
15a7d175 1679 return err;
3b0bdc72
UD
1680
1681 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175 1682 {
eb04c213 1683 incomplete = true;
15a7d175
UD
1684 re_node_set_free (&eclosure_elem);
1685 }
3b0bdc72 1686 }
3b0bdc72
UD
1687 return REG_NOERROR;
1688}
1689
1690/* Calculate epsilon closure of NODE. */
1691
a9388965 1692static reg_errcode_t
eb04c213 1693calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
3b0bdc72 1694{
a9388965 1695 reg_errcode_t err;
eb04c213 1696 Idx i;
3b0bdc72 1697 re_node_set eclosure;
eb04c213
AZ
1698 bool ok;
1699 bool incomplete = false;
a9388965 1700 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
f4efbdfb 1701 if (__glibc_unlikely (err != REG_NOERROR))
a9388965 1702 return err;
3b0bdc72
UD
1703
1704 /* This indicates that we are calculating this node now.
1705 We reference this value to avoid infinite loop. */
1706 dfa->eclosures[node].nelem = -1;
1707
0caca71a
UD
1708 /* If the current node has constraints, duplicate all nodes
1709 since they must inherit the constraints. */
1710 if (dfa->nodes[node].constraint
b4ae56bd
UD
1711 && dfa->edests[node].nelem
1712 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
485d775d 1713 {
0caca71a
UD
1714 err = duplicate_node_closure (dfa, node, node, node,
1715 dfa->nodes[node].constraint);
f4efbdfb 1716 if (__glibc_unlikely (err != REG_NOERROR))
15a7d175 1717 return err;
485d775d 1718 }
3b0bdc72
UD
1719
1720 /* Expand each epsilon destination nodes. */
485d775d 1721 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1722 for (i = 0; i < dfa->edests[node].nelem; ++i)
1723 {
15a7d175 1724 re_node_set eclosure_elem;
eb04c213
AZ
1725 Idx edest = dfa->edests[node].elems[i];
1726 /* If calculating the epsilon closure of 'edest' is in progress,
15a7d175
UD
1727 return intermediate result. */
1728 if (dfa->eclosures[edest].nelem == -1)
1729 {
eb04c213 1730 incomplete = true;
15a7d175
UD
1731 continue;
1732 }
eb04c213 1733 /* If we haven't calculated the epsilon closure of 'edest' yet,
15a7d175
UD
1734 calculate now. Otherwise use calculated epsilon closure. */
1735 if (dfa->eclosures[edest].nelem == 0)
1736 {
eb04c213 1737 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
f4efbdfb 1738 if (__glibc_unlikely (err != REG_NOERROR))
15a7d175
UD
1739 return err;
1740 }
1741 else
1742 eclosure_elem = dfa->eclosures[edest];
d3821ab0 1743 /* Merge the epsilon closure of 'edest'. */
2da42bc0 1744 err = re_node_set_merge (&eclosure, &eclosure_elem);
f4efbdfb 1745 if (__glibc_unlikely (err != REG_NOERROR))
2da42bc0 1746 return err;
d3821ab0 1747 /* If the epsilon closure of 'edest' is incomplete,
15a7d175
UD
1748 the epsilon closure of this node is also incomplete. */
1749 if (dfa->eclosures[edest].nelem == 0)
1750 {
eb04c213 1751 incomplete = true;
15a7d175
UD
1752 re_node_set_free (&eclosure_elem);
1753 }
3b0bdc72
UD
1754 }
1755
21f5de55 1756 /* An epsilon closure includes itself. */
eb04c213 1757 ok = re_node_set_insert (&eclosure, node);
f4efbdfb 1758 if (__glibc_unlikely (! ok))
21f5de55 1759 return REG_ESPACE;
3b0bdc72
UD
1760 if (incomplete && !root)
1761 dfa->eclosures[node].nelem = 0;
1762 else
1763 dfa->eclosures[node] = eclosure;
a9388965
UD
1764 *new_set = eclosure;
1765 return REG_NOERROR;
3b0bdc72
UD
1766}
1767\f
1768/* Functions for token which are used in the parser. */
1769
1770/* Fetch a token from INPUT.
1771 We must not use this function inside bracket expressions. */
1772
f0d77aa8 1773static void
0fd8ae9c 1774fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
3b0bdc72 1775{
f0d77aa8 1776 re_string_skip_bytes (input, peek_token (result, input, syntax));
3b0bdc72
UD
1777}
1778
1779/* Peek a token from INPUT, and return the length of the token.
1780 We must not use this function inside bracket expressions. */
1781
1782static int
0fd8ae9c 1783peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
1784{
1785 unsigned char c;
1786
1787 if (re_string_eoi (input))
1788 {
1789 token->type = END_OF_RE;
1790 return 0;
1791 }
1792
1793 c = re_string_peek_byte (input, 0);
1794 token->opr.c = c;
1795
65e6becf 1796 token->word_char = 0;
3b0bdc72
UD
1797#ifdef RE_ENABLE_I18N
1798 token->mb_partial = 0;
34a5a146
JM
1799 if (input->mb_cur_max > 1
1800 && !re_string_first_byte (input, re_string_cur_idx (input)))
3b0bdc72
UD
1801 {
1802 token->type = CHARACTER;
1803 token->mb_partial = 1;
1804 return 1;
1805 }
1806#endif
1807 if (c == '\\')
1808 {
1809 unsigned char c2;
1810 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1811 {
1812 token->type = BACK_SLASH;
1813 return 1;
1814 }
3b0bdc72
UD
1815
1816 c2 = re_string_peek_byte_case (input, 1);
1817 token->opr.c = c2;
1818 token->type = CHARACTER;
65e6becf
UD
1819#ifdef RE_ENABLE_I18N
1820 if (input->mb_cur_max > 1)
1821 {
1822 wint_t wc = re_string_wchar_at (input,
1823 re_string_cur_idx (input) + 1);
1824 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1825 }
1826 else
1827#endif
1828 token->word_char = IS_WORD_CHAR (c2) != 0;
1829
3b0bdc72 1830 switch (c2)
15a7d175
UD
1831 {
1832 case '|':
1833 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1834 token->type = OP_ALT;
1835 break;
1836 case '1': case '2': case '3': case '4': case '5':
1837 case '6': case '7': case '8': case '9':
1838 if (!(syntax & RE_NO_BK_REFS))
1839 {
1840 token->type = OP_BACK_REF;
ae73c6c1 1841 token->opr.idx = c2 - '1';
15a7d175
UD
1842 }
1843 break;
1844 case '<':
1845 if (!(syntax & RE_NO_GNU_OPS))
1846 {
1847 token->type = ANCHOR;
bb3f4825 1848 token->opr.ctx_type = WORD_FIRST;
15a7d175
UD
1849 }
1850 break;
1851 case '>':
1852 if (!(syntax & RE_NO_GNU_OPS))
1853 {
1854 token->type = ANCHOR;
bb3f4825 1855 token->opr.ctx_type = WORD_LAST;
15a7d175
UD
1856 }
1857 break;
1858 case 'b':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 {
1861 token->type = ANCHOR;
bb3f4825 1862 token->opr.ctx_type = WORD_DELIM;
15a7d175
UD
1863 }
1864 break;
1865 case 'B':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 {
1868 token->type = ANCHOR;
24992143 1869 token->opr.ctx_type = NOT_WORD_DELIM;
15a7d175
UD
1870 }
1871 break;
1872 case 'w':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_WORD;
1875 break;
1876 case 'W':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_NOTWORD;
1879 break;
e2b6bfa3
UD
1880 case 's':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = OP_SPACE;
1883 break;
1884 case 'S':
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = OP_NOTSPACE;
1887 break;
15a7d175
UD
1888 case '`':
1889 if (!(syntax & RE_NO_GNU_OPS))
1890 {
1891 token->type = ANCHOR;
bb3f4825 1892 token->opr.ctx_type = BUF_FIRST;
15a7d175
UD
1893 }
1894 break;
1895 case '\'':
1896 if (!(syntax & RE_NO_GNU_OPS))
1897 {
1898 token->type = ANCHOR;
bb3f4825 1899 token->opr.ctx_type = BUF_LAST;
15a7d175
UD
1900 }
1901 break;
1902 case '(':
1903 if (!(syntax & RE_NO_BK_PARENS))
1904 token->type = OP_OPEN_SUBEXP;
1905 break;
1906 case ')':
1907 if (!(syntax & RE_NO_BK_PARENS))
1908 token->type = OP_CLOSE_SUBEXP;
1909 break;
1910 case '+':
1911 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_PLUS;
1913 break;
1914 case '?':
1915 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916 token->type = OP_DUP_QUESTION;
1917 break;
1918 case '{':
1919 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920 token->type = OP_OPEN_DUP_NUM;
1921 break;
1922 case '}':
1923 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924 token->type = OP_CLOSE_DUP_NUM;
1925 break;
1926 default:
1927 break;
1928 }
3b0bdc72
UD
1929 return 2;
1930 }
1931
1932 token->type = CHARACTER;
65e6becf
UD
1933#ifdef RE_ENABLE_I18N
1934 if (input->mb_cur_max > 1)
1935 {
1936 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1937 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1938 }
1939 else
1940#endif
1941 token->word_char = IS_WORD_CHAR (token->opr.c);
1942
3b0bdc72
UD
1943 switch (c)
1944 {
1945 case '\n':
1946 if (syntax & RE_NEWLINE_ALT)
15a7d175 1947 token->type = OP_ALT;
3b0bdc72
UD
1948 break;
1949 case '|':
1950 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1951 token->type = OP_ALT;
3b0bdc72
UD
1952 break;
1953 case '*':
1954 token->type = OP_DUP_ASTERISK;
1955 break;
1956 case '+':
1957 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1958 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1959 break;
1960 case '?':
1961 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1962 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1963 break;
1964 case '{':
1965 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1966 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1967 break;
1968 case '}':
1969 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1970 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1971 break;
1972 case '(':
1973 if (syntax & RE_NO_BK_PARENS)
15a7d175 1974 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1975 break;
1976 case ')':
1977 if (syntax & RE_NO_BK_PARENS)
15a7d175 1978 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1979 break;
1980 case '[':
1981 token->type = OP_OPEN_BRACKET;
1982 break;
1983 case '.':
1984 token->type = OP_PERIOD;
1985 break;
1986 case '^':
34a5a146
JM
1987 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1988 && re_string_cur_idx (input) != 0)
15a7d175
UD
1989 {
1990 char prev = re_string_peek_byte (input, -1);
134abcb5 1991 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1992 break;
1993 }
3b0bdc72 1994 token->type = ANCHOR;
bb3f4825 1995 token->opr.ctx_type = LINE_FIRST;
3b0bdc72
UD
1996 break;
1997 case '$':
34a5a146
JM
1998 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1999 && re_string_cur_idx (input) + 1 != re_string_length (input))
15a7d175
UD
2000 {
2001 re_token_t next;
2002 re_string_skip_bytes (input, 1);
2003 peek_token (&next, input, syntax);
2004 re_string_skip_bytes (input, -1);
2005 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2006 break;
2007 }
3b0bdc72 2008 token->type = ANCHOR;
bb3f4825 2009 token->opr.ctx_type = LINE_LAST;
3b0bdc72
UD
2010 break;
2011 default:
2012 break;
2013 }
2014 return 1;
2015}
2016
2017/* Peek a token from INPUT, and return the length of the token.
2018 We must not use this function out of bracket expressions. */
2019
2020static int
0fd8ae9c 2021peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
2022{
2023 unsigned char c;
2024 if (re_string_eoi (input))
2025 {
2026 token->type = END_OF_RE;
2027 return 0;
2028 }
2029 c = re_string_peek_byte (input, 0);
2030 token->opr.c = c;
2031
2032#ifdef RE_ENABLE_I18N
34a5a146
JM
2033 if (input->mb_cur_max > 1
2034 && !re_string_first_byte (input, re_string_cur_idx (input)))
3b0bdc72
UD
2035 {
2036 token->type = CHARACTER;
2037 return 1;
2038 }
2039#endif /* RE_ENABLE_I18N */
2040
294b6bcc
UD
2041 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2042 && re_string_cur_idx (input) + 1 < re_string_length (input))
3b0bdc72
UD
2043 {
2044 /* In this case, '\' escape a character. */
2045 unsigned char c2;
82bbb29e
UD
2046 re_string_skip_bytes (input, 1);
2047 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
2048 token->opr.c = c2;
2049 token->type = CHARACTER;
2050 return 1;
2051 }
2052 if (c == '[') /* '[' is a special char in a bracket exps. */
2053 {
2054 unsigned char c2;
2055 int token_len;
294b6bcc
UD
2056 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2057 c2 = re_string_peek_byte (input, 1);
2058 else
2059 c2 = 0;
3b0bdc72
UD
2060 token->opr.c = c2;
2061 token_len = 2;
2062 switch (c2)
15a7d175
UD
2063 {
2064 case '.':
2065 token->type = OP_OPEN_COLL_ELEM;
2066 break;
eb04c213 2067
15a7d175
UD
2068 case '=':
2069 token->type = OP_OPEN_EQUIV_CLASS;
2070 break;
eb04c213 2071
15a7d175
UD
2072 case ':':
2073 if (syntax & RE_CHAR_CLASSES)
2074 {
2075 token->type = OP_OPEN_CHAR_CLASS;
2076 break;
2077 }
eb04c213 2078 FALLTHROUGH;
15a7d175
UD
2079 default:
2080 token->type = CHARACTER;
2081 token->opr.c = c;
2082 token_len = 1;
2083 break;
2084 }
3b0bdc72
UD
2085 return token_len;
2086 }
2087 switch (c)
2088 {
2089 case '-':
2090 token->type = OP_CHARSET_RANGE;
2091 break;
2092 case ']':
2093 token->type = OP_CLOSE_BRACKET;
2094 break;
2095 case '^':
2096 token->type = OP_NON_MATCH_LIST;
2097 break;
2098 default:
2099 token->type = CHARACTER;
2100 }
2101 return 1;
2102}
2103\f
2104/* Functions for parser. */
2105
2106/* Entry point of the parser.
2107 Parse the regular expression REGEXP and return the structure tree.
d3821ab0 2108 If an error occurs, ERR is set by error code, and return NULL.
3b0bdc72 2109 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
2110 CAT
2111 / \
2112 / \
3b0bdc72
UD
2113 <reg_exp> EOR
2114
2115 CAT means concatenation.
2116 EOR means end of regular expression. */
2117
2118static bin_tree_t *
0fd8ae9c
UD
2119parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2120 reg_errcode_t *err)
3b0bdc72 2121{
eb04c213 2122 re_dfa_t *dfa = preg->buffer;
3b0bdc72
UD
2123 bin_tree_t *tree, *eor, *root;
2124 re_token_t current_token;
56b168be 2125 dfa->syntax = syntax;
f0d77aa8 2126 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2127 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
f4efbdfb 2128 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
3b0bdc72 2129 return NULL;
02f3550c 2130 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
3b0bdc72 2131 if (tree != NULL)
02f3550c 2132 root = create_tree (dfa, tree, eor, CONCAT);
3b0bdc72
UD
2133 else
2134 root = eor;
f4efbdfb 2135 if (__glibc_unlikely (eor == NULL || root == NULL))
485d775d
UD
2136 {
2137 *err = REG_ESPACE;
2138 return NULL;
2139 }
3b0bdc72
UD
2140 return root;
2141}
2142
2143/* This function build the following tree, from regular expression
2144 <branch1>|<branch2>:
15a7d175
UD
2145 ALT
2146 / \
2147 / \
3b0bdc72
UD
2148 <branch1> <branch2>
2149
d3821ab0 2150 ALT means alternative, which represents the operator '|'. */
3b0bdc72
UD
2151
2152static bin_tree_t *
0fd8ae9c 2153parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2154 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2155{
eb04c213 2156 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2157 bin_tree_t *tree, *branch = NULL;
eb04c213 2158 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
3b0bdc72 2159 tree = parse_branch (regexp, preg, token, syntax, nest, err);
f4efbdfb 2160 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
3b0bdc72
UD
2161 return NULL;
2162
2163 while (token->type == OP_ALT)
2164 {
f0d77aa8 2165 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2166 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
2167 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2168 {
eb04c213
AZ
2169 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2170 dfa->completed_bkref_map = initial_bkref_map;
15a7d175 2171 branch = parse_branch (regexp, preg, token, syntax, nest, err);
f4efbdfb 2172 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
aa6ec754
AS
2173 {
2174 if (tree != NULL)
2175 postorder (tree, free_tree, NULL);
2176 return NULL;
2177 }
eb04c213 2178 dfa->completed_bkref_map |= accumulated_bkref_map;
15a7d175 2179 }
9b88fc16
UD
2180 else
2181 branch = NULL;
02f3550c 2182 tree = create_tree (dfa, tree, branch, OP_ALT);
f4efbdfb 2183 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2184 {
2185 *err = REG_ESPACE;
2186 return NULL;
2187 }
3b0bdc72
UD
2188 }
2189 return tree;
2190}
2191
2192/* This function build the following tree, from regular expression
2193 <exp1><exp2>:
15a7d175
UD
2194 CAT
2195 / \
3b0bdc72
UD
2196 / \
2197 <exp1> <exp2>
2198
2199 CAT means concatenation. */
2200
2201static bin_tree_t *
0fd8ae9c 2202parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2203 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2204{
eb04c213
AZ
2205 bin_tree_t *tree, *expr;
2206 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2207 tree = parse_expression (regexp, preg, token, syntax, nest, err);
f4efbdfb 2208 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
3b0bdc72
UD
2209 return NULL;
2210
2211 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 2212 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72 2213 {
eb04c213 2214 expr = parse_expression (regexp, preg, token, syntax, nest, err);
f4efbdfb 2215 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
15a7d175 2216 {
b833d51f
UD
2217 if (tree != NULL)
2218 postorder (tree, free_tree, NULL);
15a7d175
UD
2219 return NULL;
2220 }
eb04c213 2221 if (tree != NULL && expr != NULL)
15a7d175 2222 {
eb04c213 2223 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
e9b9cbf5 2224 if (newtree == NULL)
15a7d175 2225 {
eb04c213 2226 postorder (expr, free_tree, NULL);
e9b9cbf5 2227 postorder (tree, free_tree, NULL);
15a7d175
UD
2228 *err = REG_ESPACE;
2229 return NULL;
2230 }
e9b9cbf5 2231 tree = newtree;
15a7d175 2232 }
3b0bdc72 2233 else if (tree == NULL)
eb04c213
AZ
2234 tree = expr;
2235 /* Otherwise expr == NULL, we don't need to create new tree. */
3b0bdc72
UD
2236 }
2237 return tree;
2238}
2239
2240/* This function build the following tree, from regular expression a*:
15a7d175
UD
2241 *
2242 |
2243 a
3b0bdc72
UD
2244*/
2245
2246static bin_tree_t *
0fd8ae9c 2247parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2248 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2249{
eb04c213 2250 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2251 bin_tree_t *tree;
3b0bdc72
UD
2252 switch (token->type)
2253 {
2254 case CHARACTER:
02f3550c 2255 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2256 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2257 {
2258 *err = REG_ESPACE;
2259 return NULL;
2260 }
3b0bdc72 2261#ifdef RE_ENABLE_I18N
3c0fb574 2262 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2263 {
2264 while (!re_string_eoi (regexp)
2265 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2266 {
15a7d175 2267 bin_tree_t *mbc_remain;
f0d77aa8 2268 fetch_token (token, regexp, syntax);
02f3550c
UD
2269 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2270 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
f4efbdfb 2271 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
54e1cabc
UD
2272 {
2273 *err = REG_ESPACE;
2274 return NULL;
2275 }
15a7d175 2276 }
3b0bdc72
UD
2277 }
2278#endif
2279 break;
eb04c213 2280
3b0bdc72
UD
2281 case OP_OPEN_SUBEXP:
2282 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
f4efbdfb 2283 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
15a7d175 2284 return NULL;
3b0bdc72 2285 break;
eb04c213 2286
3b0bdc72
UD
2287 case OP_OPEN_BRACKET:
2288 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
f4efbdfb 2289 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
15a7d175 2290 return NULL;
3b0bdc72 2291 break;
eb04c213 2292
3b0bdc72 2293 case OP_BACK_REF:
f4efbdfb 2294 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
15a7d175
UD
2295 {
2296 *err = REG_ESUBREG;
2297 return NULL;
2298 }
ae73c6c1 2299 dfa->used_bkref_map |= 1 << token->opr.idx;
02f3550c 2300 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2301 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2302 {
2303 *err = REG_ESPACE;
2304 return NULL;
2305 }
3b0bdc72
UD
2306 ++dfa->nbackref;
2307 dfa->has_mb_node = 1;
2308 break;
eb04c213 2309
06e8303a
UD
2310 case OP_OPEN_DUP_NUM:
2311 if (syntax & RE_CONTEXT_INVALID_DUP)
2312 {
2313 *err = REG_BADRPT;
2314 return NULL;
2315 }
eb04c213 2316 FALLTHROUGH;
3b0bdc72
UD
2317 case OP_DUP_ASTERISK:
2318 case OP_DUP_PLUS:
2319 case OP_DUP_QUESTION:
3b0bdc72 2320 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2321 {
2322 *err = REG_BADRPT;
2323 return NULL;
2324 }
3b0bdc72 2325 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175 2326 {
f0d77aa8 2327 fetch_token (token, regexp, syntax);
15a7d175
UD
2328 return parse_expression (regexp, preg, token, syntax, nest, err);
2329 }
eb04c213 2330 FALLTHROUGH;
3b0bdc72 2331 case OP_CLOSE_SUBEXP:
34a5a146
JM
2332 if ((token->type == OP_CLOSE_SUBEXP)
2333 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
15a7d175
UD
2334 {
2335 *err = REG_ERPAREN;
2336 return NULL;
2337 }
eb04c213 2338 FALLTHROUGH;
3b0bdc72
UD
2339 case OP_CLOSE_DUP_NUM:
2340 /* We treat it as a normal character. */
2341
2342 /* Then we can these characters as normal characters. */
2343 token->type = CHARACTER;
65e6becf
UD
2344 /* mb_partial and word_char bits should be initialized already
2345 by peek_token. */
02f3550c 2346 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2347 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2348 {
2349 *err = REG_ESPACE;
2350 return NULL;
2351 }
3b0bdc72 2352 break;
eb04c213 2353
3b0bdc72 2354 case ANCHOR:
bb3f4825 2355 if ((token->opr.ctx_type
24992143 2356 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
56b168be
UD
2357 && dfa->word_ops_used == 0)
2358 init_word_char (dfa);
24992143 2359 if (token->opr.ctx_type == WORD_DELIM
21f5de55 2360 || token->opr.ctx_type == NOT_WORD_DELIM)
15a7d175
UD
2361 {
2362 bin_tree_t *tree_first, *tree_last;
24992143
UD
2363 if (token->opr.ctx_type == WORD_DELIM)
2364 {
2365 token->opr.ctx_type = WORD_FIRST;
02f3550c 2366 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2367 token->opr.ctx_type = WORD_LAST;
21f5de55
PE
2368 }
2369 else
2370 {
24992143 2371 token->opr.ctx_type = INSIDE_WORD;
02f3550c 2372 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2373 token->opr.ctx_type = INSIDE_NOTWORD;
21f5de55 2374 }
02f3550c
UD
2375 tree_last = create_token_tree (dfa, NULL, NULL, token);
2376 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
f4efbdfb
PE
2377 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2378 || tree == NULL))
15a7d175
UD
2379 {
2380 *err = REG_ESPACE;
2381 return NULL;
2382 }
2383 }
3b0bdc72 2384 else
15a7d175 2385 {
02f3550c 2386 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2387 if (__glibc_unlikely (tree == NULL))
54e1cabc
UD
2388 {
2389 *err = REG_ESPACE;
2390 return NULL;
2391 }
15a7d175 2392 }
3b0bdc72 2393 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2394 by repetition operators.
2395 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2396 it must not be "<ANCHOR(^)><REPEAT(*)>". */
f0d77aa8 2397 fetch_token (token, regexp, syntax);
3b0bdc72 2398 return tree;
eb04c213 2399
3b0bdc72 2400 case OP_PERIOD:
02f3550c 2401 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2402 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2403 {
2404 *err = REG_ESPACE;
2405 return NULL;
2406 }
3c0fb574 2407 if (dfa->mb_cur_max > 1)
15a7d175 2408 dfa->has_mb_node = 1;
3b0bdc72 2409 break;
eb04c213 2410
3b0bdc72 2411 case OP_WORD:
3b0bdc72 2412 case OP_NOTWORD:
266c1f50 2413 tree = build_charclass_op (dfa, regexp->trans,
eb04c213
AZ
2414 "alnum",
2415 "_",
266c1f50 2416 token->type == OP_NOTWORD, err);
f4efbdfb 2417 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
e2b6bfa3
UD
2418 return NULL;
2419 break;
eb04c213 2420
e2b6bfa3 2421 case OP_SPACE:
e2b6bfa3 2422 case OP_NOTSPACE:
266c1f50 2423 tree = build_charclass_op (dfa, regexp->trans,
eb04c213
AZ
2424 "space",
2425 "",
266c1f50 2426 token->type == OP_NOTSPACE, err);
f4efbdfb 2427 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
15a7d175 2428 return NULL;
3b0bdc72 2429 break;
eb04c213 2430
3b0bdc72
UD
2431 case OP_ALT:
2432 case END_OF_RE:
2433 return NULL;
eb04c213 2434
3b0bdc72
UD
2435 case BACK_SLASH:
2436 *err = REG_EESCAPE;
2437 return NULL;
eb04c213 2438
3b0bdc72
UD
2439 default:
2440 /* Must not happen? */
2a0356e1 2441 DEBUG_ASSERT (false);
3b0bdc72
UD
2442 return NULL;
2443 }
f0d77aa8 2444 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2445
2446 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2447 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72 2448 {
eb04c213
AZ
2449 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2450 syntax, err);
f4efbdfb 2451 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
4d43ef1e
AS
2452 {
2453 if (tree != NULL)
2454 postorder (tree, free_tree, NULL);
2455 return NULL;
2456 }
2457 tree = dup_tree;
c34bfc8d
UD
2458 /* In BRE consecutive duplications are not allowed. */
2459 if ((syntax & RE_CONTEXT_INVALID_DUP)
2460 && (token->type == OP_DUP_ASTERISK
2461 || token->type == OP_OPEN_DUP_NUM))
2462 {
4d43ef1e
AS
2463 if (tree != NULL)
2464 postorder (tree, free_tree, NULL);
c34bfc8d
UD
2465 *err = REG_BADRPT;
2466 return NULL;
2467 }
3b0bdc72
UD
2468 }
2469
2470 return tree;
2471}
2472
2473/* This function build the following tree, from regular expression
2474 (<reg_exp>):
15a7d175
UD
2475 SUBEXP
2476 |
2477 <reg_exp>
3b0bdc72
UD
2478*/
2479
2480static bin_tree_t *
0fd8ae9c 2481parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2482 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2483{
eb04c213 2484 re_dfa_t *dfa = preg->buffer;
02f3550c 2485 bin_tree_t *tree;
3b0bdc72
UD
2486 size_t cur_nsub;
2487 cur_nsub = preg->re_nsub++;
81c64d40 2488
f0d77aa8 2489 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2490
2491 /* The subexpression may be a null string. */
2492 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2493 tree = NULL;
3b0bdc72
UD
2494 else
2495 {
2496 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
f4efbdfb
PE
2497 if (__glibc_unlikely (*err == REG_NOERROR
2498 && token->type != OP_CLOSE_SUBEXP))
a129c80d
UD
2499 {
2500 if (tree != NULL)
e9b9cbf5 2501 postorder (tree, free_tree, NULL);
a129c80d
UD
2502 *err = REG_EPAREN;
2503 }
f4efbdfb 2504 if (__glibc_unlikely (*err != REG_NOERROR))
15a7d175 2505 return NULL;
3b0bdc72 2506 }
01ed6ceb
UD
2507
2508 if (cur_nsub <= '9' - '1')
2509 dfa->completed_bkref_map |= 1 << cur_nsub;
02f3550c
UD
2510
2511 tree = create_tree (dfa, tree, NULL, SUBEXP);
f4efbdfb 2512 if (__glibc_unlikely (tree == NULL))
485d775d
UD
2513 {
2514 *err = REG_ESPACE;
2515 return NULL;
2516 }
02f3550c 2517 tree->token.opr.idx = cur_nsub;
3b0bdc72
UD
2518 return tree;
2519}
2520
2521/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2522
2523static bin_tree_t *
0fd8ae9c
UD
2524parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2525 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72 2526{
20dc2f79 2527 bin_tree_t *tree = NULL, *old_tree = NULL;
eb04c213 2528 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
3b0bdc72 2529 re_token_t start_token = *token;
6b6557e8 2530
3b0bdc72
UD
2531 if (token->type == OP_OPEN_DUP_NUM)
2532 {
6b6557e8
UD
2533 end = 0;
2534 start = fetch_number (regexp, token, syntax);
3b0bdc72 2535 if (start == -1)
15a7d175
UD
2536 {
2537 if (token->type == CHARACTER && token->opr.c == ',')
2538 start = 0; /* We treat "{,m}" as "{0,m}". */
2539 else
2540 {
2541 *err = REG_BADBR; /* <re>{} is invalid. */
2542 return NULL;
2543 }
2544 }
f4efbdfb 2545 if (__glibc_likely (start != -2))
15a7d175
UD
2546 {
2547 /* We treat "{n}" as "{n,n}". */
2548 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2549 : ((token->type == CHARACTER && token->opr.c == ',')
2550 ? fetch_number (regexp, token, syntax) : -2));
2551 }
f4efbdfb 2552 if (__glibc_unlikely (start == -2 || end == -2))
15a7d175
UD
2553 {
2554 /* Invalid sequence. */
f4efbdfb 2555 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
6b6557e8
UD
2556 {
2557 if (token->type == END_OF_RE)
2558 *err = REG_EBRACE;
2559 else
2560 *err = REG_BADBR;
2561
2562 return NULL;
2563 }
2564
2565 /* If the syntax bit is set, rollback. */
2566 re_string_set_index (regexp, start_idx);
2567 *token = start_token;
2568 token->type = CHARACTER;
2569 /* mb_partial and word_char bits should be already initialized by
2570 peek_token. */
2571 return elem;
15a7d175 2572 }
6b6557e8 2573
f4efbdfb
PE
2574 if (__glibc_unlikely ((end != -1 && start > end)
2575 || token->type != OP_CLOSE_DUP_NUM))
15a7d175 2576 {
6b6557e8
UD
2577 /* First number greater than second. */
2578 *err = REG_BADBR;
15a7d175
UD
2579 return NULL;
2580 }
eb04c213 2581
f4efbdfb 2582 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
eb04c213
AZ
2583 {
2584 *err = REG_ESIZE;
2585 return NULL;
2586 }
6b6557e8
UD
2587 }
2588 else
2589 {
2590 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2592 }
2593
20dc2f79
UD
2594 fetch_token (token, regexp, syntax);
2595
f4efbdfb 2596 if (__glibc_unlikely (elem == NULL))
20dc2f79 2597 return NULL;
f4efbdfb 2598 if (__glibc_unlikely (start == 0 && end == 0))
02f3550c
UD
2599 {
2600 postorder (elem, free_tree, NULL);
2601 return NULL;
2602 }
602c2f9d 2603
6b6557e8 2604 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
f4efbdfb 2605 if (__glibc_unlikely (start > 0))
6b6557e8
UD
2606 {
2607 tree = elem;
2608 for (i = 2; i <= start; ++i)
7de66108 2609 {
6b6557e8 2610 elem = duplicate_tree (elem, dfa);
02f3550c 2611 tree = create_tree (dfa, tree, elem, CONCAT);
f4efbdfb 2612 if (__glibc_unlikely (elem == NULL || tree == NULL))
7de66108
UD
2613 goto parse_dup_op_espace;
2614 }
6b6557e8 2615
20dc2f79
UD
2616 if (start == end)
2617 return tree;
a96c63ed 2618
20dc2f79
UD
2619 /* Duplicate ELEM before it is marked optional. */
2620 elem = duplicate_tree (elem, dfa);
f4efbdfb 2621 if (__glibc_unlikely (elem == NULL))
7ee03f00 2622 goto parse_dup_op_espace;
20dc2f79
UD
2623 old_tree = tree;
2624 }
2625 else
2626 old_tree = NULL;
2627
02f3550c 2628 if (elem->token.type == SUBEXP)
eb04c213
AZ
2629 {
2630 uintptr_t subidx = elem->token.opr.idx;
2631 postorder (elem, mark_opt_subexp, (void *) subidx);
2632 }
02f3550c 2633
eb04c213
AZ
2634 tree = create_tree (dfa, elem, NULL,
2635 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
f4efbdfb 2636 if (__glibc_unlikely (tree == NULL))
20dc2f79
UD
2637 goto parse_dup_op_espace;
2638
2639 /* This loop is actually executed only when end != -1,
2640 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2641 already created the start+1-th copy. */
eb04c213
AZ
2642 if (TYPE_SIGNED (Idx) || end != -1)
2643 for (i = start + 2; i <= end; ++i)
2644 {
2645 elem = duplicate_tree (elem, dfa);
2646 tree = create_tree (dfa, tree, elem, CONCAT);
f4efbdfb 2647 if (__glibc_unlikely (elem == NULL || tree == NULL))
eb04c213
AZ
2648 goto parse_dup_op_espace;
2649
2650 tree = create_tree (dfa, tree, NULL, OP_ALT);
f4efbdfb 2651 if (__glibc_unlikely (tree == NULL))
eb04c213
AZ
2652 goto parse_dup_op_espace;
2653 }
6b6557e8 2654
20dc2f79 2655 if (old_tree)
02f3550c 2656 tree = create_tree (dfa, old_tree, tree, CONCAT);
20dc2f79 2657
3b0bdc72
UD
2658 return tree;
2659
2660 parse_dup_op_espace:
3b0bdc72
UD
2661 *err = REG_ESPACE;
2662 return NULL;
3b0bdc72
UD
2663}
2664
2665/* Size of the names for collating symbol/equivalence_class/character_class.
2666 I'm not sure, but maybe enough. */
2667#define BRACKET_NAME_BUF_SIZE 32
2668
434d3784 2669#ifndef _LIBC
eb04c213
AZ
2670
2671# ifdef RE_ENABLE_I18N
2672/* Convert the byte B to the corresponding wide character. In a
c77bf91b
PE
2673 unibyte locale, treat B as itself. In a multibyte locale, return
2674 WEOF if B is an encoding error. */
eb04c213
AZ
2675static wint_t
2676parse_byte (unsigned char b, re_charset_t *mbcset)
2677{
c77bf91b 2678 return mbcset == NULL ? b : __btowc (b);
eb04c213 2679}
c77bf91b 2680# endif
eb04c213 2681
434d3784
UD
2682 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2683 Build the range expression which starts from START_ELEM, and ends
2684 at END_ELEM. The result are written to MBCSET and SBCSET.
2685 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
d3821ab0 2686 mbcset->range_ends, is a pointer argument since we may
434d3784
UD
2687 update it. */
2688
2689static reg_errcode_t
c0a0f9a3 2690# ifdef RE_ENABLE_I18N
eb04c213
AZ
2691build_range_exp (const reg_syntax_t syntax,
2692 bitset_t sbcset,
2693 re_charset_t *mbcset,
2694 Idx *range_alloc,
2695 const bracket_elem_t *start_elem,
2696 const bracket_elem_t *end_elem)
c0a0f9a3 2697# else /* not RE_ENABLE_I18N */
eb04c213
AZ
2698build_range_exp (const reg_syntax_t syntax,
2699 bitset_t sbcset,
2700 const bracket_elem_t *start_elem,
2701 const bracket_elem_t *end_elem)
c0a0f9a3 2702# endif /* not RE_ENABLE_I18N */
434d3784
UD
2703{
2704 unsigned int start_ch, end_ch;
2705 /* Equivalence Classes and Character Classes can't be a range start/end. */
f4efbdfb
PE
2706 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2707 || start_elem->type == CHAR_CLASS
2708 || end_elem->type == EQUIV_CLASS
2709 || end_elem->type == CHAR_CLASS))
434d3784
UD
2710 return REG_ERANGE;
2711
2712 /* We can handle no multi character collating elements without libc
2713 support. */
f4efbdfb
PE
2714 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2715 && strlen ((char *) start_elem->opr.name) > 1)
2716 || (end_elem->type == COLL_SYM
2717 && strlen ((char *) end_elem->opr.name) > 1)))
434d3784
UD
2718 return REG_ECOLLATE;
2719
2720# ifdef RE_ENABLE_I18N
2721 {
2d87db5b
UD
2722 wchar_t wc;
2723 wint_t start_wc;
2724 wint_t end_wc;
434d3784
UD
2725
2726 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2727 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2728 : 0));
434d3784 2729 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2730 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2731 : 0));
434d3784 2732 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
eb04c213 2733 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
434d3784 2734 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
eb04c213 2735 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
4bb333cd
UD
2736 if (start_wc == WEOF || end_wc == WEOF)
2737 return REG_ECOLLATE;
f4efbdfb
PE
2738 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2739 && start_wc > end_wc))
434d3784
UD
2740 return REG_ERANGE;
2741
10677727
UD
2742 /* Got valid collation sequence values, add them as a new entry.
2743 However, for !_LIBC we have no collation elements: if the
2744 character set is single byte, the single byte character set
2745 that we build below suffices. parse_bracket_exp passes
2746 no MBCSET if dfa->mb_cur_max == 1. */
2747 if (mbcset)
434d3784 2748 {
21f5de55 2749 /* Check the space of the arrays. */
f4efbdfb 2750 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
21f5de55 2751 {
10677727
UD
2752 /* There is not enough space, need realloc. */
2753 wchar_t *new_array_start, *new_array_end;
eb04c213 2754 Idx new_nranges;
10677727
UD
2755
2756 /* +1 in case of mbcset->nranges is 0. */
2757 new_nranges = 2 * mbcset->nranges + 1;
2758 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2759 are NULL if *range_alloc == 0. */
2760 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
21f5de55 2761 new_nranges);
10677727 2762 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
21f5de55 2763 new_nranges);
10677727 2764
f4efbdfb
PE
2765 if (__glibc_unlikely (new_array_start == NULL
2766 || new_array_end == NULL))
eb04c213
AZ
2767 {
2768 re_free (new_array_start);
2769 re_free (new_array_end);
2770 return REG_ESPACE;
2771 }
10677727
UD
2772
2773 mbcset->range_starts = new_array_start;
2774 mbcset->range_ends = new_array_end;
2775 *range_alloc = new_nranges;
21f5de55 2776 }
10677727 2777
21f5de55
PE
2778 mbcset->range_starts[mbcset->nranges] = start_wc;
2779 mbcset->range_ends[mbcset->nranges++] = end_wc;
434d3784
UD
2780 }
2781
434d3784 2782 /* Build the table for single byte characters. */
1e7947dc 2783 for (wc = 0; wc < SBC_MAX; ++wc)
434d3784 2784 {
eb04c213 2785 if (start_wc <= wc && wc <= end_wc)
15a7d175 2786 bitset_set (sbcset, wc);
434d3784
UD
2787 }
2788 }
2789# else /* not RE_ENABLE_I18N */
2790 {
2791 unsigned int ch;
2792 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2793 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2794 : 0));
434d3784 2795 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2796 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2797 : 0));
434d3784
UD
2798 if (start_ch > end_ch)
2799 return REG_ERANGE;
2800 /* Build the table for single byte characters. */
1e7947dc 2801 for (ch = 0; ch < SBC_MAX; ++ch)
434d3784 2802 if (start_ch <= ch && ch <= end_ch)
15a7d175 2803 bitset_set (sbcset, ch);
434d3784
UD
2804 }
2805# endif /* not RE_ENABLE_I18N */
2806 return REG_NOERROR;
2807}
2808#endif /* not _LIBC */
2809
2810#ifndef _LIBC
2811/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2812 Build the collating element which is represented by NAME.
2813 The result are written to MBCSET and SBCSET.
2814 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2815 pointer argument since we may update it. */
2816
2817static reg_errcode_t
c0a0f9a3 2818# ifdef RE_ENABLE_I18N
0fd8ae9c 2819build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
eb04c213 2820 Idx *coll_sym_alloc, const unsigned char *name)
c0a0f9a3 2821# else /* not RE_ENABLE_I18N */
0fd8ae9c 2822build_collating_symbol (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 2823# endif /* not RE_ENABLE_I18N */
434d3784 2824{
62439eac 2825 size_t name_len = strlen ((const char *) name);
f4efbdfb 2826 if (__glibc_unlikely (name_len != 1))
434d3784
UD
2827 return REG_ECOLLATE;
2828 else
2829 {
62439eac 2830 bitset_set (sbcset, name[0]);
434d3784
UD
2831 return REG_NOERROR;
2832 }
2833}
2834#endif /* not _LIBC */
2835
3b0bdc72
UD
2836/* This function parse bracket expression like "[abc]", "[a-c]",
2837 "[[.a-a.]]" etc. */
2838
2839static bin_tree_t *
0fd8ae9c
UD
2840parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2841 reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72
UD
2842{
2843#ifdef _LIBC
62439eac
UD
2844 const unsigned char *collseqmb;
2845 const char *collseqwc;
3b0bdc72
UD
2846 uint32_t nrules;
2847 int32_t table_size;
2848 const int32_t *symb_table;
2849 const unsigned char *extra;
2850
d3821ab0
RM
2851 /* Local function for parse_bracket_exp used in _LIBC environment.
2852 Seek the collating symbol entry corresponding to NAME.
7e2f0d2d
AS
2853 Return the index of the symbol in the SYMB_TABLE,
2854 or -1 if not found. */
3b0bdc72 2855
7090d3ca 2856 auto inline int32_t
d3821ab0 2857 __attribute__ ((always_inline))
7e2f0d2d 2858 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
3b0bdc72 2859 {
7e2f0d2d 2860 int32_t elem;
a334319f 2861
7e2f0d2d
AS
2862 for (elem = 0; elem < table_size; elem++)
2863 if (symb_table[2 * elem] != 0)
2864 {
2865 int32_t idx = symb_table[2 * elem + 1];
2866 /* Skip the name of collating element name. */
2867 idx += 1 + extra[idx];
2868 if (/* Compare the length of the name. */
2869 name_len == extra[idx]
2870 /* Compare the name. */
2871 && memcmp (name, &extra[idx + 1], name_len) == 0)
2872 /* Yep, this is the entry. */
2873 return elem;
2874 }
2875 return -1;
3b0bdc72
UD
2876 }
2877
a532a41d 2878 /* Local function for parse_bracket_exp used in _LIBC environment.
3b0bdc72
UD
2879 Look up the collation sequence value of BR_ELEM.
2880 Return the value if succeeded, UINT_MAX otherwise. */
2881
7090d3ca 2882 auto inline unsigned int
d3821ab0 2883 __attribute__ ((always_inline))
7e2f0d2d 2884 lookup_collation_sequence_value (bracket_elem_t *br_elem)
3b0bdc72
UD
2885 {
2886 if (br_elem->type == SB_CHAR)
15a7d175
UD
2887 {
2888 /*
2889 if (MB_CUR_MAX == 1)
2890 */
2891 if (nrules == 0)
2892 return collseqmb[br_elem->opr.ch];
2893 else
2894 {
2895 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2896 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2897 }
2898 }
3b0bdc72 2899 else if (br_elem->type == MB_CHAR)
15a7d175 2900 {
a532a41d
UD
2901 if (nrules != 0)
2902 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2903 }
3b0bdc72 2904 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2905 {
2906 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2907 if (nrules != 0)
2908 {
2909 int32_t elem, idx;
2910 elem = seek_collating_symbol_entry (br_elem->opr.name,
2911 sym_name_len);
7e2f0d2d 2912 if (elem != -1)
15a7d175
UD
2913 {
2914 /* We found the entry. */
2915 idx = symb_table[2 * elem + 1];
2916 /* Skip the name of collating element name. */
2917 idx += 1 + extra[idx];
2918 /* Skip the byte sequence of the collating element. */
2919 idx += 1 + extra[idx];
2920 /* Adjust for the alignment. */
2921 idx = (idx + 3) & ~3;
2922 /* Skip the multibyte collation sequence value. */
2923 idx += sizeof (unsigned int);
2924 /* Skip the wide char sequence of the collating element. */
2925 idx += sizeof (unsigned int) *
2926 (1 + *(unsigned int *) (extra + idx));
2927 /* Return the collation sequence value. */
2928 return *(unsigned int *) (extra + idx);
2929 }
7e2f0d2d 2930 else if (sym_name_len == 1)
15a7d175
UD
2931 {
2932 /* No valid character. Match it as a single byte
2933 character. */
2934 return collseqmb[br_elem->opr.name[0]];
2935 }
2936 }
2937 else if (sym_name_len == 1)
2938 return collseqmb[br_elem->opr.name[0]];
2939 }
3b0bdc72
UD
2940 return UINT_MAX;
2941 }
2942
d3821ab0 2943 /* Local function for parse_bracket_exp used in _LIBC environment.
3b0bdc72
UD
2944 Build the range expression which starts from START_ELEM, and ends
2945 at END_ELEM. The result are written to MBCSET and SBCSET.
2946 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
d3821ab0 2947 mbcset->range_ends, is a pointer argument since we may
3b0bdc72
UD
2948 update it. */
2949
7090d3ca 2950 auto inline reg_errcode_t
d3821ab0 2951 __attribute__ ((always_inline))
7e2f0d2d
AS
2952 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2953 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
3b0bdc72
UD
2954 {
2955 unsigned int ch;
2956 uint32_t start_collseq;
2957 uint32_t end_collseq;
2958
434d3784 2959 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2960 start/end. */
f4efbdfb
PE
2961 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2962 || start_elem->type == CHAR_CLASS
2963 || end_elem->type == EQUIV_CLASS
2964 || end_elem->type == CHAR_CLASS))
15a7d175 2965 return REG_ERANGE;
3b0bdc72 2966
eb04c213 2967 /* FIXME: Implement rational ranges here, too. */
3b0bdc72
UD
2968 start_collseq = lookup_collation_sequence_value (start_elem);
2969 end_collseq = lookup_collation_sequence_value (end_elem);
2970 /* Check start/end collation sequence values. */
f4efbdfb
PE
2971 if (__glibc_unlikely (start_collseq == UINT_MAX
2972 || end_collseq == UINT_MAX))
15a7d175 2973 return REG_ECOLLATE;
f4efbdfb
PE
2974 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2975 && start_collseq > end_collseq))
15a7d175 2976 return REG_ERANGE;
3b0bdc72 2977
10677727
UD
2978 /* Got valid collation sequence values, add them as a new entry.
2979 However, if we have no collation elements, and the character set
2980 is single byte, the single byte character set that we
2981 build below suffices. */
2982 if (nrules > 0 || dfa->mb_cur_max > 1)
2983 {
21f5de55 2984 /* Check the space of the arrays. */
f4efbdfb 2985 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
10677727
UD
2986 {
2987 /* There is not enough space, need realloc. */
2988 uint32_t *new_array_start;
2989 uint32_t *new_array_end;
eb04c213 2990 Idx new_nranges;
10677727
UD
2991
2992 /* +1 in case of mbcset->nranges is 0. */
2993 new_nranges = 2 * mbcset->nranges + 1;
2994 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2995 new_nranges);
2996 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
21f5de55 2997 new_nranges);
10677727 2998
f4efbdfb
PE
2999 if (__glibc_unlikely (new_array_start == NULL
3000 || new_array_end == NULL))
21f5de55 3001 return REG_ESPACE;
10677727
UD
3002
3003 mbcset->range_starts = new_array_start;
3004 mbcset->range_ends = new_array_end;
3005 *range_alloc = new_nranges;
3006 }
3007
21f5de55
PE
3008 mbcset->range_starts[mbcset->nranges] = start_collseq;
3009 mbcset->range_ends[mbcset->nranges++] = end_collseq;
10677727 3010 }
3b0bdc72
UD
3011
3012 /* Build the table for single byte characters. */
1e7947dc 3013 for (ch = 0; ch < SBC_MAX; ch++)
15a7d175
UD
3014 {
3015 uint32_t ch_collseq;
3016 /*
3017 if (MB_CUR_MAX == 1)
3018 */
3019 if (nrules == 0)
3020 ch_collseq = collseqmb[ch];
3021 else
25337753 3022 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
3023 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3024 bitset_set (sbcset, ch);
3025 }
3b0bdc72
UD
3026 return REG_NOERROR;
3027 }
3b0bdc72 3028
d3821ab0 3029 /* Local function for parse_bracket_exp used in _LIBC environment.
3b0bdc72
UD
3030 Build the collating element which is represented by NAME.
3031 The result are written to MBCSET and SBCSET.
3032 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
d3821ab0 3033 pointer argument since we may update it. */
3b0bdc72 3034
7090d3ca 3035 auto inline reg_errcode_t
d3821ab0 3036 __attribute__ ((always_inline))
7e2f0d2d 3037 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
eb04c213 3038 Idx *coll_sym_alloc, const unsigned char *name)
3b0bdc72 3039 {
3b0bdc72 3040 int32_t elem, idx;
62439eac 3041 size_t name_len = strlen ((const char *) name);
3b0bdc72 3042 if (nrules != 0)
15a7d175
UD
3043 {
3044 elem = seek_collating_symbol_entry (name, name_len);
7e2f0d2d 3045 if (elem != -1)
15a7d175
UD
3046 {
3047 /* We found the entry. */
3048 idx = symb_table[2 * elem + 1];
3049 /* Skip the name of collating element name. */
3050 idx += 1 + extra[idx];
3051 }
7e2f0d2d 3052 else if (name_len == 1)
15a7d175
UD
3053 {
3054 /* No valid character, treat it as a normal
3055 character. */
3056 bitset_set (sbcset, name[0]);
3057 return REG_NOERROR;
3058 }
3059 else
3060 return REG_ECOLLATE;
3b0bdc72 3061
15a7d175
UD
3062 /* Got valid collation sequence, add it as a new entry. */
3063 /* Check the space of the arrays. */
f4efbdfb 3064 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
15a7d175
UD
3065 {
3066 /* Not enough, realloc it. */
3067 /* +1 in case of mbcset->ncoll_syms is 0. */
eb04c213 3068 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
15a7d175
UD
3069 /* Use realloc since mbcset->coll_syms is NULL
3070 if *alloc == 0. */
951d6408
UD
3071 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3072 new_coll_sym_alloc);
f4efbdfb 3073 if (__glibc_unlikely (new_coll_syms == NULL))
15a7d175 3074 return REG_ESPACE;
951d6408
UD
3075 mbcset->coll_syms = new_coll_syms;
3076 *coll_sym_alloc = new_coll_sym_alloc;
15a7d175
UD
3077 }
3078 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
15a7d175
UD
3079 return REG_NOERROR;
3080 }
3b0bdc72 3081 else
15a7d175 3082 {
f4efbdfb 3083 if (__glibc_unlikely (name_len != 1))
15a7d175
UD
3084 return REG_ECOLLATE;
3085 else
3086 {
3087 bitset_set (sbcset, name[0]);
3088 return REG_NOERROR;
3089 }
3090 }
3b0bdc72 3091 }
434d3784
UD
3092#endif
3093
3b0bdc72
UD
3094 re_token_t br_token;
3095 re_bitset_ptr_t sbcset;
c0a0f9a3 3096#ifdef RE_ENABLE_I18N
3b0bdc72 3097 re_charset_t *mbcset;
eb04c213
AZ
3098 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3099 Idx equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3 3100#endif /* not RE_ENABLE_I18N */
eb04c213 3101 bool non_match = false;
c0a0f9a3 3102 bin_tree_t *work_tree;
ee70274a 3103 int token_len;
eb04c213 3104 bool first_round = true;
3b0bdc72 3105#ifdef _LIBC
62439eac
UD
3106 collseqmb = (const unsigned char *)
3107 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
3108 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3109 if (nrules)
3110 {
3111 /*
3112 if (MB_CUR_MAX > 1)
3113 */
7d4722e3 3114 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
3115 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3116 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3117 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 3118 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3119 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
3120 }
3121#endif
2c05d33f 3122 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
c0a0f9a3 3123#ifdef RE_ENABLE_I18N
3b0bdc72 3124 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3125#endif /* RE_ENABLE_I18N */
3126#ifdef RE_ENABLE_I18N
f4efbdfb 3127 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
c0a0f9a3 3128#else
f4efbdfb 3129 if (__glibc_unlikely (sbcset == NULL))
c0a0f9a3 3130#endif /* RE_ENABLE_I18N */
3b0bdc72 3131 {
a129c80d
UD
3132 re_free (sbcset);
3133#ifdef RE_ENABLE_I18N
3134 re_free (mbcset);
3135#endif
3b0bdc72
UD
3136 *err = REG_ESPACE;
3137 return NULL;
3138 }
3139
3140 token_len = peek_token_bracket (token, regexp, syntax);
f4efbdfb 3141 if (__glibc_unlikely (token->type == END_OF_RE))
3b0bdc72 3142 {
3b0bdc72 3143 *err = REG_BADPAT;
434d3784 3144 goto parse_bracket_exp_free_return;
3b0bdc72
UD
3145 }
3146 if (token->type == OP_NON_MATCH_LIST)
3147 {
c0a0f9a3 3148#ifdef RE_ENABLE_I18N
3b0bdc72 3149 mbcset->non_match = 1;
c0a0f9a3 3150#endif /* not RE_ENABLE_I18N */
eb04c213 3151 non_match = true;
3b0bdc72 3152 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
784aacea 3153 bitset_set (sbcset, '\n');
3b0bdc72
UD
3154 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3155 token_len = peek_token_bracket (token, regexp, syntax);
f4efbdfb 3156 if (__glibc_unlikely (token->type == END_OF_RE))
15a7d175
UD
3157 {
3158 *err = REG_BADPAT;
3159 goto parse_bracket_exp_free_return;
3160 }
3b0bdc72
UD
3161 }
3162
3163 /* We treat the first ']' as a normal character. */
3164 if (token->type == OP_CLOSE_BRACKET)
3165 token->type = CHARACTER;
3166
3167 while (1)
3168 {
3169 bracket_elem_t start_elem, end_elem;
62439eac
UD
3170 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3171 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72 3172 reg_errcode_t ret;
eb04c213
AZ
3173 int token_len2 = 0;
3174 bool is_range_exp = false;
3b0bdc72
UD
3175 re_token_t token2;
3176
3177 start_elem.opr.name = start_name_buf;
39a12f8d 3178 start_elem.type = COLL_SYM;
3b0bdc72 3179 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
78d8b07a 3180 syntax, first_round);
f4efbdfb 3181 if (__glibc_unlikely (ret != REG_NOERROR))
15a7d175
UD
3182 {
3183 *err = ret;
3184 goto parse_bracket_exp_free_return;
3185 }
eb04c213 3186 first_round = false;
3b0bdc72 3187
78d8b07a 3188 /* Get information about the next token. We need it in any case. */
3b0bdc72 3189 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a
UD
3190
3191 /* Do not check for ranges if we know they are not allowed. */
3192 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
15a7d175 3193 {
f4efbdfb 3194 if (__glibc_unlikely (token->type == END_OF_RE))
15a7d175 3195 {
78d8b07a 3196 *err = REG_EBRACK;
15a7d175
UD
3197 goto parse_bracket_exp_free_return;
3198 }
78d8b07a 3199 if (token->type == OP_CHARSET_RANGE)
15a7d175 3200 {
78d8b07a
UD
3201 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3202 token_len2 = peek_token_bracket (&token2, regexp, syntax);
f4efbdfb 3203 if (__glibc_unlikely (token2.type == END_OF_RE))
78d8b07a
UD
3204 {
3205 *err = REG_EBRACK;
3206 goto parse_bracket_exp_free_return;
3207 }
3208 if (token2.type == OP_CLOSE_BRACKET)
3209 {
3210 /* We treat the last '-' as a normal character. */
3211 re_string_skip_bytes (regexp, -token_len);
3212 token->type = CHARACTER;
3213 }
3214 else
eb04c213 3215 is_range_exp = true;
15a7d175 3216 }
15a7d175 3217 }
3b0bdc72 3218
eb04c213 3219 if (is_range_exp == true)
15a7d175
UD
3220 {
3221 end_elem.opr.name = end_name_buf;
39a12f8d 3222 end_elem.type = COLL_SYM;
15a7d175 3223 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
eb04c213 3224 dfa, syntax, true);
f4efbdfb 3225 if (__glibc_unlikely (ret != REG_NOERROR))
15a7d175
UD
3226 {
3227 *err = ret;
3228 goto parse_bracket_exp_free_return;
3229 }
3230
3231 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a 3232
10677727
UD
3233#ifdef _LIBC
3234 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3235 &start_elem, &end_elem);
3236#else
3237# ifdef RE_ENABLE_I18N
eb04c213 3238 *err = build_range_exp (syntax, sbcset,
10677727
UD
3239 dfa->mb_cur_max > 1 ? mbcset : NULL,
3240 &range_alloc, &start_elem, &end_elem);
3241# else
eb04c213 3242 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
10677727 3243# endif
c0a0f9a3 3244#endif /* RE_ENABLE_I18N */
f4efbdfb 3245 if (__glibc_unlikely (*err != REG_NOERROR))
15a7d175
UD
3246 goto parse_bracket_exp_free_return;
3247 }
3b0bdc72 3248 else
15a7d175
UD
3249 {
3250 switch (start_elem.type)
3251 {
3252 case SB_CHAR:
3253 bitset_set (sbcset, start_elem.opr.ch);
3254 break;
c0a0f9a3 3255#ifdef RE_ENABLE_I18N
15a7d175
UD
3256 case MB_CHAR:
3257 /* Check whether the array has enough space. */
f4efbdfb 3258 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
15a7d175 3259 {
951d6408 3260 wchar_t *new_mbchars;
15a7d175
UD
3261 /* Not enough, realloc it. */
3262 /* +1 in case of mbcset->nmbchars is 0. */
3263 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3264 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3265 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3266 mbchar_alloc);
f4efbdfb 3267 if (__glibc_unlikely (new_mbchars == NULL))
15a7d175 3268 goto parse_bracket_exp_espace;
951d6408 3269 mbcset->mbchars = new_mbchars;
15a7d175
UD
3270 }
3271 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3272 break;
c0a0f9a3 3273#endif /* RE_ENABLE_I18N */
15a7d175
UD
3274 case EQUIV_CLASS:
3275 *err = build_equiv_class (sbcset,
c0a0f9a3 3276#ifdef RE_ENABLE_I18N
15a7d175 3277 mbcset, &equiv_class_alloc,
c0a0f9a3 3278#endif /* RE_ENABLE_I18N */
3b0bdc72 3279 start_elem.opr.name);
f4efbdfb 3280 if (__glibc_unlikely (*err != REG_NOERROR))
15a7d175
UD
3281 goto parse_bracket_exp_free_return;
3282 break;
3283 case COLL_SYM:
3284 *err = build_collating_symbol (sbcset,
c0a0f9a3 3285#ifdef RE_ENABLE_I18N
15a7d175 3286 mbcset, &coll_sym_alloc,
c0a0f9a3 3287#endif /* RE_ENABLE_I18N */
3b0bdc72 3288 start_elem.opr.name);
f4efbdfb 3289 if (__glibc_unlikely (*err != REG_NOERROR))
15a7d175
UD
3290 goto parse_bracket_exp_free_return;
3291 break;
3292 case CHAR_CLASS:
66b110e8 3293 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3294#ifdef RE_ENABLE_I18N
7b7b9e70 3295 mbcset, &char_class_alloc,
c0a0f9a3 3296#endif /* RE_ENABLE_I18N */
eb04c213
AZ
3297 (const char *) start_elem.opr.name,
3298 syntax);
f4efbdfb 3299 if (__glibc_unlikely (*err != REG_NOERROR))
7b7b9e70 3300 goto parse_bracket_exp_free_return;
15a7d175
UD
3301 break;
3302 default:
2a0356e1 3303 DEBUG_ASSERT (false);
15a7d175
UD
3304 break;
3305 }
3306 }
f4efbdfb 3307 if (__glibc_unlikely (token->type == END_OF_RE))
78d8b07a
UD
3308 {
3309 *err = REG_EBRACK;
3310 goto parse_bracket_exp_free_return;
3311 }
3b0bdc72 3312 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3313 break;
3b0bdc72
UD
3314 }
3315
3316 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3317
3318 /* If it is non-matching list. */
c0a0f9a3 3319 if (non_match)
3b0bdc72 3320 bitset_not (sbcset);
10677727 3321
65e6becf
UD
3322#ifdef RE_ENABLE_I18N
3323 /* Ensure only single byte characters are set. */
3324 if (dfa->mb_cur_max > 1)
3325 bitset_mask (sbcset, dfa->sb_char);
3b0bdc72
UD
3326
3327 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3328 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3329 || mbcset->non_match)))
3b0bdc72 3330 {
3b0bdc72 3331 bin_tree_t *mbc_tree;
ad7f28c2 3332 int sbc_idx;
3b0bdc72 3333 /* Build a tree for complex bracket. */
ad7f28c2 3334 dfa->has_mb_node = 1;
02f3550c
UD
3335 br_token.type = COMPLEX_BRACKET;
3336 br_token.opr.mbcset = mbcset;
3337 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3338 if (__glibc_unlikely (mbc_tree == NULL))
02f3550c 3339 goto parse_bracket_exp_espace;
2c05d33f 3340 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
ad7f28c2
UD
3341 if (sbcset[sbc_idx])
3342 break;
3343 /* If there are no bits set in sbcset, there is no point
3344 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
2c05d33f 3345 if (sbc_idx < BITSET_WORDS)
02f3550c 3346 {
21f5de55
PE
3347 /* Build a tree for simple bracket. */
3348 br_token.type = SIMPLE_BRACKET;
3349 br_token.opr.sbcset = sbcset;
3350 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3351 if (__glibc_unlikely (work_tree == NULL))
21f5de55 3352 goto parse_bracket_exp_espace;
02f3550c 3353
21f5de55
PE
3354 /* Then join them by ALT node. */
3355 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
f4efbdfb 3356 if (__glibc_unlikely (work_tree == NULL))
21f5de55 3357 goto parse_bracket_exp_espace;
02f3550c
UD
3358 }
3359 else
ad7f28c2
UD
3360 {
3361 re_free (sbcset);
02f3550c 3362 work_tree = mbc_tree;
ad7f28c2 3363 }
3b0bdc72
UD
3364 }
3365 else
963d8d78 3366#endif /* not RE_ENABLE_I18N */
3b0bdc72 3367 {
963d8d78
UD
3368#ifdef RE_ENABLE_I18N
3369 free_charset (mbcset);
3370#endif
02f3550c
UD
3371 /* Build a tree for simple bracket. */
3372 br_token.type = SIMPLE_BRACKET;
3373 br_token.opr.sbcset = sbcset;
3374 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3375 if (__glibc_unlikely (work_tree == NULL))
21f5de55 3376 goto parse_bracket_exp_espace;
3b0bdc72 3377 }
02f3550c 3378 return work_tree;
3b0bdc72
UD
3379
3380 parse_bracket_exp_espace:
3b0bdc72 3381 *err = REG_ESPACE;
434d3784
UD
3382 parse_bracket_exp_free_return:
3383 re_free (sbcset);
c0a0f9a3 3384#ifdef RE_ENABLE_I18N
434d3784 3385 free_charset (mbcset);
c0a0f9a3 3386#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3387 return NULL;
3388}
3389
434d3784
UD
3390/* Parse an element in the bracket expression. */
3391
3b0bdc72 3392static reg_errcode_t
0fd8ae9c
UD
3393parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3394 re_token_t *token, int token_len, re_dfa_t *dfa,
eb04c213 3395 reg_syntax_t syntax, bool accept_hyphen)
3b0bdc72
UD
3396{
3397#ifdef RE_ENABLE_I18N
3398 int cur_char_size;
3399 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3400 if (cur_char_size > 1)
3401 {
3402 elem->type = MB_CHAR;
3403 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3404 re_string_skip_bytes (regexp, cur_char_size);
3405 return REG_NOERROR;
3406 }
3407#endif /* RE_ENABLE_I18N */
3408 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3409 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3410 || token->type == OP_OPEN_EQUIV_CLASS)
3411 return parse_bracket_symbol (elem, regexp, token);
f4efbdfb 3412 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
78d8b07a
UD
3413 {
3414 /* A '-' must only appear as anything but a range indicator before
3415 the closing bracket. Everything else is an error. */
3416 re_token_t token2;
3417 (void) peek_token_bracket (&token2, regexp, syntax);
3418 if (token2.type != OP_CLOSE_BRACKET)
3419 /* The actual error value is not standardized since this whole
3420 case is undefined. But ERANGE makes good sense. */
3421 return REG_ERANGE;
3422 }
3b0bdc72
UD
3423 elem->type = SB_CHAR;
3424 elem->opr.ch = token->opr.c;
3425 return REG_NOERROR;
3426}
3427
434d3784
UD
3428/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3429 such as [:<character_class>:], [.<collating_element>.], and
3430 [=<equivalent_class>=]. */
3431
3b0bdc72 3432static reg_errcode_t
0fd8ae9c
UD
3433parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3434 re_token_t *token)
3b0bdc72
UD
3435{
3436 unsigned char ch, delim = token->opr.c;
3437 int i = 0;
294b6bcc
UD
3438 if (re_string_eoi(regexp))
3439 return REG_EBRACK;
602c2f9d 3440 for (;; ++i)
3b0bdc72 3441 {
294b6bcc 3442 if (i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3443 return REG_EBRACK;
3b0bdc72 3444 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3445 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3446 else
15a7d175 3447 ch = re_string_fetch_byte (regexp);
294b6bcc
UD
3448 if (re_string_eoi(regexp))
3449 return REG_EBRACK;
3b0bdc72 3450 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3451 break;
3b0bdc72
UD
3452 elem->opr.name[i] = ch;
3453 }
3454 re_string_skip_bytes (regexp, 1);
3455 elem->opr.name[i] = '\0';
3456 switch (token->type)
3457 {
3458 case OP_OPEN_COLL_ELEM:
3459 elem->type = COLL_SYM;
3460 break;
3461 case OP_OPEN_EQUIV_CLASS:
3462 elem->type = EQUIV_CLASS;
3463 break;
3464 case OP_OPEN_CHAR_CLASS:
3465 elem->type = CHAR_CLASS;
3466 break;
3467 default:
3468 break;
3469 }
3470 return REG_NOERROR;
3471}
3472
3473 /* Helper function for parse_bracket_exp.
3474 Build the equivalence class which is represented by NAME.
3475 The result are written to MBCSET and SBCSET.
3476 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
d3821ab0 3477 is a pointer argument since we may update it. */
3b0bdc72
UD
3478
3479static reg_errcode_t
c0a0f9a3 3480#ifdef RE_ENABLE_I18N
0fd8ae9c 3481build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
eb04c213 3482 Idx *equiv_class_alloc, const unsigned char *name)
c0a0f9a3 3483#else /* not RE_ENABLE_I18N */
0fd8ae9c 3484build_equiv_class (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 3485#endif /* not RE_ENABLE_I18N */
3b0bdc72 3486{
0fd8ae9c 3487#ifdef _LIBC
3b0bdc72
UD
3488 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3489 if (nrules != 0)
3490 {
3491 const int32_t *table, *indirect;
3492 const unsigned char *weights, *extra, *cp;
62439eac 3493 unsigned char char_buf[2];
3b0bdc72
UD
3494 int32_t idx1, idx2;
3495 unsigned int ch;
3496 size_t len;
3b0bdc72 3497 /* Calculate the index for equivalence class. */
62439eac 3498 cp = name;
3b0bdc72
UD
3499 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3500 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3501 _NL_COLLATE_WEIGHTMB);
3502 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3503 _NL_COLLATE_EXTRAMB);
3b0bdc72 3504 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3505 _NL_COLLATE_INDIRECTMB);
8c0ab919 3506 idx1 = findidx (table, indirect, extra, &cp, -1);
f4efbdfb 3507 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
15a7d175
UD
3508 /* This isn't a valid character. */
3509 return REG_ECOLLATE;
3b0bdc72 3510
5069ff32 3511 /* Build single byte matching table for this equivalence class. */
b7d1c5fa 3512 len = weights[idx1 & 0xffffff];
3b0bdc72 3513 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3514 {
3515 char_buf[0] = ch;
3516 cp = char_buf;
8c0ab919 3517 idx2 = findidx (table, indirect, extra, &cp, 1);
3b0bdc72 3518/*
15a7d175 3519 idx2 = table[ch];
3b0bdc72 3520*/
15a7d175
UD
3521 if (idx2 == 0)
3522 /* This isn't a valid character. */
3523 continue;
b7d1c5fa
UD
3524 /* Compare only if the length matches and the collation rule
3525 index is the same. */
786658a0
FW
3526 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3527 && memcmp (weights + (idx1 & 0xffffff) + 1,
3528 weights + (idx2 & 0xffffff) + 1, len) == 0)
3529 bitset_set (sbcset, ch);
15a7d175 3530 }
a9388965 3531 /* Check whether the array has enough space. */
f4efbdfb 3532 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
15a7d175
UD
3533 {
3534 /* Not enough, realloc it. */
3535 /* +1 in case of mbcset->nequiv_classes is 0. */
eb04c213 3536 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
15a7d175 3537 /* Use realloc since the array is NULL if *alloc == 0. */
951d6408
UD
3538 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3539 int32_t,
3540 new_equiv_class_alloc);
f4efbdfb 3541 if (__glibc_unlikely (new_equiv_classes == NULL))
15a7d175 3542 return REG_ESPACE;
951d6408
UD
3543 mbcset->equiv_classes = new_equiv_classes;
3544 *equiv_class_alloc = new_equiv_class_alloc;
15a7d175 3545 }
3b0bdc72
UD
3546 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3547 }
3548 else
10677727 3549#endif /* _LIBC */
3b0bdc72 3550 {
f4efbdfb 3551 if (__glibc_unlikely (strlen ((const char *) name) != 1))
15a7d175 3552 return REG_ECOLLATE;
62439eac 3553 bitset_set (sbcset, *name);
3b0bdc72
UD
3554 }
3555 return REG_NOERROR;
3556}
3557
3558 /* Helper function for parse_bracket_exp.
3559 Build the character class which is represented by NAME.
3560 The result are written to MBCSET and SBCSET.
3561 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
d3821ab0 3562 is a pointer argument since we may update it. */
3b0bdc72
UD
3563
3564static reg_errcode_t
c0a0f9a3 3565#ifdef RE_ENABLE_I18N
0fd8ae9c 3566build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
eb04c213
AZ
3567 re_charset_t *mbcset, Idx *char_class_alloc,
3568 const char *class_name, reg_syntax_t syntax)
c0a0f9a3 3569#else /* not RE_ENABLE_I18N */
0fd8ae9c 3570build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
eb04c213 3571 const char *class_name, reg_syntax_t syntax)
c0a0f9a3 3572#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3573{
3574 int i;
eb04c213 3575 const char *name = class_name;
c0a0f9a3
UD
3576
3577 /* In case of REG_ICASE "upper" and "lower" match the both of
3578 upper and lower cases. */
3579 if ((syntax & RE_ICASE)
3580 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3581 name = "alpha";
3582
3583#ifdef RE_ENABLE_I18N
3b0bdc72 3584 /* Check the space of the arrays. */
f4efbdfb 3585 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
a9388965
UD
3586 {
3587 /* Not enough, realloc it. */
3588 /* +1 in case of mbcset->nchar_classes is 0. */
eb04c213 3589 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
a9388965 3590 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3591 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3592 new_char_class_alloc);
f4efbdfb 3593 if (__glibc_unlikely (new_char_classes == NULL))
15a7d175 3594 return REG_ESPACE;
951d6408
UD
3595 mbcset->char_classes = new_char_classes;
3596 *char_class_alloc = new_char_class_alloc;
a9388965 3597 }
3b0bdc72 3598 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3599#endif /* RE_ENABLE_I18N */
3b0bdc72 3600
66b110e8 3601#define BUILD_CHARCLASS_LOOP(ctype_func) \
513bbb25 3602 do { \
f4efbdfb 3603 if (__glibc_unlikely (trans != NULL)) \
0ecb606c 3604 { \
513bbb25 3605 for (i = 0; i < SBC_MAX; ++i) \
eb04c213 3606 if (ctype_func (i)) \
513bbb25
UD
3607 bitset_set (sbcset, trans[i]); \
3608 } \
3609 else \
3610 { \
3611 for (i = 0; i < SBC_MAX; ++i) \
eb04c213 3612 if (ctype_func (i)) \
513bbb25
UD
3613 bitset_set (sbcset, i); \
3614 } \
3615 } while (0)
3b0bdc72
UD
3616
3617 if (strcmp (name, "alnum") == 0)
513bbb25 3618 BUILD_CHARCLASS_LOOP (isalnum);
3b0bdc72 3619 else if (strcmp (name, "cntrl") == 0)
513bbb25 3620 BUILD_CHARCLASS_LOOP (iscntrl);
3b0bdc72 3621 else if (strcmp (name, "lower") == 0)
513bbb25 3622 BUILD_CHARCLASS_LOOP (islower);
3b0bdc72 3623 else if (strcmp (name, "space") == 0)
513bbb25 3624 BUILD_CHARCLASS_LOOP (isspace);
3b0bdc72 3625 else if (strcmp (name, "alpha") == 0)
513bbb25 3626 BUILD_CHARCLASS_LOOP (isalpha);
3b0bdc72 3627 else if (strcmp (name, "digit") == 0)
513bbb25 3628 BUILD_CHARCLASS_LOOP (isdigit);
3b0bdc72 3629 else if (strcmp (name, "print") == 0)
513bbb25 3630 BUILD_CHARCLASS_LOOP (isprint);
3b0bdc72 3631 else if (strcmp (name, "upper") == 0)
513bbb25 3632 BUILD_CHARCLASS_LOOP (isupper);
3b0bdc72 3633 else if (strcmp (name, "blank") == 0)
513bbb25 3634 BUILD_CHARCLASS_LOOP (isblank);
3b0bdc72 3635 else if (strcmp (name, "graph") == 0)
513bbb25 3636 BUILD_CHARCLASS_LOOP (isgraph);
3b0bdc72 3637 else if (strcmp (name, "punct") == 0)
513bbb25 3638 BUILD_CHARCLASS_LOOP (ispunct);
3b0bdc72 3639 else if (strcmp (name, "xdigit") == 0)
513bbb25 3640 BUILD_CHARCLASS_LOOP (isxdigit);
3b0bdc72
UD
3641 else
3642 return REG_ECTYPE;
3643
3644 return REG_NOERROR;
3645}
3646
3647static bin_tree_t *
0fd8ae9c 3648build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
eb04c213
AZ
3649 const char *class_name,
3650 const char *extra, bool non_match,
0fd8ae9c 3651 reg_errcode_t *err)
3b0bdc72
UD
3652{
3653 re_bitset_ptr_t sbcset;
c0a0f9a3 3654#ifdef RE_ENABLE_I18N
3b0bdc72 3655 re_charset_t *mbcset;
eb04c213 3656 Idx alloc = 0;
c0a0f9a3 3657#endif /* not RE_ENABLE_I18N */
3b0bdc72 3658 reg_errcode_t ret;
3b0bdc72 3659 bin_tree_t *tree;
3b0bdc72 3660
2c05d33f 3661 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
f4efbdfb 3662 if (__glibc_unlikely (sbcset == NULL))
3b0bdc72
UD
3663 {
3664 *err = REG_ESPACE;
3665 return NULL;
3666 }
c0a0f9a3 3667#ifdef RE_ENABLE_I18N
eb04c213 3668 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
f4efbdfb 3669 if (__glibc_unlikely (mbcset == NULL))
eb04c213
AZ
3670 {
3671 re_free (sbcset);
3672 *err = REG_ESPACE;
3673 return NULL;
3b0bdc72 3674 }
eb04c213
AZ
3675 mbcset->non_match = non_match;
3676#endif /* RE_ENABLE_I18N */
3b0bdc72 3677
602c2f9d 3678 /* We don't care the syntax in this case. */
66b110e8 3679 ret = build_charclass (trans, sbcset,
c0a0f9a3 3680#ifdef RE_ENABLE_I18N
15a7d175 3681 mbcset, &alloc,
c0a0f9a3 3682#endif /* RE_ENABLE_I18N */
e2b6bfa3 3683 class_name, 0);
c0a0f9a3 3684
f4efbdfb 3685 if (__glibc_unlikely (ret != REG_NOERROR))
3b0bdc72
UD
3686 {
3687 re_free (sbcset);
c0a0f9a3 3688#ifdef RE_ENABLE_I18N
3b0bdc72 3689 free_charset (mbcset);
c0a0f9a3 3690#endif /* RE_ENABLE_I18N */
7b7b9e70 3691 *err = ret;
3b0bdc72
UD
3692 return NULL;
3693 }
434d3784 3694 /* \w match '_' also. */
e2b6bfa3
UD
3695 for (; *extra; extra++)
3696 bitset_set (sbcset, *extra);
3b0bdc72
UD
3697
3698 /* If it is non-matching list. */
c0a0f9a3 3699 if (non_match)
3b0bdc72
UD
3700 bitset_not (sbcset);
3701
65e6becf
UD
3702#ifdef RE_ENABLE_I18N
3703 /* Ensure only single byte characters are set. */
3704 if (dfa->mb_cur_max > 1)
3705 bitset_mask (sbcset, dfa->sb_char);
3706#endif
3707
3b0bdc72 3708 /* Build a tree for simple bracket. */
2a0356e1 3709 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
02f3550c 3710 tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3711 if (__glibc_unlikely (tree == NULL))
3b0bdc72
UD
3712 goto build_word_op_espace;
3713
c0a0f9a3 3714#ifdef RE_ENABLE_I18N
3c0fb574 3715 if (dfa->mb_cur_max > 1)
3b0bdc72 3716 {
3b0bdc72
UD
3717 bin_tree_t *mbc_tree;
3718 /* Build a tree for complex bracket. */
3719 br_token.type = COMPLEX_BRACKET;
3720 br_token.opr.mbcset = mbcset;
3721 dfa->has_mb_node = 1;
02f3550c 3722 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3723 if (__glibc_unlikely (mbc_tree == NULL))
15a7d175 3724 goto build_word_op_espace;
3b0bdc72 3725 /* Then join them by ALT node. */
02f3550c 3726 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
f4efbdfb 3727 if (__glibc_likely (mbc_tree != NULL))
15a7d175 3728 return tree;
3b0bdc72
UD
3729 }
3730 else
3731 {
3732 free_charset (mbcset);
3733 return tree;
3734 }
c0a0f9a3
UD
3735#else /* not RE_ENABLE_I18N */
3736 return tree;
3737#endif /* not RE_ENABLE_I18N */
3738
3b0bdc72
UD
3739 build_word_op_espace:
3740 re_free (sbcset);
c0a0f9a3 3741#ifdef RE_ENABLE_I18N
3b0bdc72 3742 free_charset (mbcset);
c0a0f9a3 3743#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3744 *err = REG_ESPACE;
3745 return NULL;
3746}
3747
3748/* This is intended for the expressions like "a{1,3}".
eb04c213
AZ
3749 Fetch a number from 'input', and return the number.
3750 Return -1 if the number field is empty like "{,1}".
3751 Return RE_DUP_MAX + 1 if the number field is too large.
3752 Return -2 if an error occurred. */
3b0bdc72 3753
eb04c213 3754static Idx
0fd8ae9c 3755fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3b0bdc72 3756{
eb04c213 3757 Idx num = -1;
3b0bdc72
UD
3758 unsigned char c;
3759 while (1)
3760 {
f0d77aa8 3761 fetch_token (token, input, syntax);
3b0bdc72 3762 c = token->opr.c;
f4efbdfb 3763 if (__glibc_unlikely (token->type == END_OF_RE))
15a7d175 3764 return -2;
3b0bdc72 3765 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3766 break;
602c2f9d 3767 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
eb04c213
AZ
3768 ? -2
3769 : num == -1
3770 ? c - '0'
3771 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3b0bdc72 3772 }
3b0bdc72
UD
3773 return num;
3774}
3775\f
#ifdef RE_ENABLE_I18N
/* Free every member array of CSET and then CSET itself.  */

static void
free_charset (re_charset_t *cset)
{
  re_free (cset->char_classes);
  re_free (cset->range_ends);
  re_free (cset->range_starts);
# ifdef _LIBC
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->mbchars);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3791\f
3792/* Functions for binary tree operation. */
3793
ee70274a 3794/* Create a tree node. */
3b0bdc72
UD
3795
3796static bin_tree_t *
0fd8ae9c
UD
3797create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3798 re_token_type_t type)
02f3550c 3799{
2a0356e1 3800 re_token_t t = { .type = type };
02f3550c
UD
3801 return create_token_tree (dfa, left, right, &t);
3802}
3803
3804static bin_tree_t *
0fd8ae9c
UD
3805create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3806 const re_token_t *token)
3b0bdc72
UD
3807{
3808 bin_tree_t *tree;
f4efbdfb 3809 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3b0bdc72 3810 {
ee70274a
UD
3811 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3812
3813 if (storage == NULL)
3814 return NULL;
3815 storage->next = dfa->str_tree_storage;
3816 dfa->str_tree_storage = storage;
3817 dfa->str_tree_storage_idx = 0;
3b0bdc72 3818 }
ee70274a
UD
3819 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3820
3b0bdc72
UD
3821 tree->parent = NULL;
3822 tree->left = left;
3823 tree->right = right;
02f3550c
UD
3824 tree->token = *token;
3825 tree->token.duplicated = 0;
3826 tree->token.opt_subexp = 0;
3827 tree->first = NULL;
3828 tree->next = NULL;
3829 tree->node_idx = -1;
3b0bdc72
UD
3830
3831 if (left != NULL)
3832 left->parent = tree;
3833 if (right != NULL)
3834 right->parent = tree;
3835 return tree;
3836}
3837
02f3550c
UD
3838/* Mark the tree SRC as an optional subexpression.
3839 To be called from preorder or postorder. */
3b0bdc72 3840
02f3550c 3841static reg_errcode_t
0fd8ae9c 3842mark_opt_subexp (void *extra, bin_tree_t *node)
f0d77aa8 3843{
eb04c213 3844 Idx idx = (uintptr_t) extra;
02f3550c
UD
3845 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3846 node->token.opt_subexp = 1;
a334319f 3847
02f3550c 3848 return REG_NOERROR;
3b0bdc72
UD
3849}
3850
02f3550c 3851/* Free the allocated memory inside NODE. */
6b6557e8
UD
3852
3853static void
02f3550c 3854free_token (re_token_t *node)
6b6557e8 3855{
02f3550c
UD
3856#ifdef RE_ENABLE_I18N
3857 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3858 free_charset (node->opr.mbcset);
3859 else
3860#endif /* RE_ENABLE_I18N */
3861 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3862 re_free (node->opr.sbcset);
6b6557e8
UD
3863}
3864
02f3550c
UD
3865/* Worker function for tree walking. Free the allocated memory inside NODE
3866 and its children. */
6b6557e8 3867
02f3550c
UD
3868static reg_errcode_t
3869free_tree (void *extra, bin_tree_t *node)
6b6557e8 3870{
02f3550c
UD
3871 free_token (&node->token);
3872 return REG_NOERROR;
6b6557e8
UD
3873}
3874
ee70274a 3875
02f3550c
UD
3876/* Duplicate the node SRC, and return new node. This is a preorder
3877 visit similar to the one implemented by the generic visitor, but
3878 we need more infrastructure to maintain two parallel trees --- so,
3879 it's easier to duplicate. */
3b0bdc72
UD
3880
3881static bin_tree_t *
0fd8ae9c 3882duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3b0bdc72 3883{
02f3550c
UD
3884 const bin_tree_t *node;
3885 bin_tree_t *dup_root;
3886 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3b0bdc72 3887
02f3550c 3888 for (node = root; ; )
3b0bdc72 3889 {
02f3550c
UD
3890 /* Create a new tree and link it back to the current parent. */
3891 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3892 if (*p_new == NULL)
ee70274a 3893 return NULL;
02f3550c
UD
3894 (*p_new)->parent = dup_node;
3895 (*p_new)->token.duplicated = 1;
3896 dup_node = *p_new;
3b0bdc72 3897
02f3550c
UD
3898 /* Go to the left node, or up and to the right. */
3899 if (node->left)
3900 {
3901 node = node->left;
3902 p_new = &dup_node->left;
3903 }
3904 else
3905 {
3906 const bin_tree_t *prev = NULL;
3907 while (node->right == prev || node->right == NULL)
3908 {
3909 prev = node;
3910 node = node->parent;
3911 dup_node = dup_node->parent;
3912 if (!node)
21f5de55 3913 return dup_root;
02f3550c
UD
3914 }
3915 node = node->right;
3916 p_new = &dup_node->right;
3917 }
3b0bdc72 3918 }
3b0bdc72 3919}