]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
build-many-glibcs.py: Add openrisc hard float glibc variant
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
dff8da6b 2 Copyright (C) 2002-2024 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6 17 License along with the GNU C Library; if not, see
eb04c213 18 <https://www.gnu.org/licenses/>. */
e054f494 19
8c0ab919
RM
20#ifdef _LIBC
21# include <locale/weight.h>
22#endif
23
3b0bdc72 24static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
01ed6ceb 25 size_t length, reg_syntax_t syntax);
3b0bdc72 26static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
27 const re_dfastate_t *init_state,
28 char *fastmap);
01ed6ceb 29static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
c0a0f9a3 30#ifdef RE_ENABLE_I18N
3b0bdc72 31static void free_charset (re_charset_t *cset);
c0a0f9a3 32#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
33static void free_workarea_compile (regex_t *preg);
34static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
35#ifdef RE_ENABLE_I18N
36static void optimize_utf8 (re_dfa_t *dfa);
37#endif
02f3550c 38static reg_errcode_t analyze (regex_t *preg);
02f3550c
UD
39static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
eb04c213
AZ
52static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
a7d5c291 54 unsigned int constraint);
3b0bdc72 55static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 56static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
eb04c213 57 Idx node, bool root);
963d8d78 58static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
eb04c213 59static Idx fetch_number (re_string_t *input, re_token_t *token,
15a7d175 60 reg_syntax_t syntax);
3b0bdc72 61static int peek_token (re_token_t *token, re_string_t *input,
b41bd5bc 62 reg_syntax_t syntax);
3b0bdc72 63static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 64 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 65static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175 66 re_token_t *token, reg_syntax_t syntax,
eb04c213 67 Idx nest, reg_errcode_t *err);
3b0bdc72 68static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175 69 re_token_t *token, reg_syntax_t syntax,
eb04c213 70 Idx nest, reg_errcode_t *err);
3b0bdc72 71static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175 72 re_token_t *token, reg_syntax_t syntax,
eb04c213 73 Idx nest, reg_errcode_t *err);
3b0bdc72 74static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175 75 re_token_t *token, reg_syntax_t syntax,
eb04c213 76 Idx nest, reg_errcode_t *err);
3b0bdc72 77static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 80static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
3b0bdc72 83static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
78d8b07a 87 reg_syntax_t syntax,
eb04c213 88 bool accept_hyphen);
3b0bdc72 89static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
90 re_string_t *regexp,
91 re_token_t *token);
c0a0f9a3 92#ifdef RE_ENABLE_I18N
2c05d33f 93static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175 94 re_charset_t *mbcset,
eb04c213 95 Idx *equiv_class_alloc,
15a7d175 96 const unsigned char *name);
997470b3 97static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 98 bitset_t sbcset,
15a7d175 99 re_charset_t *mbcset,
eb04c213
AZ
100 Idx *char_class_alloc,
101 const char *class_name,
15a7d175 102 reg_syntax_t syntax);
c0a0f9a3 103#else /* not RE_ENABLE_I18N */
2c05d33f 104static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175 105 const unsigned char *name);
997470b3 106static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 107 bitset_t sbcset,
eb04c213 108 const char *class_name,
15a7d175 109 reg_syntax_t syntax);
c0a0f9a3 110#endif /* not RE_ENABLE_I18N */
86576b62 111static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
997470b3 112 RE_TRANSLATE_TYPE trans,
eb04c213
AZ
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
ee70274a
UD
116static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
02f3550c
UD
118 re_token_type_t type);
119static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
3b0bdc72 122static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
02f3550c
UD
123static void free_token (re_token_t *node);
124static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
3b0bdc72
UD
126\f
127/* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
131
eb04c213 132static const char __re_error_msgid[] =
3b0bdc72
UD
133 {
134#define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
eb04c213 156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
3b0bdc72 157 "\0"
eb04c213 158#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
3b0bdc72
UD
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 };
185
eb04c213 186static const size_t __re_error_msgid_idx[] =
3b0bdc72
UD
187 {
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
205 };
206\f
207/* Entry points for GNU code. */
208
209/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
211 Returns 0 if the pattern was valid, otherwise an error string.
212
d3821ab0 213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
3b0bdc72
UD
214 are set in BUFP on entry. */
215
216const char *
9dd346ff
JM
217re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
3b0bdc72
UD
219{
220 reg_errcode_t ret;
221
3b0bdc72
UD
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
c06a6956
UD
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
3b0bdc72
UD
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
75e4a282 230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
231
232 if (!ret)
233 return NULL;
6455d255 234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 235}
3b0bdc72 236weak_alias (__re_compile_pattern, re_compile_pattern)
3b0bdc72 237
d3821ab0 238/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
3b0bdc72
UD
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241/* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243reg_syntax_t re_syntax_options;
244
245
246/* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
249
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
252
253reg_syntax_t
9dd346ff 254re_set_syntax (reg_syntax_t syntax)
3b0bdc72
UD
255{
256 reg_syntax_t ret = re_syntax_options;
257
258 re_syntax_options = syntax;
259 return ret;
260}
3b0bdc72 261weak_alias (__re_set_syntax, re_set_syntax)
3b0bdc72
UD
262
263int
9dd346ff 264re_compile_fastmap (struct re_pattern_buffer *bufp)
3b0bdc72 265{
eb04c213 266 re_dfa_t *dfa = bufp->buffer;
3b0bdc72
UD
267 char *fastmap = bufp->fastmap;
268
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
278 return 0;
279}
3b0bdc72 280weak_alias (__re_compile_fastmap, re_compile_fastmap)
3b0bdc72 281
1b2c2628 282static inline void
d3821ab0
RM
283__attribute__ ((always_inline))
284re_set_fastmap (char *fastmap, bool icase, int ch)
1b2c2628
UD
285{
286 fastmap[ch] = 1;
287 if (icase)
288 fastmap[tolower (ch)] = 1;
289}
290
3b0bdc72
UD
291/* Helper function for re_compile_fastmap.
292 Compile fastmap for the initial_state INIT_STATE. */
293
294static void
0fd8ae9c
UD
295re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
296 char *fastmap)
3b0bdc72 297{
eb04c213
AZ
298 re_dfa_t *dfa = bufp->buffer;
299 Idx node_cnt;
300 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
301 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
302 {
eb04c213 303 Idx node = init_state->nodes.elems[node_cnt];
3b0bdc72 304 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
305
306 if (type == CHARACTER)
74e12fbc
UD
307 {
308 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309#ifdef RE_ENABLE_I18N
14744156 310 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 311 {
eb04c213
AZ
312 unsigned char buf[MB_LEN_MAX];
313 unsigned char *p;
74e12fbc
UD
314 wchar_t wc;
315 mbstate_t state;
316
317 p = buf;
318 *p++ = dfa->nodes[node].opr.c;
319 while (++node < dfa->nodes_len
320 && dfa->nodes[node].type == CHARACTER
321 && dfa->nodes[node].mb_partial)
322 *p++ = dfa->nodes[node].opr.c;
2c05d33f 323 memset (&state, '\0', sizeof (state));
b3918c7d
UD
324 if (__mbrtowc (&wc, (const char *) buf, p - buf,
325 &state) == p - buf
9dd6b779 326 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
88764ae2 327 != (size_t) -1))
eb04c213 328 re_set_fastmap (fastmap, false, buf[0]);
74e12fbc
UD
329 }
330#endif
331 }
3b0bdc72 332 else if (type == SIMPLE_BRACKET)
15a7d175 333 {
2c05d33f
UD
334 int i, ch;
335 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
336 {
337 int j;
338 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 if (w & ((bitset_word_t) 1 << j))
341 re_set_fastmap (fastmap, icase, ch);
342 }
15a7d175 343 }
c0a0f9a3 344#ifdef RE_ENABLE_I18N
3b0bdc72 345 else if (type == COMPLEX_BRACKET)
15a7d175 346 {
15a7d175 347 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
eb04c213 348 Idx i;
bdb56bac 349
c0a0f9a3 350# ifdef _LIBC
bdb56bac
UD
351 /* See if we have to try all bytes which start multiple collation
352 elements.
353 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 collation element, and don't catch 'b' since 'b' is
355 the only collation element which starts from 'b' (and
356 it is caught by SIMPLE_BRACKET). */
357 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 && (cset->ncoll_syms || cset->nranges))
15a7d175 359 {
15a7d175
UD
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2c05d33f
UD
362 for (i = 0; i < SBC_MAX; ++i)
363 if (table[i] < 0)
364 re_set_fastmap (fastmap, icase, i);
15a7d175 365 }
bdb56bac
UD
366# endif /* _LIBC */
367
368 /* See if we have to start the match at all multibyte characters,
369 i.e. where we would not find an invalid sequence. This only
370 applies to multibyte character sets; for single byte character
371 sets, the SIMPLE_BRACKET again suffices. */
372 if (dfa->mb_cur_max > 1
815d8147 373 && (cset->nchar_classes || cset->non_match || cset->nranges
bdb56bac
UD
374# ifdef _LIBC
375 || cset->nequiv_classes
376# endif /* _LIBC */
377 ))
15a7d175 378 {
bdb56bac
UD
379 unsigned char c = 0;
380 do
74e12fbc 381 {
bdb56bac
UD
382 mbstate_t mbs;
383 memset (&mbs, 0, sizeof (mbs));
384 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 re_set_fastmap (fastmap, false, (int) c);
74e12fbc 386 }
bdb56bac 387 while (++c != 0);
15a7d175 388 }
bdb56bac
UD
389
390 else
391 {
392 /* ... Else catch all bytes which can start the mbchars. */
393 for (i = 0; i < cset->nmbchars; ++i)
394 {
395 char buf[256];
396 mbstate_t state;
397 memset (&state, '\0', sizeof (state));
398 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
401 {
9dd6b779 402 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
bdb56bac
UD
403 != (size_t) -1)
404 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
405 }
aa732e2b
UD
406 }
407 }
15a7d175 408 }
c0a0f9a3 409#endif /* RE_ENABLE_I18N */
c0d5034e
UD
410 else if (type == OP_PERIOD
411#ifdef RE_ENABLE_I18N
412 || type == OP_UTF8_PERIOD
413#endif /* RE_ENABLE_I18N */
414 || type == END_OF_RE)
15a7d175
UD
415 {
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
420 }
3b0bdc72
UD
421 }
422}
423\f
424/* Entry point for POSIX code. */
425/* regcomp takes a regular expression as a string and compiles it.
426
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
429
d3821ab0
RM
430 'buffer' to the compiled pattern;
431 'used' to the length of the compiled pattern;
432 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
3b0bdc72
UD
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
d3821ab0
RM
435 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 'fastmap' to an allocated space for the fastmap;
437 'fastmap_accurate' to zero;
438 're_nsub' to the number of subexpressions in PATTERN.
3b0bdc72
UD
439
440 PATTERN is the address of the pattern string.
441
442 CFLAGS is a series of bits which affect compilation.
443
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
446
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
449
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
452
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
455 registers.
456
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
459
460int
c0feb731 461regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
3b0bdc72
UD
462{
463 reg_errcode_t ret;
464 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 465 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
466
467 preg->buffer = NULL;
468 preg->allocated = 0;
469 preg->used = 0;
470
471 /* Try to allocate space for the fastmap. */
472 preg->fastmap = re_malloc (char, SBC_MAX);
f4efbdfb 473 if (__glibc_unlikely (preg->fastmap == NULL))
3b0bdc72
UD
474 return REG_ESPACE;
475
476 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
477
478 /* If REG_NEWLINE is set, newlines are treated differently. */
479 if (cflags & REG_NEWLINE)
480 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
481 syntax &= ~RE_DOT_NEWLINE;
482 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483 /* It also changes the matching behavior. */
484 preg->newline_anchor = 1;
485 }
486 else
487 preg->newline_anchor = 0;
488 preg->no_sub = !!(cflags & REG_NOSUB);
489 preg->translate = NULL;
490
491 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492
493 /* POSIX doesn't distinguish between an unmatched open-group and an
494 unmatched close-group: both are REG_EPAREN. */
495 if (ret == REG_ERPAREN)
496 ret = REG_EPAREN;
497
a9388965 498 /* We have already checked preg->fastmap != NULL. */
f4efbdfb 499 if (__glibc_likely (ret == REG_NOERROR))
71ccd330 500 /* Compute the fastmap now, since regexec cannot modify the pattern
86576b62 501 buffer. This function never fails in this implementation. */
83b038f2 502 (void) re_compile_fastmap (preg);
71ccd330 503 else
3b0bdc72 504 {
71ccd330
UD
505 /* Some error occurred while compiling the expression. */
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
3b0bdc72
UD
508 }
509
510 return (int) ret;
511}
b68f8620 512libc_hidden_def (__regcomp)
3b0bdc72 513weak_alias (__regcomp, regcomp)
3b0bdc72
UD
514
515/* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
517
518size_t
c0feb731 519regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
9dd346ff 520 size_t errbuf_size)
3b0bdc72
UD
521{
522 const char *msg;
523 size_t msg_size;
f4efbdfb 524 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
3b0bdc72 525
f4efbdfb 526 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
3b0bdc72
UD
527 /* Only error codes returned by the rest of the code should be passed
528 to this routine. If we are given anything else, or if other regex
529 code generates an invalid error code, then the program has a bug.
530 Dump core so we can fix it. */
531 abort ();
532
6455d255 533 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
534
535 msg_size = strlen (msg) + 1; /* Includes the null. */
536
f4efbdfb 537 if (__glibc_likely (errbuf_size != 0))
3b0bdc72 538 {
eb04c213 539 size_t cpy_size = msg_size;
f4efbdfb 540 if (__glibc_unlikely (msg_size > errbuf_size))
15a7d175 541 {
eb04c213
AZ
542 cpy_size = errbuf_size - 1;
543 errbuf[cpy_size] = '\0';
15a7d175 544 }
eb04c213 545 memcpy (errbuf, msg, cpy_size);
3b0bdc72
UD
546 }
547
548 return msg_size;
549}
3b0bdc72 550weak_alias (__regerror, regerror)
3b0bdc72 551
3b0bdc72 552
e40a38b3
UD
553#ifdef RE_ENABLE_I18N
554/* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
2c05d33f 558static const bitset_t utf8_sb_map =
e40a38b3
UD
559{
560 /* Set the first 128 bits. */
c2a150d0 561# if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
2c05d33f 562 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
eb04c213
AZ
563# else
564# if 4 * BITSET_WORD_BITS < ASCII_CHARS
565# error "bitset_word_t is narrower than 32 bits"
566# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX, BITSET_WORD_MAX,
570# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571 BITSET_WORD_MAX,
572# endif
573 (BITSET_WORD_MAX
574 >> (SBC_MAX % BITSET_WORD_BITS == 0
575 ? 0
576 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577# endif
e40a38b3
UD
578};
579#endif
580
581
71ccd330
UD
582static void
583free_dfa_content (re_dfa_t *dfa)
3b0bdc72 584{
eb04c213 585 Idx i, j;
3b0bdc72 586
ee70274a
UD
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
02f3550c 589 free_token (dfa->nodes + i);
71ccd330
UD
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
592 {
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
599 }
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
3b0bdc72 604
ee70274a
UD
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
607 {
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
610 {
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
613 }
21f5de55 614 re_free (entry->array);
ee70274a 615 }
71ccd330 616 re_free (dfa->state_table);
65e6becf 617#ifdef RE_ENABLE_I18N
e40a38b3
UD
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
a96c63ed 620#endif
c06a6956 621 re_free (dfa->subexp_map);
0742e48e 622#ifdef DEBUG
71ccd330 623 re_free (dfa->re_str);
0742e48e 624#endif
71ccd330
UD
625
626 re_free (dfa);
627}
628
629
630/* Free dynamically allocated space used by PREG. */
631
632void
9dd346ff 633regfree (regex_t *preg)
71ccd330 634{
eb04c213 635 re_dfa_t *dfa = preg->buffer;
f4efbdfb 636 if (__glibc_likely (dfa != NULL))
eb04c213
AZ
637 {
638 lock_fini (dfa->lock);
639 free_dfa_content (dfa);
640 }
86576b62
UD
641 preg->buffer = NULL;
642 preg->allocated = 0;
71ccd330 643
3b0bdc72 644 re_free (preg->fastmap);
86576b62
UD
645 preg->fastmap = NULL;
646
647 re_free (preg->translate);
648 preg->translate = NULL;
3b0bdc72 649}
b68f8620 650libc_hidden_def (__regfree)
3b0bdc72 651weak_alias (__regfree, regfree)
3b0bdc72
UD
652\f
653/* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
655
656#if defined _REGEX_RE_COMP || defined _LIBC
657
658/* BSD has one and only one pattern buffer. */
659static struct re_pattern_buffer re_comp_buf;
660
661char *
662# ifdef _LIBC
663/* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
666weak_function
667# endif
80d9be81 668re_comp (const char *s)
3b0bdc72
UD
669{
670 reg_errcode_t ret;
240e87c2 671 char *fastmap;
3b0bdc72
UD
672
673 if (!s)
674 {
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
677 return 0;
678 }
679
240e87c2
RM
680 if (re_comp_buf.buffer)
681 {
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
1b2c2628 685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
686 re_comp_buf.fastmap = fastmap;
687 }
688
689 if (re_comp_buf.fastmap == NULL)
3b0bdc72 690 {
eb04c213 691 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
3b0bdc72 692 if (re_comp_buf.fastmap == NULL)
6455d255
UD
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
695 }
696
d3821ab0 697 /* Since 're_exec' always passes NULL for the 'regs' argument, we
3b0bdc72
UD
698 don't need to initialize the pattern buffer fields which affect it. */
699
700 /* Match anchors at newlines. */
701 re_comp_buf.newline_anchor = 1;
702
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704
705 if (!ret)
706 return NULL;
707
eb04c213 708 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
6455d255 709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 710}
240e87c2
RM
711
712#ifdef _LIBC
88677348
AZN
713void
714__libc_regcomp_freemem (void)
240e87c2
RM
715{
716 __regfree (&re_comp_buf);
717}
240e87c2
RM
718#endif
719
3b0bdc72
UD
720#endif /* _REGEX_RE_COMP */
721\f
722/* Internal entry point.
723 Compile the regular expression PATTERN, whose length is LENGTH.
724 SYNTAX indicate regular expression's syntax. */
725
726static reg_errcode_t
0fd8ae9c
UD
727re_compile_internal (regex_t *preg, const char * pattern, size_t length,
728 reg_syntax_t syntax)
3b0bdc72
UD
729{
730 reg_errcode_t err = REG_NOERROR;
731 re_dfa_t *dfa;
732 re_string_t regexp;
733
734 /* Initialize the pattern buffer. */
735 preg->fastmap_accurate = 0;
736 preg->syntax = syntax;
737 preg->not_bol = preg->not_eol = 0;
738 preg->used = 0;
739 preg->re_nsub = 0;
1b2c2628
UD
740 preg->can_be_null = 0;
741 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
742
743 /* Initialize the dfa. */
eb04c213 744 dfa = preg->buffer;
f4efbdfb 745 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
3b0bdc72
UD
746 {
747 /* If zero allocated, but buffer is non-null, try to realloc
748 enough space. This loses if buffer's address is bogus, but
749 that is the user's responsibility. If ->buffer is NULL this
750 is a simple allocation. */
751 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
752 if (dfa == NULL)
753 return REG_ESPACE;
3b0bdc72 754 preg->allocated = sizeof (re_dfa_t);
eb04c213 755 preg->buffer = dfa;
3b0bdc72 756 }
3b0bdc72
UD
757 preg->used = sizeof (re_dfa_t);
758
759 err = init_dfa (dfa, length);
f4efbdfb 760 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
eb04c213 761 err = REG_ESPACE;
f4efbdfb 762 if (__glibc_unlikely (err != REG_NOERROR))
3b0bdc72 763 {
ee70274a 764 free_dfa_content (dfa);
3b0bdc72 765 preg->buffer = NULL;
ca3c505e 766 preg->allocated = 0;
3b0bdc72
UD
767 return err;
768 }
0742e48e 769#ifdef DEBUG
01ed6ceb 770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
0742e48e
UD
771 dfa->re_str = re_malloc (char, length + 1);
772 strncpy (dfa->re_str, pattern, length + 1);
773#endif
3b0bdc72 774
612546c6 775 err = re_string_construct (&regexp, pattern, length, preg->translate,
eb04c213 776 (syntax & RE_ICASE) != 0, dfa);
f4efbdfb 777 if (__glibc_unlikely (err != REG_NOERROR))
3b0bdc72 778 {
ee70274a
UD
779 re_compile_internal_free_return:
780 free_workarea_compile (preg);
781 re_string_destruct (&regexp);
eb04c213 782 lock_fini (dfa->lock);
ee70274a 783 free_dfa_content (dfa);
3b0bdc72 784 preg->buffer = NULL;
ca3c505e 785 preg->allocated = 0;
3b0bdc72
UD
786 return err;
787 }
788
789 /* Parse the regular expression, and build a structure tree. */
790 preg->re_nsub = 0;
791 dfa->str_tree = parse (&regexp, preg, syntax, &err);
f4efbdfb 792 if (__glibc_unlikely (dfa->str_tree == NULL))
3b0bdc72
UD
793 goto re_compile_internal_free_return;
794
02f3550c
UD
795 /* Analyze the tree and create the nfa. */
796 err = analyze (preg);
f4efbdfb 797 if (__glibc_unlikely (err != REG_NOERROR))
02f3550c
UD
798 goto re_compile_internal_free_return;
799
14744156
UD
800#ifdef RE_ENABLE_I18N
801 /* If possible, do searching in single byte encoding to speed things up. */
ad7f28c2 802 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
14744156
UD
803 optimize_utf8 (dfa);
804#endif
805
3b0bdc72
UD
806 /* Then create the initial state of the dfa. */
807 err = create_initial_state (dfa);
3b0bdc72 808
3b0bdc72
UD
809 /* Release work areas. */
810 free_workarea_compile (preg);
811 re_string_destruct (&regexp);
812
f4efbdfb 813 if (__glibc_unlikely (err != REG_NOERROR))
71ccd330 814 {
eb04c213 815 lock_fini (dfa->lock);
71ccd330
UD
816 free_dfa_content (dfa);
817 preg->buffer = NULL;
ca3c505e 818 preg->allocated = 0;
71ccd330
UD
819 }
820
3b0bdc72
UD
821 return err;
822}
823
824/* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
826
827static reg_errcode_t
0fd8ae9c 828init_dfa (re_dfa_t *dfa, size_t pat_len)
3b0bdc72 829{
eb04c213 830 __re_size_t table_size;
e40a38b3 831#ifndef _LIBC
eb04c213
AZ
832 const char *codeset_name;
833#endif
834#ifdef RE_ENABLE_I18N
835 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
836#else
837 size_t max_i18n_object_size = 0;
e40a38b3 838#endif
eb04c213
AZ
839 size_t max_object_size =
840 MAX (sizeof (struct re_state_table_entry),
841 MAX (sizeof (re_token_t),
842 MAX (sizeof (re_node_set),
843 MAX (sizeof (regmatch_t),
844 max_i18n_object_size))));
81c64d40
UD
845
846 memset (dfa, '\0', sizeof (re_dfa_t));
847
ee70274a
UD
848 /* Force allocation of str_tree_storage the first time. */
849 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
850
eb04c213
AZ
851 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
852 calculation below, and for similar doubling calculations
853 elsewhere. And it's <= rather than <, because some of the
854 doubling calculations add 1 afterwards. */
f4efbdfb
PE
855 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
856 <= pat_len))
01ed6ceb
UD
857 return REG_ESPACE;
858
3b0bdc72
UD
859 dfa->nodes_alloc = pat_len + 1;
860 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
861
3b0bdc72 862 /* table_size = 2 ^ ceil(log pat_len) */
01ed6ceb 863 for (table_size = 1; ; table_size <<= 1)
3b0bdc72
UD
864 if (table_size > pat_len)
865 break;
866
867 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
868 dfa->state_hash_mask = table_size - 1;
869
3c0fb574
UD
870 dfa->mb_cur_max = MB_CUR_MAX;
871#ifdef _LIBC
bb3f4825 872 if (dfa->mb_cur_max == 6
3c0fb574
UD
873 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
874 dfa->is_utf8 = 1;
f0c7c524
UD
875 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
876 != 0);
e40a38b3 877#else
e40a38b3 878 codeset_name = nl_langinfo (CODESET);
eb04c213
AZ
879 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
880 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
881 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
882 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
e40a38b3
UD
883 dfa->is_utf8 = 1;
884
885 /* We check exhaustively in the loop below if this charset is a
886 superset of ASCII. */
887 dfa->map_notascii = 0;
3c0fb574 888#endif
e40a38b3 889
65e6becf
UD
890#ifdef RE_ENABLE_I18N
891 if (dfa->mb_cur_max > 1)
892 {
65e6becf 893 if (dfa->is_utf8)
e40a38b3 894 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
65e6becf 895 else
e40a38b3
UD
896 {
897 int i, j, ch;
898
2c05d33f 899 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
f4efbdfb 900 if (__glibc_unlikely (dfa->sb_char == NULL))
e40a38b3
UD
901 return REG_ESPACE;
902
2c05d33f
UD
903 /* Set the bits corresponding to single byte chars. */
904 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
905 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
e40a38b3 906 {
2d87db5b 907 wint_t wch = __btowc (ch);
e40a38b3 908 if (wch != WEOF)
2c05d33f 909 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
e40a38b3 910# ifndef _LIBC
2d87db5b 911 if (isascii (ch) && wch != ch)
e40a38b3
UD
912 dfa->map_notascii = 1;
913# endif
914 }
915 }
65e6becf
UD
916 }
917#endif
3c0fb574 918
f4efbdfb 919 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
ee70274a 920 return REG_ESPACE;
3b0bdc72
UD
921 return REG_NOERROR;
922}
923
924/* Initialize WORD_CHAR table, which indicate which character is
925 "word". In this case "word" means that it is the word construction
926 character used by some operators like "\<", "\>", etc. */
927
56b168be 928static void
0fd8ae9c 929init_word_char (re_dfa_t *dfa)
3b0bdc72 930{
9f115170 931 int i = 0;
eb04c213 932 int j;
9f115170 933 int ch = 0;
eb04c213 934 dfa->word_ops_used = 1;
f4efbdfb 935 if (__glibc_likely (dfa->map_notascii == 0))
9f115170 936 {
0285e6bd
PE
937 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
938 them, an issue when this code is used in Gnulib. */
567d8c1f
PE
939 bitset_word_t bits0 = 0x00000000;
940 bitset_word_t bits1 = 0x03ff0000;
941 bitset_word_t bits2 = 0x87fffffe;
942 bitset_word_t bits3 = 0x07fffffe;
943 if (BITSET_WORD_BITS == 64)
9f115170 944 {
0285e6bd 945 /* Pacify gcc -Woverflow on 32-bit platformns. */
567d8c1f
PE
946 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
947 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
9f115170
UD
948 i = 2;
949 }
567d8c1f 950 else if (BITSET_WORD_BITS == 32)
9f115170 951 {
567d8c1f
PE
952 dfa->word_char[0] = bits0;
953 dfa->word_char[1] = bits1;
954 dfa->word_char[2] = bits2;
955 dfa->word_char[3] = bits3;
9f115170
UD
956 i = 4;
957 }
958 else
567d8c1f 959 goto general_case;
9f115170
UD
960 ch = 128;
961
f4efbdfb 962 if (__glibc_likely (dfa->is_utf8))
9f115170
UD
963 {
964 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
965 return;
966 }
967 }
968
567d8c1f 969 general_case:
9f115170 970 for (; i < BITSET_WORDS; ++i)
eb04c213 971 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3b0bdc72 972 if (isalnum (ch) || ch == '_')
2c05d33f 973 dfa->word_char[i] |= (bitset_word_t) 1 << j;
3b0bdc72
UD
974}
975
976/* Free the work area which are only used while compiling. */
977
978static void
0fd8ae9c 979free_workarea_compile (regex_t *preg)
3b0bdc72 980{
eb04c213 981 re_dfa_t *dfa = preg->buffer;
ee70274a
UD
982 bin_tree_storage_t *storage, *next;
983 for (storage = dfa->str_tree_storage; storage; storage = next)
984 {
985 next = storage->next;
986 re_free (storage);
987 }
988 dfa->str_tree_storage = NULL;
989 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3b0bdc72 990 dfa->str_tree = NULL;
a7d5c291
UD
991 re_free (dfa->org_indices);
992 dfa->org_indices = NULL;
3b0bdc72
UD
993}
994
995/* Create initial states for all contexts. */
996
997static reg_errcode_t
0fd8ae9c 998create_initial_state (re_dfa_t *dfa)
3b0bdc72 999{
eb04c213 1000 Idx first, i;
3b0bdc72
UD
1001 reg_errcode_t err;
1002 re_node_set init_nodes;
1003
1004 /* Initial states have the epsilon closure of the node which is
1005 the first node of the regular expression. */
02f3550c 1006 first = dfa->str_tree->first->node_idx;
3b0bdc72
UD
1007 dfa->init_node = first;
1008 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
f4efbdfb 1009 if (__glibc_unlikely (err != REG_NOERROR))
3b0bdc72
UD
1010 return err;
1011
1012 /* The back-references which are in initial states can epsilon transit,
1013 since in this case all of the subexpressions can be null.
1014 Then we add epsilon closures of the nodes which are the next nodes of
1015 the back-references. */
1016 if (dfa->nbackref > 0)
1017 for (i = 0; i < init_nodes.nelem; ++i)
1018 {
eb04c213 1019 Idx node_idx = init_nodes.elems[i];
15a7d175
UD
1020 re_token_type_t type = dfa->nodes[node_idx].type;
1021
eb04c213 1022 Idx clexp_idx;
15a7d175
UD
1023 if (type != OP_BACK_REF)
1024 continue;
1025 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1026 {
1027 re_token_t *clexp_node;
1028 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1029 if (clexp_node->type == OP_CLOSE_SUBEXP
ae73c6c1 1030 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
15a7d175
UD
1031 break;
1032 }
1033 if (clexp_idx == init_nodes.nelem)
1034 continue;
1035
1036 if (type == OP_BACK_REF)
1037 {
eb04c213 1038 Idx dest_idx = dfa->edests[node_idx].elems[0];
15a7d175
UD
1039 if (!re_node_set_contains (&init_nodes, dest_idx))
1040 {
eb04c213
AZ
1041 reg_errcode_t merge_err
1042 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1043 if (merge_err != REG_NOERROR)
1044 return merge_err;
15a7d175
UD
1045 i = 0;
1046 }
1047 }
3b0bdc72
UD
1048 }
1049
1050 /* It must be the first time to invoke acquire_state. */
a9388965
UD
1051 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1052 /* We don't check ERR here, since the initial state must not be NULL. */
f4efbdfb 1053 if (__glibc_unlikely (dfa->init_state == NULL))
a9388965 1054 return err;
3b0bdc72
UD
1055 if (dfa->init_state->has_constraint)
1056 {
a9388965 1057 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1058 CONTEXT_WORD);
a9388965 1059 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1060 CONTEXT_NEWLINE);
a9388965 1061 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
1062 &init_nodes,
1063 CONTEXT_NEWLINE
1064 | CONTEXT_BEGBUF);
f4efbdfb
PE
1065 if (__glibc_unlikely (dfa->init_state_word == NULL
1066 || dfa->init_state_nl == NULL
1067 || dfa->init_state_begbuf == NULL))
15a7d175 1068 return err;
3b0bdc72
UD
1069 }
1070 else
1071 dfa->init_state_word = dfa->init_state_nl
1072 = dfa->init_state_begbuf = dfa->init_state;
1073
3b0bdc72
UD
1074 re_node_set_free (&init_nodes);
1075 return REG_NOERROR;
1076}
1077\f
14744156
UD
1078#ifdef RE_ENABLE_I18N
1079/* If it is possible to do searching in single byte encoding instead of UTF-8
1080 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1081 DFA nodes where needed. */
1082
1083static void
0fd8ae9c 1084optimize_utf8 (re_dfa_t *dfa)
14744156 1085{
eb04c213
AZ
1086 Idx node;
1087 int i;
1088 bool mb_chars = false;
1089 bool has_period = false;
14744156
UD
1090
1091 for (node = 0; node < dfa->nodes_len; ++node)
1092 switch (dfa->nodes[node].type)
1093 {
1094 case CHARACTER:
eb04c213
AZ
1095 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1096 mb_chars = true;
14744156
UD
1097 break;
1098 case ANCHOR:
bc3e1c12 1099 switch (dfa->nodes[node].opr.ctx_type)
14744156
UD
1100 {
1101 case LINE_FIRST:
1102 case LINE_LAST:
1103 case BUF_FIRST:
1104 case BUF_LAST:
1105 break;
1106 default:
0caca71a
UD
1107 /* Word anchors etc. cannot be handled. It's okay to test
1108 opr.ctx_type since constraints (for all DFA nodes) are
1109 created by ORing one or more opr.ctx_type values. */
14744156
UD
1110 return;
1111 }
1112 break;
ad7f28c2 1113 case OP_PERIOD:
eb04c213 1114 has_period = true;
21f5de55 1115 break;
14744156
UD
1116 case OP_BACK_REF:
1117 case OP_ALT:
1118 case END_OF_RE:
14744156 1119 case OP_DUP_ASTERISK:
14744156
UD
1120 case OP_OPEN_SUBEXP:
1121 case OP_CLOSE_SUBEXP:
1122 break;
02f3550c
UD
1123 case COMPLEX_BRACKET:
1124 return;
a8067e8f 1125 case SIMPLE_BRACKET:
eb04c213
AZ
1126 /* Just double check. */
1127 {
1128 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1129 ? 0
1130 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1131 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1132 {
1133 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1134 return;
1135 rshift = 0;
1136 }
1137 }
a8067e8f 1138 break;
14744156 1139 default:
02f3550c 1140 abort ();
14744156
UD
1141 }
1142
ad7f28c2 1143 if (mb_chars || has_period)
5f93cd52 1144 for (node = 0; node < dfa->nodes_len; ++node)
ad7f28c2
UD
1145 {
1146 if (dfa->nodes[node].type == CHARACTER
eb04c213 1147 && dfa->nodes[node].opr.c >= ASCII_CHARS)
ad7f28c2
UD
1148 dfa->nodes[node].mb_partial = 0;
1149 else if (dfa->nodes[node].type == OP_PERIOD)
1150 dfa->nodes[node].type = OP_UTF8_PERIOD;
1151 }
5f93cd52 1152
14744156
UD
1153 /* The search can be in single byte locale. */
1154 dfa->mb_cur_max = 1;
1155 dfa->is_utf8 = 0;
ad7f28c2 1156 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
14744156
UD
1157}
1158#endif
1159\f
3b0bdc72
UD
1160/* Analyze the structure tree, and calculate "first", "next", "edest",
1161 "eclosure", and "inveclosure". */
1162
1163static reg_errcode_t
0fd8ae9c 1164analyze (regex_t *preg)
3b0bdc72 1165{
eb04c213 1166 re_dfa_t *dfa = preg->buffer;
3b0bdc72
UD
1167 reg_errcode_t ret;
1168
1169 /* Allocate arrays. */
eb04c213
AZ
1170 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1171 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
3b0bdc72
UD
1172 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1173 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
f4efbdfb
PE
1174 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1175 || dfa->edests == NULL || dfa->eclosures == NULL))
3b0bdc72 1176 return REG_ESPACE;
02f3550c 1177
eb04c213 1178 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
02f3550c 1179 if (dfa->subexp_map != NULL)
3b0bdc72 1180 {
eb04c213 1181 Idx i;
02f3550c
UD
1182 for (i = 0; i < preg->re_nsub; i++)
1183 dfa->subexp_map[i] = i;
1184 preorder (dfa->str_tree, optimize_subexps, dfa);
1185 for (i = 0; i < preg->re_nsub; i++)
1186 if (dfa->subexp_map[i] != i)
1187 break;
1188 if (i == preg->re_nsub)
1189 {
eb04c213 1190 re_free (dfa->subexp_map);
02f3550c
UD
1191 dfa->subexp_map = NULL;
1192 }
3b0bdc72
UD
1193 }
1194
02f3550c 1195 ret = postorder (dfa->str_tree, lower_subexps, preg);
f4efbdfb 1196 if (__glibc_unlikely (ret != REG_NOERROR))
02f3550c
UD
1197 return ret;
1198 ret = postorder (dfa->str_tree, calc_first, dfa);
f4efbdfb 1199 if (__glibc_unlikely (ret != REG_NOERROR))
02f3550c
UD
1200 return ret;
1201 preorder (dfa->str_tree, calc_next, dfa);
1202 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
f4efbdfb 1203 if (__glibc_unlikely (ret != REG_NOERROR))
02f3550c
UD
1204 return ret;
1205 ret = calc_eclosure (dfa);
f4efbdfb 1206 if (__glibc_unlikely (ret != REG_NOERROR))
02f3550c 1207 return ret;
963d8d78
UD
1208
1209 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1210 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1211 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1212 || dfa->nbackref)
1213 {
1214 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
f4efbdfb 1215 if (__glibc_unlikely (dfa->inveclosures == NULL))
21f5de55 1216 return REG_ESPACE;
963d8d78
UD
1217 ret = calc_inveclosure (dfa);
1218 }
1219
02f3550c
UD
1220 return ret;
1221}
1222
1223/* Our parse trees are very unbalanced, so we cannot use a stack to
1224 implement parse tree visits. Instead, we use parent pointers and
1225 some hairy code in these two functions. */
1226static reg_errcode_t
0fd8ae9c
UD
1227postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1228 void *extra)
02f3550c
UD
1229{
1230 bin_tree_t *node, *prev;
1231
1232 for (node = root; ; )
3b0bdc72 1233 {
02f3550c
UD
1234 /* Descend down the tree, preferably to the left (or to the right
1235 if that's the only child). */
1236 while (node->left || node->right)
1237 if (node->left)
21f5de55
PE
1238 node = node->left;
1239 else
1240 node = node->right;
02f3550c
UD
1241
1242 do
1243 {
1244 reg_errcode_t err = fn (extra, node);
f4efbdfb 1245 if (__glibc_unlikely (err != REG_NOERROR))
02f3550c 1246 return err;
21f5de55 1247 if (node->parent == NULL)
02f3550c
UD
1248 return REG_NOERROR;
1249 prev = node;
1250 node = node->parent;
1251 }
1252 /* Go up while we have a node that is reached from the right. */
1253 while (node->right == prev || node->right == NULL);
1254 node = node->right;
3b0bdc72 1255 }
3b0bdc72
UD
1256}
1257
02f3550c 1258static reg_errcode_t
0fd8ae9c
UD
1259preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1260 void *extra)
02f3550c
UD
1261{
1262 bin_tree_t *node;
1263
1264 for (node = root; ; )
1265 {
1266 reg_errcode_t err = fn (extra, node);
f4efbdfb 1267 if (__glibc_unlikely (err != REG_NOERROR))
02f3550c 1268 return err;
0ecb606c 1269
02f3550c
UD
1270 /* Go to the left node, or up and to the right. */
1271 if (node->left)
1272 node = node->left;
1273 else
1274 {
1275 bin_tree_t *prev = NULL;
1276 while (node->right == prev || node->right == NULL)
1277 {
1278 prev = node;
1279 node = node->parent;
1280 if (!node)
21f5de55 1281 return REG_NOERROR;
02f3550c
UD
1282 }
1283 node = node->right;
1284 }
1285 }
1286}
1287
1288/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1289 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1290 backreferences as well. Requires a preorder visit. */
0ecb606c 1291static reg_errcode_t
0fd8ae9c 1292optimize_subexps (void *extra, bin_tree_t *node)
0ecb606c 1293{
02f3550c 1294 re_dfa_t *dfa = (re_dfa_t *) extra;
0ecb606c 1295
02f3550c 1296 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3b0bdc72 1297 {
02f3550c
UD
1298 int idx = node->token.opr.idx;
1299 node->token.opr.idx = dfa->subexp_map[idx];
1300 dfa->used_bkref_map |= 1 << node->token.opr.idx;
0ecb606c 1301 }
02f3550c
UD
1302
1303 else if (node->token.type == SUBEXP
21f5de55 1304 && node->left && node->left->token.type == SUBEXP)
0ecb606c 1305 {
eb04c213 1306 Idx other_idx = node->left->token.opr.idx;
02f3550c
UD
1307
1308 node->left = node->left->left;
1309 if (node->left)
21f5de55 1310 node->left->parent = node;
02f3550c
UD
1311
1312 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
2c05d33f 1313 if (other_idx < BITSET_WORD_BITS)
eb04c213 1314 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3b0bdc72 1315 }
02f3550c 1316
3b0bdc72
UD
1317 return REG_NOERROR;
1318}
1319
02f3550c
UD
1320/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1321 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1322static reg_errcode_t
0fd8ae9c 1323lower_subexps (void *extra, bin_tree_t *node)
3b0bdc72 1324{
02f3550c
UD
1325 regex_t *preg = (regex_t *) extra;
1326 reg_errcode_t err = REG_NOERROR;
3b0bdc72 1327
02f3550c 1328 if (node->left && node->left->token.type == SUBEXP)
0ecb606c 1329 {
02f3550c
UD
1330 node->left = lower_subexp (&err, preg, node->left);
1331 if (node->left)
1332 node->left->parent = node;
1333 }
1334 if (node->right && node->right->token.type == SUBEXP)
1335 {
1336 node->right = lower_subexp (&err, preg, node->right);
1337 if (node->right)
1338 node->right->parent = node;
3b0bdc72 1339 }
3b0bdc72 1340
02f3550c
UD
1341 return err;
1342}
3b0bdc72 1343
02f3550c 1344static bin_tree_t *
0fd8ae9c 1345lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
0ecb606c 1346{
eb04c213 1347 re_dfa_t *dfa = preg->buffer;
02f3550c
UD
1348 bin_tree_t *body = node->left;
1349 bin_tree_t *op, *cls, *tree1, *tree;
1350
1351 if (preg->no_sub
744eb12b
UD
1352 /* We do not optimize empty subexpressions, because otherwise we may
1353 have bad CONCAT nodes with NULL children. This is obviously not
1354 very common, so we do not lose much. An example that triggers
1355 this case is the sed "script" /\(\)/x. */
1356 && node->left != NULL
2c05d33f
UD
1357 && (node->token.opr.idx >= BITSET_WORD_BITS
1358 || !(dfa->used_bkref_map
1359 & ((bitset_word_t) 1 << node->token.opr.idx))))
02f3550c
UD
1360 return node->left;
1361
1362 /* Convert the SUBEXP node to the concatenation of an
1363 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1364 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1365 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1366 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1367 tree = create_tree (dfa, op, tree1, CONCAT);
f4efbdfb
PE
1368 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1369 || op == NULL || cls == NULL))
0ecb606c 1370 {
02f3550c
UD
1371 *err = REG_ESPACE;
1372 return NULL;
0ecb606c 1373 }
3b0bdc72 1374
02f3550c
UD
1375 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1376 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1377 return tree;
1378}
a334319f 1379
02f3550c
UD
1380/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1381 nodes. Requires a postorder visit. */
1382static reg_errcode_t
0fd8ae9c 1383calc_first (void *extra, bin_tree_t *node)
02f3550c
UD
1384{
1385 re_dfa_t *dfa = (re_dfa_t *) extra;
1386 if (node->token.type == CONCAT)
1387 {
1388 node->first = node->left->first;
1389 node->node_idx = node->left->node_idx;
1390 }
1391 else
1392 {
1393 node->first = node;
1394 node->node_idx = re_dfa_add_node (dfa, node->token);
f4efbdfb 1395 if (__glibc_unlikely (node->node_idx == -1))
21f5de55 1396 return REG_ESPACE;
0caca71a 1397 if (node->token.type == ANCHOR)
21f5de55 1398 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
02f3550c
UD
1399 }
1400 return REG_NOERROR;
1401}
1402
1403/* Pass 2: compute NEXT on the tree. Preorder visit. */
1404static reg_errcode_t
0fd8ae9c 1405calc_next (void *extra, bin_tree_t *node)
02f3550c
UD
1406{
1407 switch (node->token.type)
3b0bdc72
UD
1408 {
1409 case OP_DUP_ASTERISK:
02f3550c 1410 node->left->next = node;
3b0bdc72
UD
1411 break;
1412 case CONCAT:
02f3550c
UD
1413 node->left->next = node->right->first;
1414 node->right->next = node->next;
1415 break;
3b0bdc72 1416 default:
02f3550c
UD
1417 if (node->left)
1418 node->left->next = node->next;
1419 if (node->right)
21f5de55 1420 node->right->next = node->next;
3b0bdc72
UD
1421 break;
1422 }
02f3550c 1423 return REG_NOERROR;
3b0bdc72
UD
1424}
1425
02f3550c
UD
1426/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1427static reg_errcode_t
0fd8ae9c 1428link_nfa_nodes (void *extra, bin_tree_t *node)
a334319f 1429{
02f3550c 1430 re_dfa_t *dfa = (re_dfa_t *) extra;
eb04c213 1431 Idx idx = node->node_idx;
02f3550c
UD
1432 reg_errcode_t err = REG_NOERROR;
1433
1434 switch (node->token.type)
3b0bdc72 1435 {
02f3550c
UD
1436 case CONCAT:
1437 break;
1438
1439 case END_OF_RE:
2a0356e1 1440 DEBUG_ASSERT (node->next == NULL);
02f3550c
UD
1441 break;
1442
1443 case OP_DUP_ASTERISK:
1444 case OP_ALT:
1445 {
eb04c213 1446 Idx left, right;
02f3550c
UD
1447 dfa->has_plural_match = 1;
1448 if (node->left != NULL)
1449 left = node->left->first->node_idx;
1450 else
1451 left = node->next->node_idx;
1452 if (node->right != NULL)
1453 right = node->right->first->node_idx;
1454 else
1455 right = node->next->node_idx;
2a0356e1
AZ
1456 DEBUG_ASSERT (left > -1);
1457 DEBUG_ASSERT (right > -1);
02f3550c
UD
1458 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1459 }
1460 break;
1461
1462 case ANCHOR:
1463 case OP_OPEN_SUBEXP:
1464 case OP_CLOSE_SUBEXP:
1465 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1466 break;
1467
1468 case OP_BACK_REF:
1469 dfa->nexts[idx] = node->next->node_idx;
1470 if (node->token.type == OP_BACK_REF)
2da42bc0 1471 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
02f3550c
UD
1472 break;
1473
1474 default:
2a0356e1 1475 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
02f3550c
UD
1476 dfa->nexts[idx] = node->next->node_idx;
1477 break;
3b0bdc72 1478 }
02f3550c
UD
1479
1480 return err;
3b0bdc72
UD
1481}
1482
485d775d
UD
1483/* Duplicate the epsilon closure of the node ROOT_NODE.
1484 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1485 to their own constraint. */
1486
1487static reg_errcode_t
eb04c213
AZ
1488duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1489 Idx root_node, unsigned int init_constraint)
485d775d 1490{
eb04c213
AZ
1491 Idx org_node, clone_node;
1492 bool ok;
485d775d
UD
1493 unsigned int constraint = init_constraint;
1494 for (org_node = top_org_node, clone_node = top_clone_node;;)
1495 {
eb04c213 1496 Idx org_dest, clone_dest;
485d775d 1497 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1498 {
485d775d
UD
1499 /* If the back reference epsilon-transit, its destination must
1500 also have the constraint. Then duplicate the epsilon closure
1501 of the destination of the back reference, and store it in
1502 edests of the back reference. */
15a7d175
UD
1503 org_dest = dfa->nexts[org_node];
1504 re_node_set_empty (dfa->edests + clone_node);
2d87db5b 1505 clone_dest = duplicate_node (dfa, org_dest, constraint);
f4efbdfb 1506 if (__glibc_unlikely (clone_dest == -1))
2d87db5b 1507 return REG_ESPACE;
15a7d175 1508 dfa->nexts[clone_node] = dfa->nexts[org_node];
eb04c213 1509 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1510 if (__glibc_unlikely (! ok))
15a7d175
UD
1511 return REG_ESPACE;
1512 }
485d775d 1513 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1514 {
485d775d
UD
1515 /* In case of the node can't epsilon-transit, don't duplicate the
1516 destination and store the original destination as the
1517 destination of the node. */
15a7d175
UD
1518 dfa->nexts[clone_node] = dfa->nexts[org_node];
1519 break;
1520 }
485d775d 1521 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1522 {
485d775d
UD
1523 /* In case of the node can epsilon-transit, and it has only one
1524 destination. */
15a7d175
UD
1525 org_dest = dfa->edests[org_node].elems[0];
1526 re_node_set_empty (dfa->edests + clone_node);
d3821ab0 1527 /* If the node is root_node itself, it means the epsilon closure
eb04c213 1528 has a loop. Then tie it to the destination of the root_node. */
0caca71a 1529 if (org_node == root_node && clone_node != org_node)
15a7d175 1530 {
eb04c213 1531 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
f4efbdfb 1532 if (__glibc_unlikely (! ok))
eb04c213 1533 return REG_ESPACE;
0caca71a 1534 break;
15a7d175 1535 }
d3821ab0 1536 /* In case the node has another constraint, append it. */
0caca71a 1537 constraint |= dfa->nodes[org_node].constraint;
2d87db5b 1538 clone_dest = duplicate_node (dfa, org_dest, constraint);
f4efbdfb 1539 if (__glibc_unlikely (clone_dest == -1))
2d87db5b 1540 return REG_ESPACE;
eb04c213 1541 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1542 if (__glibc_unlikely (! ok))
15a7d175
UD
1543 return REG_ESPACE;
1544 }
485d775d 1545 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1546 {
485d775d 1547 /* In case of the node can epsilon-transit, and it has two
02f3550c 1548 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
15a7d175
UD
1549 org_dest = dfa->edests[org_node].elems[0];
1550 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1551 /* Search for a duplicated node which satisfies the constraint. */
1552 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1553 if (clone_dest == -1)
1554 {
0caca71a 1555 /* There is no such duplicated node, create a new one. */
2d87db5b
UD
1556 reg_errcode_t err;
1557 clone_dest = duplicate_node (dfa, org_dest, constraint);
f4efbdfb 1558 if (__glibc_unlikely (clone_dest == -1))
2d87db5b 1559 return REG_ESPACE;
eb04c213 1560 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1561 if (__glibc_unlikely (! ok))
7de66108
UD
1562 return REG_ESPACE;
1563 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1564 root_node, constraint);
f4efbdfb 1565 if (__glibc_unlikely (err != REG_NOERROR))
7de66108 1566 return err;
a7d5c291
UD
1567 }
1568 else
1569 {
0caca71a 1570 /* There is a duplicated node which satisfies the constraint,
a7d5c291 1571 use it to avoid infinite loop. */
eb04c213 1572 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1573 if (__glibc_unlikely (! ok))
a7d5c291
UD
1574 return REG_ESPACE;
1575 }
15a7d175
UD
1576
1577 org_dest = dfa->edests[org_node].elems[1];
2d87db5b 1578 clone_dest = duplicate_node (dfa, org_dest, constraint);
f4efbdfb 1579 if (__glibc_unlikely (clone_dest == -1))
2d87db5b 1580 return REG_ESPACE;
eb04c213 1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
f4efbdfb 1582 if (__glibc_unlikely (! ok))
15a7d175
UD
1583 return REG_ESPACE;
1584 }
485d775d
UD
1585 org_node = org_dest;
1586 clone_node = clone_dest;
1587 }
1588 return REG_NOERROR;
1589}
1590
a7d5c291
UD
1591/* Search for a node which is duplicated from the node ORG_NODE, and
1592 satisfies the constraint CONSTRAINT. */
1593
eb04c213
AZ
1594static Idx
1595search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
0fd8ae9c 1596 unsigned int constraint)
a7d5c291 1597{
eb04c213 1598 Idx idx;
a7d5c291
UD
1599 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1600 {
1601 if (org_node == dfa->org_indices[idx]
1602 && constraint == dfa->nodes[idx].constraint)
1603 return idx; /* Found. */
1604 }
1605 return -1; /* Not found. */
1606}
1607
a9388965 1608/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
2d87db5b
UD
1609 Return the index of the new node, or -1 if insufficient storage is
1610 available. */
a9388965 1611
eb04c213
AZ
1612static Idx
1613duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
3b0bdc72 1614{
eb04c213 1615 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
f4efbdfb 1616 if (__glibc_likely (dup_idx != -1))
2d87db5b
UD
1617 {
1618 dfa->nodes[dup_idx].constraint = constraint;
0caca71a 1619 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
2d87db5b
UD
1620 dfa->nodes[dup_idx].duplicated = 1;
1621
1622 /* Store the index of the original node. */
1623 dfa->org_indices[dup_idx] = org_idx;
1624 }
1625 return dup_idx;
3b0bdc72
UD
1626}
1627
963d8d78 1628static reg_errcode_t
0fd8ae9c 1629calc_inveclosure (re_dfa_t *dfa)
3b0bdc72 1630{
eb04c213
AZ
1631 Idx src, idx;
1632 bool ok;
963d8d78
UD
1633 for (idx = 0; idx < dfa->nodes_len; ++idx)
1634 re_node_set_init_empty (dfa->inveclosures + idx);
1635
3b0bdc72
UD
1636 for (src = 0; src < dfa->nodes_len; ++src)
1637 {
eb04c213 1638 Idx *elems = dfa->eclosures[src].elems;
3b0bdc72 1639 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175 1640 {
eb04c213 1641 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
f4efbdfb 1642 if (__glibc_unlikely (! ok))
963d8d78 1643 return REG_ESPACE;
15a7d175 1644 }
3b0bdc72 1645 }
963d8d78
UD
1646
1647 return REG_NOERROR;
3b0bdc72
UD
1648}
1649
1650/* Calculate "eclosure" for all the node in DFA. */
1651
1652static reg_errcode_t
0fd8ae9c 1653calc_eclosure (re_dfa_t *dfa)
3b0bdc72 1654{
eb04c213
AZ
1655 Idx node_idx;
1656 bool incomplete;
2a0356e1 1657 DEBUG_ASSERT (dfa->nodes_len > 0);
eb04c213 1658 incomplete = false;
3b0bdc72 1659 /* For each nodes, calculate epsilon closure. */
485d775d 1660 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1661 {
a9388965 1662 reg_errcode_t err;
3b0bdc72 1663 re_node_set eclosure_elem;
485d775d 1664 if (node_idx == dfa->nodes_len)
15a7d175
UD
1665 {
1666 if (!incomplete)
1667 break;
eb04c213 1668 incomplete = false;
15a7d175
UD
1669 node_idx = 0;
1670 }
3b0bdc72 1671
2a0356e1 1672 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
c06a6956 1673
3b0bdc72
UD
1674 /* If we have already calculated, skip it. */
1675 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1676 continue;
d3821ab0 1677 /* Calculate epsilon closure of 'node_idx'. */
eb04c213 1678 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
f4efbdfb 1679 if (__glibc_unlikely (err != REG_NOERROR))
15a7d175 1680 return err;
3b0bdc72
UD
1681
1682 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175 1683 {
eb04c213 1684 incomplete = true;
15a7d175
UD
1685 re_node_set_free (&eclosure_elem);
1686 }
3b0bdc72 1687 }
3b0bdc72
UD
1688 return REG_NOERROR;
1689}
1690
1691/* Calculate epsilon closure of NODE. */
1692
a9388965 1693static reg_errcode_t
eb04c213 1694calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
3b0bdc72 1695{
a9388965 1696 reg_errcode_t err;
eb04c213 1697 Idx i;
3b0bdc72 1698 re_node_set eclosure;
eb04c213 1699 bool incomplete = false;
a9388965 1700 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
f4efbdfb 1701 if (__glibc_unlikely (err != REG_NOERROR))
a9388965 1702 return err;
3b0bdc72 1703
0b5ca7c3
PE
1704 /* An epsilon closure includes itself. */
1705 eclosure.elems[eclosure.nelem++] = node;
1706
3b0bdc72
UD
1707 /* This indicates that we are calculating this node now.
1708 We reference this value to avoid infinite loop. */
1709 dfa->eclosures[node].nelem = -1;
1710
0caca71a
UD
1711 /* If the current node has constraints, duplicate all nodes
1712 since they must inherit the constraints. */
1713 if (dfa->nodes[node].constraint
b4ae56bd
UD
1714 && dfa->edests[node].nelem
1715 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
485d775d 1716 {
0caca71a
UD
1717 err = duplicate_node_closure (dfa, node, node, node,
1718 dfa->nodes[node].constraint);
f4efbdfb 1719 if (__glibc_unlikely (err != REG_NOERROR))
15a7d175 1720 return err;
485d775d 1721 }
3b0bdc72
UD
1722
1723 /* Expand each epsilon destination nodes. */
485d775d 1724 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1725 for (i = 0; i < dfa->edests[node].nelem; ++i)
1726 {
15a7d175 1727 re_node_set eclosure_elem;
eb04c213
AZ
1728 Idx edest = dfa->edests[node].elems[i];
1729 /* If calculating the epsilon closure of 'edest' is in progress,
15a7d175
UD
1730 return intermediate result. */
1731 if (dfa->eclosures[edest].nelem == -1)
1732 {
eb04c213 1733 incomplete = true;
15a7d175
UD
1734 continue;
1735 }
eb04c213 1736 /* If we haven't calculated the epsilon closure of 'edest' yet,
15a7d175
UD
1737 calculate now. Otherwise use calculated epsilon closure. */
1738 if (dfa->eclosures[edest].nelem == 0)
1739 {
eb04c213 1740 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
f4efbdfb 1741 if (__glibc_unlikely (err != REG_NOERROR))
15a7d175
UD
1742 return err;
1743 }
1744 else
1745 eclosure_elem = dfa->eclosures[edest];
d3821ab0 1746 /* Merge the epsilon closure of 'edest'. */
2da42bc0 1747 err = re_node_set_merge (&eclosure, &eclosure_elem);
f4efbdfb 1748 if (__glibc_unlikely (err != REG_NOERROR))
2da42bc0 1749 return err;
d3821ab0 1750 /* If the epsilon closure of 'edest' is incomplete,
15a7d175
UD
1751 the epsilon closure of this node is also incomplete. */
1752 if (dfa->eclosures[edest].nelem == 0)
1753 {
eb04c213 1754 incomplete = true;
15a7d175
UD
1755 re_node_set_free (&eclosure_elem);
1756 }
3b0bdc72
UD
1757 }
1758
3b0bdc72
UD
1759 if (incomplete && !root)
1760 dfa->eclosures[node].nelem = 0;
1761 else
1762 dfa->eclosures[node] = eclosure;
a9388965
UD
1763 *new_set = eclosure;
1764 return REG_NOERROR;
3b0bdc72
UD
1765}
1766\f
1767/* Functions for token which are used in the parser. */
1768
1769/* Fetch a token from INPUT.
1770 We must not use this function inside bracket expressions. */
1771
f0d77aa8 1772static void
0fd8ae9c 1773fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
3b0bdc72 1774{
f0d77aa8 1775 re_string_skip_bytes (input, peek_token (result, input, syntax));
3b0bdc72
UD
1776}
1777
1778/* Peek a token from INPUT, and return the length of the token.
1779 We must not use this function inside bracket expressions. */
1780
1781static int
0fd8ae9c 1782peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
1783{
1784 unsigned char c;
1785
1786 if (re_string_eoi (input))
1787 {
1788 token->type = END_OF_RE;
1789 return 0;
1790 }
1791
1792 c = re_string_peek_byte (input, 0);
1793 token->opr.c = c;
1794
65e6becf 1795 token->word_char = 0;
3b0bdc72
UD
1796#ifdef RE_ENABLE_I18N
1797 token->mb_partial = 0;
34a5a146
JM
1798 if (input->mb_cur_max > 1
1799 && !re_string_first_byte (input, re_string_cur_idx (input)))
3b0bdc72
UD
1800 {
1801 token->type = CHARACTER;
1802 token->mb_partial = 1;
1803 return 1;
1804 }
1805#endif
1806 if (c == '\\')
1807 {
1808 unsigned char c2;
1809 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1810 {
1811 token->type = BACK_SLASH;
1812 return 1;
1813 }
3b0bdc72
UD
1814
1815 c2 = re_string_peek_byte_case (input, 1);
1816 token->opr.c = c2;
1817 token->type = CHARACTER;
65e6becf
UD
1818#ifdef RE_ENABLE_I18N
1819 if (input->mb_cur_max > 1)
1820 {
1821 wint_t wc = re_string_wchar_at (input,
1822 re_string_cur_idx (input) + 1);
1823 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1824 }
1825 else
1826#endif
1827 token->word_char = IS_WORD_CHAR (c2) != 0;
1828
3b0bdc72 1829 switch (c2)
15a7d175
UD
1830 {
1831 case '|':
1832 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1833 token->type = OP_ALT;
1834 break;
1835 case '1': case '2': case '3': case '4': case '5':
1836 case '6': case '7': case '8': case '9':
1837 if (!(syntax & RE_NO_BK_REFS))
1838 {
1839 token->type = OP_BACK_REF;
ae73c6c1 1840 token->opr.idx = c2 - '1';
15a7d175
UD
1841 }
1842 break;
1843 case '<':
1844 if (!(syntax & RE_NO_GNU_OPS))
1845 {
1846 token->type = ANCHOR;
bb3f4825 1847 token->opr.ctx_type = WORD_FIRST;
15a7d175
UD
1848 }
1849 break;
1850 case '>':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 {
1853 token->type = ANCHOR;
bb3f4825 1854 token->opr.ctx_type = WORD_LAST;
15a7d175
UD
1855 }
1856 break;
1857 case 'b':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 {
1860 token->type = ANCHOR;
bb3f4825 1861 token->opr.ctx_type = WORD_DELIM;
15a7d175
UD
1862 }
1863 break;
1864 case 'B':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 {
1867 token->type = ANCHOR;
24992143 1868 token->opr.ctx_type = NOT_WORD_DELIM;
15a7d175
UD
1869 }
1870 break;
1871 case 'w':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_WORD;
1874 break;
1875 case 'W':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTWORD;
1878 break;
e2b6bfa3
UD
1879 case 's':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = OP_SPACE;
1882 break;
1883 case 'S':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 token->type = OP_NOTSPACE;
1886 break;
15a7d175
UD
1887 case '`':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 {
1890 token->type = ANCHOR;
bb3f4825 1891 token->opr.ctx_type = BUF_FIRST;
15a7d175
UD
1892 }
1893 break;
1894 case '\'':
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 {
1897 token->type = ANCHOR;
bb3f4825 1898 token->opr.ctx_type = BUF_LAST;
15a7d175
UD
1899 }
1900 break;
1901 case '(':
1902 if (!(syntax & RE_NO_BK_PARENS))
1903 token->type = OP_OPEN_SUBEXP;
1904 break;
1905 case ')':
1906 if (!(syntax & RE_NO_BK_PARENS))
1907 token->type = OP_CLOSE_SUBEXP;
1908 break;
1909 case '+':
1910 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911 token->type = OP_DUP_PLUS;
1912 break;
1913 case '?':
1914 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915 token->type = OP_DUP_QUESTION;
1916 break;
1917 case '{':
1918 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919 token->type = OP_OPEN_DUP_NUM;
1920 break;
1921 case '}':
1922 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923 token->type = OP_CLOSE_DUP_NUM;
1924 break;
1925 default:
1926 break;
1927 }
3b0bdc72
UD
1928 return 2;
1929 }
1930
1931 token->type = CHARACTER;
65e6becf
UD
1932#ifdef RE_ENABLE_I18N
1933 if (input->mb_cur_max > 1)
1934 {
1935 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1936 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1937 }
1938 else
1939#endif
1940 token->word_char = IS_WORD_CHAR (token->opr.c);
1941
3b0bdc72
UD
1942 switch (c)
1943 {
1944 case '\n':
1945 if (syntax & RE_NEWLINE_ALT)
15a7d175 1946 token->type = OP_ALT;
3b0bdc72
UD
1947 break;
1948 case '|':
1949 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1950 token->type = OP_ALT;
3b0bdc72
UD
1951 break;
1952 case '*':
1953 token->type = OP_DUP_ASTERISK;
1954 break;
1955 case '+':
1956 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1957 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1958 break;
1959 case '?':
1960 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1961 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1962 break;
1963 case '{':
1964 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1965 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1966 break;
1967 case '}':
1968 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1969 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1970 break;
1971 case '(':
1972 if (syntax & RE_NO_BK_PARENS)
15a7d175 1973 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1974 break;
1975 case ')':
1976 if (syntax & RE_NO_BK_PARENS)
15a7d175 1977 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1978 break;
1979 case '[':
1980 token->type = OP_OPEN_BRACKET;
1981 break;
1982 case '.':
1983 token->type = OP_PERIOD;
1984 break;
1985 case '^':
34a5a146
JM
1986 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1987 && re_string_cur_idx (input) != 0)
15a7d175
UD
1988 {
1989 char prev = re_string_peek_byte (input, -1);
134abcb5 1990 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1991 break;
1992 }
3b0bdc72 1993 token->type = ANCHOR;
bb3f4825 1994 token->opr.ctx_type = LINE_FIRST;
3b0bdc72
UD
1995 break;
1996 case '$':
34a5a146
JM
1997 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1998 && re_string_cur_idx (input) + 1 != re_string_length (input))
15a7d175
UD
1999 {
2000 re_token_t next;
2001 re_string_skip_bytes (input, 1);
2002 peek_token (&next, input, syntax);
2003 re_string_skip_bytes (input, -1);
2004 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2005 break;
2006 }
3b0bdc72 2007 token->type = ANCHOR;
bb3f4825 2008 token->opr.ctx_type = LINE_LAST;
3b0bdc72
UD
2009 break;
2010 default:
2011 break;
2012 }
2013 return 1;
2014}
2015
2016/* Peek a token from INPUT, and return the length of the token.
2017 We must not use this function out of bracket expressions. */
2018
2019static int
0fd8ae9c 2020peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
2021{
2022 unsigned char c;
2023 if (re_string_eoi (input))
2024 {
2025 token->type = END_OF_RE;
2026 return 0;
2027 }
2028 c = re_string_peek_byte (input, 0);
2029 token->opr.c = c;
2030
2031#ifdef RE_ENABLE_I18N
34a5a146
JM
2032 if (input->mb_cur_max > 1
2033 && !re_string_first_byte (input, re_string_cur_idx (input)))
3b0bdc72
UD
2034 {
2035 token->type = CHARACTER;
2036 return 1;
2037 }
2038#endif /* RE_ENABLE_I18N */
2039
294b6bcc
UD
2040 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041 && re_string_cur_idx (input) + 1 < re_string_length (input))
3b0bdc72
UD
2042 {
2043 /* In this case, '\' escape a character. */
2044 unsigned char c2;
82bbb29e
UD
2045 re_string_skip_bytes (input, 1);
2046 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
2047 token->opr.c = c2;
2048 token->type = CHARACTER;
2049 return 1;
2050 }
2051 if (c == '[') /* '[' is a special char in a bracket exps. */
2052 {
2053 unsigned char c2;
2054 int token_len;
294b6bcc
UD
2055 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2056 c2 = re_string_peek_byte (input, 1);
2057 else
2058 c2 = 0;
3b0bdc72
UD
2059 token->opr.c = c2;
2060 token_len = 2;
2061 switch (c2)
15a7d175
UD
2062 {
2063 case '.':
2064 token->type = OP_OPEN_COLL_ELEM;
2065 break;
eb04c213 2066
15a7d175
UD
2067 case '=':
2068 token->type = OP_OPEN_EQUIV_CLASS;
2069 break;
eb04c213 2070
15a7d175
UD
2071 case ':':
2072 if (syntax & RE_CHAR_CLASSES)
2073 {
2074 token->type = OP_OPEN_CHAR_CLASS;
2075 break;
2076 }
eb04c213 2077 FALLTHROUGH;
15a7d175
UD
2078 default:
2079 token->type = CHARACTER;
2080 token->opr.c = c;
2081 token_len = 1;
2082 break;
2083 }
3b0bdc72
UD
2084 return token_len;
2085 }
2086 switch (c)
2087 {
2088 case '-':
2089 token->type = OP_CHARSET_RANGE;
2090 break;
2091 case ']':
2092 token->type = OP_CLOSE_BRACKET;
2093 break;
2094 case '^':
2095 token->type = OP_NON_MATCH_LIST;
2096 break;
2097 default:
2098 token->type = CHARACTER;
2099 }
2100 return 1;
2101}
2102\f
2103/* Functions for parser. */
2104
2105/* Entry point of the parser.
2106 Parse the regular expression REGEXP and return the structure tree.
d3821ab0 2107 If an error occurs, ERR is set by error code, and return NULL.
3b0bdc72 2108 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
2109 CAT
2110 / \
2111 / \
3b0bdc72
UD
2112 <reg_exp> EOR
2113
2114 CAT means concatenation.
2115 EOR means end of regular expression. */
2116
2117static bin_tree_t *
0fd8ae9c
UD
2118parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2119 reg_errcode_t *err)
3b0bdc72 2120{
eb04c213 2121 re_dfa_t *dfa = preg->buffer;
3b0bdc72
UD
2122 bin_tree_t *tree, *eor, *root;
2123 re_token_t current_token;
56b168be 2124 dfa->syntax = syntax;
f0d77aa8 2125 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2126 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
f4efbdfb 2127 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
3b0bdc72 2128 return NULL;
02f3550c 2129 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
3b0bdc72 2130 if (tree != NULL)
02f3550c 2131 root = create_tree (dfa, tree, eor, CONCAT);
3b0bdc72
UD
2132 else
2133 root = eor;
f4efbdfb 2134 if (__glibc_unlikely (eor == NULL || root == NULL))
485d775d
UD
2135 {
2136 *err = REG_ESPACE;
2137 return NULL;
2138 }
3b0bdc72
UD
2139 return root;
2140}
2141
2142/* This function build the following tree, from regular expression
2143 <branch1>|<branch2>:
15a7d175
UD
2144 ALT
2145 / \
2146 / \
3b0bdc72
UD
2147 <branch1> <branch2>
2148
d3821ab0 2149 ALT means alternative, which represents the operator '|'. */
3b0bdc72
UD
2150
2151static bin_tree_t *
0fd8ae9c 2152parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2153 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2154{
eb04c213 2155 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2156 bin_tree_t *tree, *branch = NULL;
eb04c213 2157 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
3b0bdc72 2158 tree = parse_branch (regexp, preg, token, syntax, nest, err);
f4efbdfb 2159 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
3b0bdc72
UD
2160 return NULL;
2161
2162 while (token->type == OP_ALT)
2163 {
f0d77aa8 2164 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2165 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
2166 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2167 {
eb04c213
AZ
2168 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2169 dfa->completed_bkref_map = initial_bkref_map;
15a7d175 2170 branch = parse_branch (regexp, preg, token, syntax, nest, err);
f4efbdfb 2171 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
aa6ec754
AS
2172 {
2173 if (tree != NULL)
2174 postorder (tree, free_tree, NULL);
2175 return NULL;
2176 }
eb04c213 2177 dfa->completed_bkref_map |= accumulated_bkref_map;
15a7d175 2178 }
9b88fc16
UD
2179 else
2180 branch = NULL;
02f3550c 2181 tree = create_tree (dfa, tree, branch, OP_ALT);
f4efbdfb 2182 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2183 {
2184 *err = REG_ESPACE;
2185 return NULL;
2186 }
3b0bdc72
UD
2187 }
2188 return tree;
2189}
2190
2191/* This function build the following tree, from regular expression
2192 <exp1><exp2>:
15a7d175
UD
2193 CAT
2194 / \
3b0bdc72
UD
2195 / \
2196 <exp1> <exp2>
2197
2198 CAT means concatenation. */
2199
2200static bin_tree_t *
0fd8ae9c 2201parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2202 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2203{
eb04c213
AZ
2204 bin_tree_t *tree, *expr;
2205 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2206 tree = parse_expression (regexp, preg, token, syntax, nest, err);
f4efbdfb 2207 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
3b0bdc72
UD
2208 return NULL;
2209
2210 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 2211 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72 2212 {
eb04c213 2213 expr = parse_expression (regexp, preg, token, syntax, nest, err);
f4efbdfb 2214 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
15a7d175 2215 {
b833d51f
UD
2216 if (tree != NULL)
2217 postorder (tree, free_tree, NULL);
15a7d175
UD
2218 return NULL;
2219 }
eb04c213 2220 if (tree != NULL && expr != NULL)
15a7d175 2221 {
eb04c213 2222 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
e9b9cbf5 2223 if (newtree == NULL)
15a7d175 2224 {
eb04c213 2225 postorder (expr, free_tree, NULL);
e9b9cbf5 2226 postorder (tree, free_tree, NULL);
15a7d175
UD
2227 *err = REG_ESPACE;
2228 return NULL;
2229 }
e9b9cbf5 2230 tree = newtree;
15a7d175 2231 }
3b0bdc72 2232 else if (tree == NULL)
eb04c213
AZ
2233 tree = expr;
2234 /* Otherwise expr == NULL, we don't need to create new tree. */
3b0bdc72
UD
2235 }
2236 return tree;
2237}
2238
2239/* This function build the following tree, from regular expression a*:
15a7d175
UD
2240 *
2241 |
2242 a
3b0bdc72
UD
2243*/
2244
2245static bin_tree_t *
0fd8ae9c 2246parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2247 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2248{
eb04c213 2249 re_dfa_t *dfa = preg->buffer;
3b0bdc72 2250 bin_tree_t *tree;
3b0bdc72
UD
2251 switch (token->type)
2252 {
2253 case CHARACTER:
02f3550c 2254 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2255 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2256 {
2257 *err = REG_ESPACE;
2258 return NULL;
2259 }
3b0bdc72 2260#ifdef RE_ENABLE_I18N
3c0fb574 2261 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2262 {
2263 while (!re_string_eoi (regexp)
2264 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2265 {
15a7d175 2266 bin_tree_t *mbc_remain;
f0d77aa8 2267 fetch_token (token, regexp, syntax);
02f3550c
UD
2268 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2269 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
f4efbdfb 2270 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
54e1cabc
UD
2271 {
2272 *err = REG_ESPACE;
2273 return NULL;
2274 }
15a7d175 2275 }
3b0bdc72
UD
2276 }
2277#endif
2278 break;
eb04c213 2279
3b0bdc72
UD
2280 case OP_OPEN_SUBEXP:
2281 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
f4efbdfb 2282 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
15a7d175 2283 return NULL;
3b0bdc72 2284 break;
eb04c213 2285
3b0bdc72
UD
2286 case OP_OPEN_BRACKET:
2287 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
f4efbdfb 2288 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
15a7d175 2289 return NULL;
3b0bdc72 2290 break;
eb04c213 2291
3b0bdc72 2292 case OP_BACK_REF:
f4efbdfb 2293 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
15a7d175
UD
2294 {
2295 *err = REG_ESUBREG;
2296 return NULL;
2297 }
ae73c6c1 2298 dfa->used_bkref_map |= 1 << token->opr.idx;
02f3550c 2299 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2300 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2301 {
2302 *err = REG_ESPACE;
2303 return NULL;
2304 }
3b0bdc72
UD
2305 ++dfa->nbackref;
2306 dfa->has_mb_node = 1;
2307 break;
eb04c213 2308
06e8303a
UD
2309 case OP_OPEN_DUP_NUM:
2310 if (syntax & RE_CONTEXT_INVALID_DUP)
2311 {
2312 *err = REG_BADRPT;
2313 return NULL;
2314 }
eb04c213 2315 FALLTHROUGH;
3b0bdc72
UD
2316 case OP_DUP_ASTERISK:
2317 case OP_DUP_PLUS:
2318 case OP_DUP_QUESTION:
3b0bdc72 2319 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2320 {
2321 *err = REG_BADRPT;
2322 return NULL;
2323 }
3b0bdc72 2324 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175 2325 {
f0d77aa8 2326 fetch_token (token, regexp, syntax);
15a7d175
UD
2327 return parse_expression (regexp, preg, token, syntax, nest, err);
2328 }
eb04c213 2329 FALLTHROUGH;
3b0bdc72 2330 case OP_CLOSE_SUBEXP:
34a5a146
JM
2331 if ((token->type == OP_CLOSE_SUBEXP)
2332 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
15a7d175
UD
2333 {
2334 *err = REG_ERPAREN;
2335 return NULL;
2336 }
eb04c213 2337 FALLTHROUGH;
3b0bdc72
UD
2338 case OP_CLOSE_DUP_NUM:
2339 /* We treat it as a normal character. */
2340
2341 /* Then we can these characters as normal characters. */
2342 token->type = CHARACTER;
65e6becf
UD
2343 /* mb_partial and word_char bits should be initialized already
2344 by peek_token. */
02f3550c 2345 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2346 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2347 {
2348 *err = REG_ESPACE;
2349 return NULL;
2350 }
3b0bdc72 2351 break;
eb04c213 2352
3b0bdc72 2353 case ANCHOR:
bb3f4825 2354 if ((token->opr.ctx_type
24992143 2355 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
56b168be
UD
2356 && dfa->word_ops_used == 0)
2357 init_word_char (dfa);
24992143 2358 if (token->opr.ctx_type == WORD_DELIM
21f5de55 2359 || token->opr.ctx_type == NOT_WORD_DELIM)
15a7d175
UD
2360 {
2361 bin_tree_t *tree_first, *tree_last;
24992143
UD
2362 if (token->opr.ctx_type == WORD_DELIM)
2363 {
2364 token->opr.ctx_type = WORD_FIRST;
02f3550c 2365 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2366 token->opr.ctx_type = WORD_LAST;
21f5de55
PE
2367 }
2368 else
2369 {
24992143 2370 token->opr.ctx_type = INSIDE_WORD;
02f3550c 2371 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2372 token->opr.ctx_type = INSIDE_NOTWORD;
21f5de55 2373 }
02f3550c
UD
2374 tree_last = create_token_tree (dfa, NULL, NULL, token);
2375 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
f4efbdfb
PE
2376 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2377 || tree == NULL))
15a7d175
UD
2378 {
2379 *err = REG_ESPACE;
2380 return NULL;
2381 }
2382 }
3b0bdc72 2383 else
15a7d175 2384 {
02f3550c 2385 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2386 if (__glibc_unlikely (tree == NULL))
54e1cabc
UD
2387 {
2388 *err = REG_ESPACE;
2389 return NULL;
2390 }
15a7d175 2391 }
3b0bdc72 2392 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2393 by repetition operators.
2394 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2395 it must not be "<ANCHOR(^)><REPEAT(*)>". */
f0d77aa8 2396 fetch_token (token, regexp, syntax);
3b0bdc72 2397 return tree;
eb04c213 2398
3b0bdc72 2399 case OP_PERIOD:
02f3550c 2400 tree = create_token_tree (dfa, NULL, NULL, token);
f4efbdfb 2401 if (__glibc_unlikely (tree == NULL))
15a7d175
UD
2402 {
2403 *err = REG_ESPACE;
2404 return NULL;
2405 }
3c0fb574 2406 if (dfa->mb_cur_max > 1)
15a7d175 2407 dfa->has_mb_node = 1;
3b0bdc72 2408 break;
eb04c213 2409
3b0bdc72 2410 case OP_WORD:
3b0bdc72 2411 case OP_NOTWORD:
266c1f50 2412 tree = build_charclass_op (dfa, regexp->trans,
eb04c213
AZ
2413 "alnum",
2414 "_",
266c1f50 2415 token->type == OP_NOTWORD, err);
f4efbdfb 2416 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
e2b6bfa3
UD
2417 return NULL;
2418 break;
eb04c213 2419
e2b6bfa3 2420 case OP_SPACE:
e2b6bfa3 2421 case OP_NOTSPACE:
266c1f50 2422 tree = build_charclass_op (dfa, regexp->trans,
eb04c213
AZ
2423 "space",
2424 "",
266c1f50 2425 token->type == OP_NOTSPACE, err);
f4efbdfb 2426 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
15a7d175 2427 return NULL;
3b0bdc72 2428 break;
eb04c213 2429
3b0bdc72
UD
2430 case OP_ALT:
2431 case END_OF_RE:
2432 return NULL;
eb04c213 2433
3b0bdc72
UD
2434 case BACK_SLASH:
2435 *err = REG_EESCAPE;
2436 return NULL;
eb04c213 2437
3b0bdc72
UD
2438 default:
2439 /* Must not happen? */
2a0356e1 2440 DEBUG_ASSERT (false);
3b0bdc72
UD
2441 return NULL;
2442 }
f0d77aa8 2443 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2444
2445 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2446 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72 2447 {
eb04c213
AZ
2448 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2449 syntax, err);
f4efbdfb 2450 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
4d43ef1e
AS
2451 {
2452 if (tree != NULL)
2453 postorder (tree, free_tree, NULL);
2454 return NULL;
2455 }
2456 tree = dup_tree;
c34bfc8d
UD
2457 /* In BRE consecutive duplications are not allowed. */
2458 if ((syntax & RE_CONTEXT_INVALID_DUP)
2459 && (token->type == OP_DUP_ASTERISK
2460 || token->type == OP_OPEN_DUP_NUM))
2461 {
4d43ef1e
AS
2462 if (tree != NULL)
2463 postorder (tree, free_tree, NULL);
c34bfc8d
UD
2464 *err = REG_BADRPT;
2465 return NULL;
2466 }
3b0bdc72
UD
2467 }
2468
2469 return tree;
2470}
2471
2472/* This function build the following tree, from regular expression
2473 (<reg_exp>):
15a7d175
UD
2474 SUBEXP
2475 |
2476 <reg_exp>
3b0bdc72
UD
2477*/
2478
2479static bin_tree_t *
0fd8ae9c 2480parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
eb04c213 2481 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
3b0bdc72 2482{
eb04c213 2483 re_dfa_t *dfa = preg->buffer;
02f3550c 2484 bin_tree_t *tree;
3b0bdc72
UD
2485 size_t cur_nsub;
2486 cur_nsub = preg->re_nsub++;
81c64d40 2487
f0d77aa8 2488 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2489
2490 /* The subexpression may be a null string. */
2491 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2492 tree = NULL;
3b0bdc72
UD
2493 else
2494 {
2495 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
f4efbdfb
PE
2496 if (__glibc_unlikely (*err == REG_NOERROR
2497 && token->type != OP_CLOSE_SUBEXP))
a129c80d
UD
2498 {
2499 if (tree != NULL)
e9b9cbf5 2500 postorder (tree, free_tree, NULL);
a129c80d
UD
2501 *err = REG_EPAREN;
2502 }
f4efbdfb 2503 if (__glibc_unlikely (*err != REG_NOERROR))
15a7d175 2504 return NULL;
3b0bdc72 2505 }
01ed6ceb
UD
2506
2507 if (cur_nsub <= '9' - '1')
2508 dfa->completed_bkref_map |= 1 << cur_nsub;
02f3550c
UD
2509
2510 tree = create_tree (dfa, tree, NULL, SUBEXP);
f4efbdfb 2511 if (__glibc_unlikely (tree == NULL))
485d775d
UD
2512 {
2513 *err = REG_ESPACE;
2514 return NULL;
2515 }
02f3550c 2516 tree->token.opr.idx = cur_nsub;
3b0bdc72
UD
2517 return tree;
2518}
2519
2520/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2521
2522static bin_tree_t *
0fd8ae9c
UD
2523parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2524 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72 2525{
20dc2f79 2526 bin_tree_t *tree = NULL, *old_tree = NULL;
eb04c213 2527 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
3b0bdc72 2528 re_token_t start_token = *token;
6b6557e8 2529
3b0bdc72
UD
2530 if (token->type == OP_OPEN_DUP_NUM)
2531 {
6b6557e8
UD
2532 end = 0;
2533 start = fetch_number (regexp, token, syntax);
3b0bdc72 2534 if (start == -1)
15a7d175
UD
2535 {
2536 if (token->type == CHARACTER && token->opr.c == ',')
2537 start = 0; /* We treat "{,m}" as "{0,m}". */
2538 else
2539 {
2540 *err = REG_BADBR; /* <re>{} is invalid. */
2541 return NULL;
2542 }
2543 }
f4efbdfb 2544 if (__glibc_likely (start != -2))
15a7d175
UD
2545 {
2546 /* We treat "{n}" as "{n,n}". */
2547 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2548 : ((token->type == CHARACTER && token->opr.c == ',')
2549 ? fetch_number (regexp, token, syntax) : -2));
2550 }
f4efbdfb 2551 if (__glibc_unlikely (start == -2 || end == -2))
15a7d175
UD
2552 {
2553 /* Invalid sequence. */
f4efbdfb 2554 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
6b6557e8
UD
2555 {
2556 if (token->type == END_OF_RE)
2557 *err = REG_EBRACE;
2558 else
2559 *err = REG_BADBR;
2560
2561 return NULL;
2562 }
2563
2564 /* If the syntax bit is set, rollback. */
2565 re_string_set_index (regexp, start_idx);
2566 *token = start_token;
2567 token->type = CHARACTER;
2568 /* mb_partial and word_char bits should be already initialized by
2569 peek_token. */
2570 return elem;
15a7d175 2571 }
6b6557e8 2572
f4efbdfb
PE
2573 if (__glibc_unlikely ((end != -1 && start > end)
2574 || token->type != OP_CLOSE_DUP_NUM))
15a7d175 2575 {
6b6557e8
UD
2576 /* First number greater than second. */
2577 *err = REG_BADBR;
15a7d175
UD
2578 return NULL;
2579 }
eb04c213 2580
f4efbdfb 2581 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
eb04c213
AZ
2582 {
2583 *err = REG_ESIZE;
2584 return NULL;
2585 }
6b6557e8
UD
2586 }
2587 else
2588 {
2589 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2590 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2591 }
2592
20dc2f79
UD
2593 fetch_token (token, regexp, syntax);
2594
f4efbdfb 2595 if (__glibc_unlikely (elem == NULL))
20dc2f79 2596 return NULL;
f4efbdfb 2597 if (__glibc_unlikely (start == 0 && end == 0))
02f3550c
UD
2598 {
2599 postorder (elem, free_tree, NULL);
2600 return NULL;
2601 }
602c2f9d 2602
6b6557e8 2603 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
f4efbdfb 2604 if (__glibc_unlikely (start > 0))
6b6557e8
UD
2605 {
2606 tree = elem;
2607 for (i = 2; i <= start; ++i)
7de66108 2608 {
6b6557e8 2609 elem = duplicate_tree (elem, dfa);
02f3550c 2610 tree = create_tree (dfa, tree, elem, CONCAT);
f4efbdfb 2611 if (__glibc_unlikely (elem == NULL || tree == NULL))
7de66108
UD
2612 goto parse_dup_op_espace;
2613 }
6b6557e8 2614
20dc2f79
UD
2615 if (start == end)
2616 return tree;
a96c63ed 2617
20dc2f79
UD
2618 /* Duplicate ELEM before it is marked optional. */
2619 elem = duplicate_tree (elem, dfa);
f4efbdfb 2620 if (__glibc_unlikely (elem == NULL))
7ee03f00 2621 goto parse_dup_op_espace;
20dc2f79
UD
2622 old_tree = tree;
2623 }
2624 else
2625 old_tree = NULL;
2626
02f3550c 2627 if (elem->token.type == SUBEXP)
eb04c213
AZ
2628 {
2629 uintptr_t subidx = elem->token.opr.idx;
2630 postorder (elem, mark_opt_subexp, (void *) subidx);
2631 }
02f3550c 2632
eb04c213
AZ
2633 tree = create_tree (dfa, elem, NULL,
2634 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
f4efbdfb 2635 if (__glibc_unlikely (tree == NULL))
20dc2f79
UD
2636 goto parse_dup_op_espace;
2637
2638 /* This loop is actually executed only when end != -1,
2639 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2640 already created the start+1-th copy. */
eb04c213
AZ
2641 if (TYPE_SIGNED (Idx) || end != -1)
2642 for (i = start + 2; i <= end; ++i)
2643 {
2644 elem = duplicate_tree (elem, dfa);
2645 tree = create_tree (dfa, tree, elem, CONCAT);
f4efbdfb 2646 if (__glibc_unlikely (elem == NULL || tree == NULL))
eb04c213
AZ
2647 goto parse_dup_op_espace;
2648
2649 tree = create_tree (dfa, tree, NULL, OP_ALT);
f4efbdfb 2650 if (__glibc_unlikely (tree == NULL))
eb04c213
AZ
2651 goto parse_dup_op_espace;
2652 }
6b6557e8 2653
20dc2f79 2654 if (old_tree)
02f3550c 2655 tree = create_tree (dfa, old_tree, tree, CONCAT);
20dc2f79 2656
3b0bdc72
UD
2657 return tree;
2658
2659 parse_dup_op_espace:
3b0bdc72
UD
2660 *err = REG_ESPACE;
2661 return NULL;
3b0bdc72
UD
2662}
2663
2664/* Size of the names for collating symbol/equivalence_class/character_class.
2665 I'm not sure, but maybe enough. */
2666#define BRACKET_NAME_BUF_SIZE 32
2667
434d3784 2668#ifndef _LIBC
eb04c213
AZ
2669
2670# ifdef RE_ENABLE_I18N
2671/* Convert the byte B to the corresponding wide character. In a
c77bf91b
PE
2672 unibyte locale, treat B as itself. In a multibyte locale, return
2673 WEOF if B is an encoding error. */
eb04c213
AZ
2674static wint_t
2675parse_byte (unsigned char b, re_charset_t *mbcset)
2676{
c77bf91b 2677 return mbcset == NULL ? b : __btowc (b);
eb04c213 2678}
c77bf91b 2679# endif
eb04c213 2680
434d3784
UD
2681 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2682 Build the range expression which starts from START_ELEM, and ends
2683 at END_ELEM. The result are written to MBCSET and SBCSET.
2684 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
d3821ab0 2685 mbcset->range_ends, is a pointer argument since we may
434d3784
UD
2686 update it. */
2687
2688static reg_errcode_t
c0a0f9a3 2689# ifdef RE_ENABLE_I18N
eb04c213
AZ
2690build_range_exp (const reg_syntax_t syntax,
2691 bitset_t sbcset,
2692 re_charset_t *mbcset,
2693 Idx *range_alloc,
2694 const bracket_elem_t *start_elem,
2695 const bracket_elem_t *end_elem)
c0a0f9a3 2696# else /* not RE_ENABLE_I18N */
eb04c213
AZ
2697build_range_exp (const reg_syntax_t syntax,
2698 bitset_t sbcset,
2699 const bracket_elem_t *start_elem,
2700 const bracket_elem_t *end_elem)
c0a0f9a3 2701# endif /* not RE_ENABLE_I18N */
434d3784
UD
2702{
2703 unsigned int start_ch, end_ch;
2704 /* Equivalence Classes and Character Classes can't be a range start/end. */
f4efbdfb
PE
2705 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2706 || start_elem->type == CHAR_CLASS
2707 || end_elem->type == EQUIV_CLASS
2708 || end_elem->type == CHAR_CLASS))
434d3784
UD
2709 return REG_ERANGE;
2710
2711 /* We can handle no multi character collating elements without libc
2712 support. */
f4efbdfb
PE
2713 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2714 && strlen ((char *) start_elem->opr.name) > 1)
2715 || (end_elem->type == COLL_SYM
2716 && strlen ((char *) end_elem->opr.name) > 1)))
434d3784
UD
2717 return REG_ECOLLATE;
2718
2719# ifdef RE_ENABLE_I18N
2720 {
2d87db5b
UD
2721 wchar_t wc;
2722 wint_t start_wc;
2723 wint_t end_wc;
434d3784
UD
2724
2725 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2726 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2727 : 0));
434d3784 2728 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2729 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2730 : 0));
434d3784 2731 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
eb04c213 2732 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
434d3784 2733 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
eb04c213 2734 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
4bb333cd
UD
2735 if (start_wc == WEOF || end_wc == WEOF)
2736 return REG_ECOLLATE;
f4efbdfb
PE
2737 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2738 && start_wc > end_wc))
434d3784
UD
2739 return REG_ERANGE;
2740
10677727
UD
2741 /* Got valid collation sequence values, add them as a new entry.
2742 However, for !_LIBC we have no collation elements: if the
2743 character set is single byte, the single byte character set
2744 that we build below suffices. parse_bracket_exp passes
2745 no MBCSET if dfa->mb_cur_max == 1. */
2746 if (mbcset)
434d3784 2747 {
21f5de55 2748 /* Check the space of the arrays. */
f4efbdfb 2749 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
21f5de55 2750 {
10677727
UD
2751 /* There is not enough space, need realloc. */
2752 wchar_t *new_array_start, *new_array_end;
eb04c213 2753 Idx new_nranges;
10677727
UD
2754
2755 /* +1 in case of mbcset->nranges is 0. */
2756 new_nranges = 2 * mbcset->nranges + 1;
2757 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2758 are NULL if *range_alloc == 0. */
2759 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
21f5de55 2760 new_nranges);
10677727 2761 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
21f5de55 2762 new_nranges);
10677727 2763
f4efbdfb
PE
2764 if (__glibc_unlikely (new_array_start == NULL
2765 || new_array_end == NULL))
eb04c213
AZ
2766 {
2767 re_free (new_array_start);
2768 re_free (new_array_end);
2769 return REG_ESPACE;
2770 }
10677727
UD
2771
2772 mbcset->range_starts = new_array_start;
2773 mbcset->range_ends = new_array_end;
2774 *range_alloc = new_nranges;
21f5de55 2775 }
10677727 2776
21f5de55
PE
2777 mbcset->range_starts[mbcset->nranges] = start_wc;
2778 mbcset->range_ends[mbcset->nranges++] = end_wc;
434d3784
UD
2779 }
2780
434d3784 2781 /* Build the table for single byte characters. */
1e7947dc 2782 for (wc = 0; wc < SBC_MAX; ++wc)
434d3784 2783 {
eb04c213 2784 if (start_wc <= wc && wc <= end_wc)
15a7d175 2785 bitset_set (sbcset, wc);
434d3784
UD
2786 }
2787 }
2788# else /* not RE_ENABLE_I18N */
2789 {
2790 unsigned int ch;
2791 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2792 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2793 : 0));
434d3784 2794 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2795 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2796 : 0));
434d3784
UD
2797 if (start_ch > end_ch)
2798 return REG_ERANGE;
2799 /* Build the table for single byte characters. */
1e7947dc 2800 for (ch = 0; ch < SBC_MAX; ++ch)
434d3784 2801 if (start_ch <= ch && ch <= end_ch)
15a7d175 2802 bitset_set (sbcset, ch);
434d3784
UD
2803 }
2804# endif /* not RE_ENABLE_I18N */
2805 return REG_NOERROR;
2806}
2807#endif /* not _LIBC */
2808
2809#ifndef _LIBC
2810/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2811 Build the collating element which is represented by NAME.
2812 The result are written to MBCSET and SBCSET.
2813 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2814 pointer argument since we may update it. */
2815
2816static reg_errcode_t
c0a0f9a3 2817# ifdef RE_ENABLE_I18N
0fd8ae9c 2818build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
eb04c213 2819 Idx *coll_sym_alloc, const unsigned char *name)
c0a0f9a3 2820# else /* not RE_ENABLE_I18N */
0fd8ae9c 2821build_collating_symbol (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 2822# endif /* not RE_ENABLE_I18N */
434d3784 2823{
62439eac 2824 size_t name_len = strlen ((const char *) name);
f4efbdfb 2825 if (__glibc_unlikely (name_len != 1))
434d3784
UD
2826 return REG_ECOLLATE;
2827 else
2828 {
62439eac 2829 bitset_set (sbcset, name[0]);
434d3784
UD
2830 return REG_NOERROR;
2831 }
2832}
2833#endif /* not _LIBC */
2834
3b0bdc72 2835#ifdef _LIBC
fdcd177f
FS
2836/* Local function for parse_bracket_exp used in _LIBC environment.
2837 Seek the collating symbol entry corresponding to NAME.
2838 Return the index of the symbol in the SYMB_TABLE,
2839 or -1 if not found. */
3b0bdc72 2840
fdcd177f
FS
2841static inline int32_t
2842__attribute__ ((always_inline))
2843seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2844 const int32_t *symb_table, int32_t table_size,
2845 const unsigned char *extra)
2846{
2847 int32_t elem;
a334319f 2848
fdcd177f
FS
2849 for (elem = 0; elem < table_size; elem++)
2850 if (symb_table[2 * elem] != 0)
2851 {
2852 int32_t idx = symb_table[2 * elem + 1];
2853 /* Skip the name of collating element name. */
2854 idx += 1 + extra[idx];
2855 if (/* Compare the length of the name. */
2856 name_len == extra[idx]
2857 /* Compare the name. */
2858 && memcmp (name, &extra[idx + 1], name_len) == 0)
2859 /* Yep, this is the entry. */
2860 return elem;
2861 }
2862 return -1;
2863}
3b0bdc72 2864
fdcd177f
FS
2865/* Local function for parse_bracket_exp used in _LIBC environment.
2866 Look up the collation sequence value of BR_ELEM.
2867 Return the value if succeeded, UINT_MAX otherwise. */
3b0bdc72 2868
fdcd177f
FS
2869static inline unsigned int
2870__attribute__ ((always_inline))
2871lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2872 const unsigned char *collseqmb,
2873 const char *collseqwc, int32_t table_size,
2874 const int32_t *symb_table,
2875 const unsigned char *extra)
2876{
2877 if (br_elem->type == SB_CHAR)
3b0bdc72 2878 {
fdcd177f
FS
2879 /* if (MB_CUR_MAX == 1) */
2880 if (nrules == 0)
2881 return collseqmb[br_elem->opr.ch];
2882 else
15a7d175 2883 {
fdcd177f
FS
2884 wint_t wc = __btowc (br_elem->opr.ch);
2885 return __collseq_table_lookup (collseqwc, wc);
15a7d175 2886 }
fdcd177f
FS
2887 }
2888 else if (br_elem->type == MB_CHAR)
2889 {
2890 if (nrules != 0)
2891 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2892 }
2893 else if (br_elem->type == COLL_SYM)
2894 {
2895 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2896 if (nrules != 0)
15a7d175 2897 {
fdcd177f
FS
2898 int32_t elem, idx;
2899 elem = seek_collating_symbol_entry (br_elem->opr.name,
2900 sym_name_len,
2901 symb_table, table_size,
2902 extra);
2903 if (elem != -1)
15a7d175 2904 {
fdcd177f
FS
2905 /* We found the entry. */
2906 idx = symb_table[2 * elem + 1];
2907 /* Skip the name of collating element name. */
2908 idx += 1 + extra[idx];
2909 /* Skip the byte sequence of the collating element. */
2910 idx += 1 + extra[idx];
2911 /* Adjust for the alignment. */
2912 idx = (idx + 3) & ~3;
2913 /* Skip the multibyte collation sequence value. */
2914 idx += sizeof (unsigned int);
2915 /* Skip the wide char sequence of the collating element. */
2916 idx += sizeof (unsigned int) *
2917 (1 + *(unsigned int *) (extra + idx));
2918 /* Return the collation sequence value. */
2919 return *(unsigned int *) (extra + idx);
15a7d175
UD
2920 }
2921 else if (sym_name_len == 1)
fdcd177f
FS
2922 {
2923 /* No valid character. Match it as a single byte
2924 character. */
2925 return collseqmb[br_elem->opr.name[0]];
2926 }
15a7d175 2927 }
fdcd177f
FS
2928 else if (sym_name_len == 1)
2929 return collseqmb[br_elem->opr.name[0]];
3b0bdc72 2930 }
fdcd177f
FS
2931 return UINT_MAX;
2932}
3b0bdc72 2933
fdcd177f
FS
2934/* Local function for parse_bracket_exp used in _LIBC environment.
2935 Build the range expression which starts from START_ELEM, and ends
2936 at END_ELEM. The result are written to MBCSET and SBCSET.
2937 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2938 mbcset->range_ends, is a pointer argument since we may
2939 update it. */
10677727 2940
fdcd177f
FS
2941static inline reg_errcode_t
2942__attribute__ ((always_inline))
be9ad300 2943build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
fdcd177f
FS
2944 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2945 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2946 const unsigned char *collseqmb, const char *collseqwc,
2947 int32_t table_size, const int32_t *symb_table,
2948 const unsigned char *extra)
2949{
2950 unsigned int ch;
2951 uint32_t start_collseq;
2952 uint32_t end_collseq;
10677727 2953
fdcd177f
FS
2954 /* Equivalence Classes and Character Classes can't be a range
2955 start/end. */
2956 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2957 || start_elem->type == CHAR_CLASS
2958 || end_elem->type == EQUIV_CLASS
2959 || end_elem->type == CHAR_CLASS))
2960 return REG_ERANGE;
10677727 2961
fdcd177f
FS
2962 /* FIXME: Implement rational ranges here, too. */
2963 start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2964 table_size, symb_table, extra);
2965 end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2966 table_size, symb_table, extra);
2967 /* Check start/end collation sequence values. */
2968 if (__glibc_unlikely (start_collseq == UINT_MAX
2969 || end_collseq == UINT_MAX))
2970 return REG_ECOLLATE;
2971 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2972 && start_collseq > end_collseq))
2973 return REG_ERANGE;
3b0bdc72 2974
fdcd177f
FS
2975 /* Got valid collation sequence values, add them as a new entry.
2976 However, if we have no collation elements, and the character set
2977 is single byte, the single byte character set that we
2978 build below suffices. */
2979 if (nrules > 0 || dfa->mb_cur_max > 1)
2980 {
2981 /* Check the space of the arrays. */
2982 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
15a7d175 2983 {
fdcd177f
FS
2984 /* There is not enough space, need realloc. */
2985 uint32_t *new_array_start;
2986 uint32_t *new_array_end;
be9ad300 2987 Idx new_nranges;
fdcd177f
FS
2988
2989 /* +1 in case of mbcset->nranges is 0. */
2990 new_nranges = 2 * mbcset->nranges + 1;
2991 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2992 new_nranges);
2993 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2994 new_nranges);
2995
2996 if (__glibc_unlikely (new_array_start == NULL
2997 || new_array_end == NULL))
2998 return REG_ESPACE;
2999
3000 mbcset->range_starts = new_array_start;
3001 mbcset->range_ends = new_array_end;
3002 *range_alloc = new_nranges;
15a7d175 3003 }
fdcd177f
FS
3004
3005 mbcset->range_starts[mbcset->nranges] = start_collseq;
3006 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3b0bdc72 3007 }
3b0bdc72 3008
fdcd177f
FS
3009 /* Build the table for single byte characters. */
3010 for (ch = 0; ch < SBC_MAX; ch++)
3011 {
3012 uint32_t ch_collseq;
3013 /* if (MB_CUR_MAX == 1) */
3014 if (nrules == 0)
3015 ch_collseq = collseqmb[ch];
3016 else
3017 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3018 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3019 bitset_set (sbcset, ch);
3020 }
3021 return REG_NOERROR;
3022}
3023
3024/* Local function for parse_bracket_exp used in _LIBC environment.
3025 Build the collating element which is represented by NAME.
3026 The result are written to MBCSET and SBCSET.
3027 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3028 pointer argument since we may update it. */
3b0bdc72 3029
fdcd177f
FS
3030static inline reg_errcode_t
3031__attribute__ ((always_inline))
3032build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
be9ad300 3033 Idx *coll_sym_alloc, const unsigned char *name,
fdcd177f
FS
3034 uint32_t nrules, int32_t table_size,
3035 const int32_t *symb_table, const unsigned char *extra)
3036{
3037 int32_t elem, idx;
3038 size_t name_len = strlen ((const char *) name);
3039 if (nrules != 0)
3b0bdc72 3040 {
fdcd177f
FS
3041 elem = seek_collating_symbol_entry (name, name_len, symb_table,
3042 table_size, extra);
3043 if (elem != -1)
15a7d175 3044 {
fdcd177f
FS
3045 /* We found the entry. */
3046 idx = symb_table[2 * elem + 1];
3047 /* Skip the name of collating element name. */
3048 idx += 1 + extra[idx];
3049 }
3050 else if (name_len == 1)
3051 {
3052 /* No valid character, treat it as a normal
3053 character. */
3054 bitset_set (sbcset, name[0]);
15a7d175
UD
3055 return REG_NOERROR;
3056 }
fdcd177f
FS
3057 else
3058 return REG_ECOLLATE;
3059
3060 /* Got valid collation sequence, add it as a new entry. */
3061 /* Check the space of the arrays. */
3062 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3063 {
3064 /* Not enough, realloc it. */
3065 /* +1 in case of mbcset->ncoll_syms is 0. */
be9ad300 3066 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
fdcd177f
FS
3067 /* Use realloc since mbcset->coll_syms is NULL
3068 if *alloc == 0. */
3069 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3070 new_coll_sym_alloc);
3071 if (__glibc_unlikely (new_coll_syms == NULL))
3072 return REG_ESPACE;
3073 mbcset->coll_syms = new_coll_syms;
3074 *coll_sym_alloc = new_coll_sym_alloc;
3075 }
3076 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3077 return REG_NOERROR;
3078 }
3079 else
3080 {
3081 if (__glibc_unlikely (name_len != 1))
3082 return REG_ECOLLATE;
3b0bdc72 3083 else
15a7d175 3084 {
fdcd177f
FS
3085 bitset_set (sbcset, name[0]);
3086 return REG_NOERROR;
15a7d175 3087 }
3b0bdc72 3088 }
fdcd177f
FS
3089}
3090#endif /* _LIBC */
3091
3092/* This function parse bracket expression like "[abc]", "[a-c]",
3093 "[[.a-a.]]" etc. */
3094
3095static bin_tree_t *
3096parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3097 reg_syntax_t syntax, reg_errcode_t *err)
3098{
3099#ifdef _LIBC
3100 const unsigned char *collseqmb;
3101 const char *collseqwc = NULL;
3102 uint32_t nrules;
3103 int32_t table_size = 0;
3104 const int32_t *symb_table = NULL;
3105 const unsigned char *extra = NULL;
434d3784
UD
3106#endif
3107
3b0bdc72
UD
3108 re_token_t br_token;
3109 re_bitset_ptr_t sbcset;
c0a0f9a3 3110#ifdef RE_ENABLE_I18N
3b0bdc72 3111 re_charset_t *mbcset;
eb04c213
AZ
3112 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3113 Idx equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3 3114#endif /* not RE_ENABLE_I18N */
eb04c213 3115 bool non_match = false;
c0a0f9a3 3116 bin_tree_t *work_tree;
ee70274a 3117 int token_len;
eb04c213 3118 bool first_round = true;
3b0bdc72 3119#ifdef _LIBC
62439eac
UD
3120 collseqmb = (const unsigned char *)
3121 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
3122 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3123 if (nrules)
3124 {
3125 /*
3126 if (MB_CUR_MAX > 1)
3127 */
7d4722e3 3128 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
3129 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3130 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3131 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 3132 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3133 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
3134 }
3135#endif
2c05d33f 3136 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
c0a0f9a3 3137#ifdef RE_ENABLE_I18N
3b0bdc72 3138 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3139#endif /* RE_ENABLE_I18N */
3140#ifdef RE_ENABLE_I18N
f4efbdfb 3141 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
c0a0f9a3 3142#else
f4efbdfb 3143 if (__glibc_unlikely (sbcset == NULL))
c0a0f9a3 3144#endif /* RE_ENABLE_I18N */
3b0bdc72 3145 {
a129c80d
UD
3146 re_free (sbcset);
3147#ifdef RE_ENABLE_I18N
3148 re_free (mbcset);
3149#endif
3b0bdc72
UD
3150 *err = REG_ESPACE;
3151 return NULL;
3152 }
3153
3154 token_len = peek_token_bracket (token, regexp, syntax);
f4efbdfb 3155 if (__glibc_unlikely (token->type == END_OF_RE))
3b0bdc72 3156 {
3b0bdc72 3157 *err = REG_BADPAT;
434d3784 3158 goto parse_bracket_exp_free_return;
3b0bdc72
UD
3159 }
3160 if (token->type == OP_NON_MATCH_LIST)
3161 {
c0a0f9a3 3162#ifdef RE_ENABLE_I18N
3b0bdc72 3163 mbcset->non_match = 1;
c0a0f9a3 3164#endif /* not RE_ENABLE_I18N */
eb04c213 3165 non_match = true;
3b0bdc72 3166 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
784aacea 3167 bitset_set (sbcset, '\n');
3b0bdc72
UD
3168 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3169 token_len = peek_token_bracket (token, regexp, syntax);
f4efbdfb 3170 if (__glibc_unlikely (token->type == END_OF_RE))
15a7d175
UD
3171 {
3172 *err = REG_BADPAT;
3173 goto parse_bracket_exp_free_return;
3174 }
3b0bdc72
UD
3175 }
3176
3177 /* We treat the first ']' as a normal character. */
3178 if (token->type == OP_CLOSE_BRACKET)
3179 token->type = CHARACTER;
3180
3181 while (1)
3182 {
3183 bracket_elem_t start_elem, end_elem;
62439eac
UD
3184 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3185 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72 3186 reg_errcode_t ret;
eb04c213
AZ
3187 int token_len2 = 0;
3188 bool is_range_exp = false;
3b0bdc72
UD
3189 re_token_t token2;
3190
3191 start_elem.opr.name = start_name_buf;
39a12f8d 3192 start_elem.type = COLL_SYM;
3b0bdc72 3193 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
78d8b07a 3194 syntax, first_round);
f4efbdfb 3195 if (__glibc_unlikely (ret != REG_NOERROR))
15a7d175
UD
3196 {
3197 *err = ret;
3198 goto parse_bracket_exp_free_return;
3199 }
eb04c213 3200 first_round = false;
3b0bdc72 3201
78d8b07a 3202 /* Get information about the next token. We need it in any case. */
3b0bdc72 3203 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a
UD
3204
3205 /* Do not check for ranges if we know they are not allowed. */
3206 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
15a7d175 3207 {
f4efbdfb 3208 if (__glibc_unlikely (token->type == END_OF_RE))
15a7d175 3209 {
78d8b07a 3210 *err = REG_EBRACK;
15a7d175
UD
3211 goto parse_bracket_exp_free_return;
3212 }
78d8b07a 3213 if (token->type == OP_CHARSET_RANGE)
15a7d175 3214 {
78d8b07a
UD
3215 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3216 token_len2 = peek_token_bracket (&token2, regexp, syntax);
f4efbdfb 3217 if (__glibc_unlikely (token2.type == END_OF_RE))
78d8b07a
UD
3218 {
3219 *err = REG_EBRACK;
3220 goto parse_bracket_exp_free_return;
3221 }
3222 if (token2.type == OP_CLOSE_BRACKET)
3223 {
3224 /* We treat the last '-' as a normal character. */
3225 re_string_skip_bytes (regexp, -token_len);
3226 token->type = CHARACTER;
3227 }
3228 else
eb04c213 3229 is_range_exp = true;
15a7d175 3230 }
15a7d175 3231 }
3b0bdc72 3232
eb04c213 3233 if (is_range_exp == true)
15a7d175
UD
3234 {
3235 end_elem.opr.name = end_name_buf;
39a12f8d 3236 end_elem.type = COLL_SYM;
15a7d175 3237 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
eb04c213 3238 dfa, syntax, true);
f4efbdfb 3239 if (__glibc_unlikely (ret != REG_NOERROR))
15a7d175
UD
3240 {
3241 *err = ret;
3242 goto parse_bracket_exp_free_return;
3243 }
3244
3245 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a 3246
10677727
UD
3247#ifdef _LIBC
3248 *err = build_range_exp (sbcset, mbcset, &range_alloc,
fdcd177f
FS
3249 &start_elem, &end_elem,
3250 dfa, syntax, nrules, collseqmb, collseqwc,
3251 table_size, symb_table, extra);
10677727
UD
3252#else
3253# ifdef RE_ENABLE_I18N
eb04c213 3254 *err = build_range_exp (syntax, sbcset,
10677727
UD
3255 dfa->mb_cur_max > 1 ? mbcset : NULL,
3256 &range_alloc, &start_elem, &end_elem);
3257# else
eb04c213 3258 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
10677727 3259# endif
c0a0f9a3 3260#endif /* RE_ENABLE_I18N */
f4efbdfb 3261 if (__glibc_unlikely (*err != REG_NOERROR))
15a7d175
UD
3262 goto parse_bracket_exp_free_return;
3263 }
3b0bdc72 3264 else
15a7d175
UD
3265 {
3266 switch (start_elem.type)
3267 {
3268 case SB_CHAR:
3269 bitset_set (sbcset, start_elem.opr.ch);
3270 break;
c0a0f9a3 3271#ifdef RE_ENABLE_I18N
15a7d175
UD
3272 case MB_CHAR:
3273 /* Check whether the array has enough space. */
f4efbdfb 3274 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
15a7d175 3275 {
951d6408 3276 wchar_t *new_mbchars;
15a7d175
UD
3277 /* Not enough, realloc it. */
3278 /* +1 in case of mbcset->nmbchars is 0. */
3279 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3280 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3281 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3282 mbchar_alloc);
f4efbdfb 3283 if (__glibc_unlikely (new_mbchars == NULL))
15a7d175 3284 goto parse_bracket_exp_espace;
951d6408 3285 mbcset->mbchars = new_mbchars;
15a7d175
UD
3286 }
3287 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3288 break;
c0a0f9a3 3289#endif /* RE_ENABLE_I18N */
15a7d175
UD
3290 case EQUIV_CLASS:
3291 *err = build_equiv_class (sbcset,
c0a0f9a3 3292#ifdef RE_ENABLE_I18N
15a7d175 3293 mbcset, &equiv_class_alloc,
c0a0f9a3 3294#endif /* RE_ENABLE_I18N */
3b0bdc72 3295 start_elem.opr.name);
f4efbdfb 3296 if (__glibc_unlikely (*err != REG_NOERROR))
15a7d175
UD
3297 goto parse_bracket_exp_free_return;
3298 break;
3299 case COLL_SYM:
3300 *err = build_collating_symbol (sbcset,
c0a0f9a3 3301#ifdef RE_ENABLE_I18N
15a7d175 3302 mbcset, &coll_sym_alloc,
c0a0f9a3 3303#endif /* RE_ENABLE_I18N */
fdcd177f
FS
3304 start_elem.opr.name,
3305 nrules, table_size, symb_table, extra);
f4efbdfb 3306 if (__glibc_unlikely (*err != REG_NOERROR))
15a7d175
UD
3307 goto parse_bracket_exp_free_return;
3308 break;
3309 case CHAR_CLASS:
66b110e8 3310 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3311#ifdef RE_ENABLE_I18N
7b7b9e70 3312 mbcset, &char_class_alloc,
c0a0f9a3 3313#endif /* RE_ENABLE_I18N */
eb04c213
AZ
3314 (const char *) start_elem.opr.name,
3315 syntax);
f4efbdfb 3316 if (__glibc_unlikely (*err != REG_NOERROR))
7b7b9e70 3317 goto parse_bracket_exp_free_return;
15a7d175
UD
3318 break;
3319 default:
2a0356e1 3320 DEBUG_ASSERT (false);
15a7d175
UD
3321 break;
3322 }
3323 }
f4efbdfb 3324 if (__glibc_unlikely (token->type == END_OF_RE))
78d8b07a
UD
3325 {
3326 *err = REG_EBRACK;
3327 goto parse_bracket_exp_free_return;
3328 }
3b0bdc72 3329 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3330 break;
3b0bdc72
UD
3331 }
3332
3333 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3334
3335 /* If it is non-matching list. */
c0a0f9a3 3336 if (non_match)
3b0bdc72 3337 bitset_not (sbcset);
10677727 3338
65e6becf
UD
3339#ifdef RE_ENABLE_I18N
3340 /* Ensure only single byte characters are set. */
3341 if (dfa->mb_cur_max > 1)
3342 bitset_mask (sbcset, dfa->sb_char);
3b0bdc72
UD
3343
3344 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3345 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3346 || mbcset->non_match)))
3b0bdc72 3347 {
3b0bdc72 3348 bin_tree_t *mbc_tree;
ad7f28c2 3349 int sbc_idx;
3b0bdc72 3350 /* Build a tree for complex bracket. */
ad7f28c2 3351 dfa->has_mb_node = 1;
02f3550c
UD
3352 br_token.type = COMPLEX_BRACKET;
3353 br_token.opr.mbcset = mbcset;
3354 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3355 if (__glibc_unlikely (mbc_tree == NULL))
02f3550c 3356 goto parse_bracket_exp_espace;
2c05d33f 3357 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
ad7f28c2
UD
3358 if (sbcset[sbc_idx])
3359 break;
3360 /* If there are no bits set in sbcset, there is no point
3361 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
2c05d33f 3362 if (sbc_idx < BITSET_WORDS)
02f3550c 3363 {
21f5de55
PE
3364 /* Build a tree for simple bracket. */
3365 br_token.type = SIMPLE_BRACKET;
3366 br_token.opr.sbcset = sbcset;
3367 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3368 if (__glibc_unlikely (work_tree == NULL))
21f5de55 3369 goto parse_bracket_exp_espace;
02f3550c 3370
21f5de55
PE
3371 /* Then join them by ALT node. */
3372 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
f4efbdfb 3373 if (__glibc_unlikely (work_tree == NULL))
21f5de55 3374 goto parse_bracket_exp_espace;
02f3550c
UD
3375 }
3376 else
ad7f28c2
UD
3377 {
3378 re_free (sbcset);
02f3550c 3379 work_tree = mbc_tree;
ad7f28c2 3380 }
3b0bdc72
UD
3381 }
3382 else
963d8d78 3383#endif /* not RE_ENABLE_I18N */
3b0bdc72 3384 {
963d8d78
UD
3385#ifdef RE_ENABLE_I18N
3386 free_charset (mbcset);
3387#endif
02f3550c
UD
3388 /* Build a tree for simple bracket. */
3389 br_token.type = SIMPLE_BRACKET;
3390 br_token.opr.sbcset = sbcset;
3391 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3392 if (__glibc_unlikely (work_tree == NULL))
21f5de55 3393 goto parse_bracket_exp_espace;
3b0bdc72 3394 }
02f3550c 3395 return work_tree;
3b0bdc72
UD
3396
3397 parse_bracket_exp_espace:
3b0bdc72 3398 *err = REG_ESPACE;
434d3784
UD
3399 parse_bracket_exp_free_return:
3400 re_free (sbcset);
c0a0f9a3 3401#ifdef RE_ENABLE_I18N
434d3784 3402 free_charset (mbcset);
c0a0f9a3 3403#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3404 return NULL;
3405}
3406
434d3784
UD
3407/* Parse an element in the bracket expression. */
3408
3b0bdc72 3409static reg_errcode_t
0fd8ae9c
UD
3410parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3411 re_token_t *token, int token_len, re_dfa_t *dfa,
eb04c213 3412 reg_syntax_t syntax, bool accept_hyphen)
3b0bdc72
UD
3413{
3414#ifdef RE_ENABLE_I18N
3415 int cur_char_size;
3416 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3417 if (cur_char_size > 1)
3418 {
3419 elem->type = MB_CHAR;
3420 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3421 re_string_skip_bytes (regexp, cur_char_size);
3422 return REG_NOERROR;
3423 }
3424#endif /* RE_ENABLE_I18N */
3425 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3426 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3427 || token->type == OP_OPEN_EQUIV_CLASS)
3428 return parse_bracket_symbol (elem, regexp, token);
f4efbdfb 3429 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
78d8b07a
UD
3430 {
3431 /* A '-' must only appear as anything but a range indicator before
3432 the closing bracket. Everything else is an error. */
3433 re_token_t token2;
3434 (void) peek_token_bracket (&token2, regexp, syntax);
3435 if (token2.type != OP_CLOSE_BRACKET)
3436 /* The actual error value is not standardized since this whole
3437 case is undefined. But ERANGE makes good sense. */
3438 return REG_ERANGE;
3439 }
3b0bdc72
UD
3440 elem->type = SB_CHAR;
3441 elem->opr.ch = token->opr.c;
3442 return REG_NOERROR;
3443}
3444
434d3784
UD
3445/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3446 such as [:<character_class>:], [.<collating_element>.], and
3447 [=<equivalent_class>=]. */
3448
3b0bdc72 3449static reg_errcode_t
0fd8ae9c
UD
3450parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3451 re_token_t *token)
3b0bdc72
UD
3452{
3453 unsigned char ch, delim = token->opr.c;
3454 int i = 0;
294b6bcc
UD
3455 if (re_string_eoi(regexp))
3456 return REG_EBRACK;
602c2f9d 3457 for (;; ++i)
3b0bdc72 3458 {
294b6bcc 3459 if (i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3460 return REG_EBRACK;
3b0bdc72 3461 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3462 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3463 else
15a7d175 3464 ch = re_string_fetch_byte (regexp);
294b6bcc
UD
3465 if (re_string_eoi(regexp))
3466 return REG_EBRACK;
3b0bdc72 3467 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3468 break;
3b0bdc72
UD
3469 elem->opr.name[i] = ch;
3470 }
3471 re_string_skip_bytes (regexp, 1);
3472 elem->opr.name[i] = '\0';
3473 switch (token->type)
3474 {
3475 case OP_OPEN_COLL_ELEM:
3476 elem->type = COLL_SYM;
3477 break;
3478 case OP_OPEN_EQUIV_CLASS:
3479 elem->type = EQUIV_CLASS;
3480 break;
3481 case OP_OPEN_CHAR_CLASS:
3482 elem->type = CHAR_CLASS;
3483 break;
3484 default:
3485 break;
3486 }
3487 return REG_NOERROR;
3488}
3489
3490 /* Helper function for parse_bracket_exp.
3491 Build the equivalence class which is represented by NAME.
3492 The result are written to MBCSET and SBCSET.
3493 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
d3821ab0 3494 is a pointer argument since we may update it. */
3b0bdc72
UD
3495
3496static reg_errcode_t
c0a0f9a3 3497#ifdef RE_ENABLE_I18N
0fd8ae9c 3498build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
eb04c213 3499 Idx *equiv_class_alloc, const unsigned char *name)
c0a0f9a3 3500#else /* not RE_ENABLE_I18N */
0fd8ae9c 3501build_equiv_class (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 3502#endif /* not RE_ENABLE_I18N */
3b0bdc72 3503{
0fd8ae9c 3504#ifdef _LIBC
3b0bdc72
UD
3505 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3506 if (nrules != 0)
3507 {
3508 const int32_t *table, *indirect;
3509 const unsigned char *weights, *extra, *cp;
62439eac 3510 unsigned char char_buf[2];
3b0bdc72
UD
3511 int32_t idx1, idx2;
3512 unsigned int ch;
3513 size_t len;
3b0bdc72 3514 /* Calculate the index for equivalence class. */
62439eac 3515 cp = name;
3b0bdc72
UD
3516 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3517 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3518 _NL_COLLATE_WEIGHTMB);
3519 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3520 _NL_COLLATE_EXTRAMB);
3b0bdc72 3521 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3522 _NL_COLLATE_INDIRECTMB);
8c0ab919 3523 idx1 = findidx (table, indirect, extra, &cp, -1);
f4efbdfb 3524 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
15a7d175
UD
3525 /* This isn't a valid character. */
3526 return REG_ECOLLATE;
3b0bdc72 3527
5069ff32 3528 /* Build single byte matching table for this equivalence class. */
b7d1c5fa 3529 len = weights[idx1 & 0xffffff];
3b0bdc72 3530 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3531 {
3532 char_buf[0] = ch;
3533 cp = char_buf;
8c0ab919 3534 idx2 = findidx (table, indirect, extra, &cp, 1);
3b0bdc72 3535/*
15a7d175 3536 idx2 = table[ch];
3b0bdc72 3537*/
15a7d175
UD
3538 if (idx2 == 0)
3539 /* This isn't a valid character. */
3540 continue;
b7d1c5fa
UD
3541 /* Compare only if the length matches and the collation rule
3542 index is the same. */
786658a0
FW
3543 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3544 && memcmp (weights + (idx1 & 0xffffff) + 1,
3545 weights + (idx2 & 0xffffff) + 1, len) == 0)
3546 bitset_set (sbcset, ch);
15a7d175 3547 }
a9388965 3548 /* Check whether the array has enough space. */
f4efbdfb 3549 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
15a7d175
UD
3550 {
3551 /* Not enough, realloc it. */
3552 /* +1 in case of mbcset->nequiv_classes is 0. */
eb04c213 3553 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
15a7d175 3554 /* Use realloc since the array is NULL if *alloc == 0. */
951d6408
UD
3555 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3556 int32_t,
3557 new_equiv_class_alloc);
f4efbdfb 3558 if (__glibc_unlikely (new_equiv_classes == NULL))
15a7d175 3559 return REG_ESPACE;
951d6408
UD
3560 mbcset->equiv_classes = new_equiv_classes;
3561 *equiv_class_alloc = new_equiv_class_alloc;
15a7d175 3562 }
3b0bdc72
UD
3563 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3564 }
3565 else
10677727 3566#endif /* _LIBC */
3b0bdc72 3567 {
f4efbdfb 3568 if (__glibc_unlikely (strlen ((const char *) name) != 1))
15a7d175 3569 return REG_ECOLLATE;
62439eac 3570 bitset_set (sbcset, *name);
3b0bdc72
UD
3571 }
3572 return REG_NOERROR;
3573}
3574
3575 /* Helper function for parse_bracket_exp.
3576 Build the character class which is represented by NAME.
3577 The result are written to MBCSET and SBCSET.
3578 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
d3821ab0 3579 is a pointer argument since we may update it. */
3b0bdc72
UD
3580
3581static reg_errcode_t
c0a0f9a3 3582#ifdef RE_ENABLE_I18N
0fd8ae9c 3583build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
eb04c213
AZ
3584 re_charset_t *mbcset, Idx *char_class_alloc,
3585 const char *class_name, reg_syntax_t syntax)
c0a0f9a3 3586#else /* not RE_ENABLE_I18N */
0fd8ae9c 3587build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
eb04c213 3588 const char *class_name, reg_syntax_t syntax)
c0a0f9a3 3589#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3590{
3591 int i;
eb04c213 3592 const char *name = class_name;
c0a0f9a3
UD
3593
3594 /* In case of REG_ICASE "upper" and "lower" match the both of
3595 upper and lower cases. */
3596 if ((syntax & RE_ICASE)
3597 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3598 name = "alpha";
3599
3600#ifdef RE_ENABLE_I18N
3b0bdc72 3601 /* Check the space of the arrays. */
f4efbdfb 3602 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
a9388965
UD
3603 {
3604 /* Not enough, realloc it. */
3605 /* +1 in case of mbcset->nchar_classes is 0. */
eb04c213 3606 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
a9388965 3607 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3608 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3609 new_char_class_alloc);
f4efbdfb 3610 if (__glibc_unlikely (new_char_classes == NULL))
15a7d175 3611 return REG_ESPACE;
951d6408
UD
3612 mbcset->char_classes = new_char_classes;
3613 *char_class_alloc = new_char_class_alloc;
a9388965 3614 }
3b0bdc72 3615 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3616#endif /* RE_ENABLE_I18N */
3b0bdc72 3617
66b110e8 3618#define BUILD_CHARCLASS_LOOP(ctype_func) \
513bbb25 3619 do { \
f4efbdfb 3620 if (__glibc_unlikely (trans != NULL)) \
0ecb606c 3621 { \
513bbb25 3622 for (i = 0; i < SBC_MAX; ++i) \
eb04c213 3623 if (ctype_func (i)) \
513bbb25
UD
3624 bitset_set (sbcset, trans[i]); \
3625 } \
3626 else \
3627 { \
3628 for (i = 0; i < SBC_MAX; ++i) \
eb04c213 3629 if (ctype_func (i)) \
513bbb25
UD
3630 bitset_set (sbcset, i); \
3631 } \
3632 } while (0)
3b0bdc72
UD
3633
3634 if (strcmp (name, "alnum") == 0)
513bbb25 3635 BUILD_CHARCLASS_LOOP (isalnum);
3b0bdc72 3636 else if (strcmp (name, "cntrl") == 0)
513bbb25 3637 BUILD_CHARCLASS_LOOP (iscntrl);
3b0bdc72 3638 else if (strcmp (name, "lower") == 0)
513bbb25 3639 BUILD_CHARCLASS_LOOP (islower);
3b0bdc72 3640 else if (strcmp (name, "space") == 0)
513bbb25 3641 BUILD_CHARCLASS_LOOP (isspace);
3b0bdc72 3642 else if (strcmp (name, "alpha") == 0)
513bbb25 3643 BUILD_CHARCLASS_LOOP (isalpha);
3b0bdc72 3644 else if (strcmp (name, "digit") == 0)
513bbb25 3645 BUILD_CHARCLASS_LOOP (isdigit);
3b0bdc72 3646 else if (strcmp (name, "print") == 0)
513bbb25 3647 BUILD_CHARCLASS_LOOP (isprint);
3b0bdc72 3648 else if (strcmp (name, "upper") == 0)
513bbb25 3649 BUILD_CHARCLASS_LOOP (isupper);
3b0bdc72 3650 else if (strcmp (name, "blank") == 0)
513bbb25 3651 BUILD_CHARCLASS_LOOP (isblank);
3b0bdc72 3652 else if (strcmp (name, "graph") == 0)
513bbb25 3653 BUILD_CHARCLASS_LOOP (isgraph);
3b0bdc72 3654 else if (strcmp (name, "punct") == 0)
513bbb25 3655 BUILD_CHARCLASS_LOOP (ispunct);
3b0bdc72 3656 else if (strcmp (name, "xdigit") == 0)
513bbb25 3657 BUILD_CHARCLASS_LOOP (isxdigit);
3b0bdc72
UD
3658 else
3659 return REG_ECTYPE;
3660
3661 return REG_NOERROR;
3662}
3663
3664static bin_tree_t *
0fd8ae9c 3665build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
eb04c213
AZ
3666 const char *class_name,
3667 const char *extra, bool non_match,
0fd8ae9c 3668 reg_errcode_t *err)
3b0bdc72
UD
3669{
3670 re_bitset_ptr_t sbcset;
c0a0f9a3 3671#ifdef RE_ENABLE_I18N
3b0bdc72 3672 re_charset_t *mbcset;
eb04c213 3673 Idx alloc = 0;
c0a0f9a3 3674#endif /* not RE_ENABLE_I18N */
3b0bdc72 3675 reg_errcode_t ret;
3b0bdc72 3676 bin_tree_t *tree;
3b0bdc72 3677
2c05d33f 3678 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
f4efbdfb 3679 if (__glibc_unlikely (sbcset == NULL))
3b0bdc72
UD
3680 {
3681 *err = REG_ESPACE;
3682 return NULL;
3683 }
c0a0f9a3 3684#ifdef RE_ENABLE_I18N
eb04c213 3685 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
f4efbdfb 3686 if (__glibc_unlikely (mbcset == NULL))
eb04c213
AZ
3687 {
3688 re_free (sbcset);
3689 *err = REG_ESPACE;
3690 return NULL;
3b0bdc72 3691 }
eb04c213
AZ
3692 mbcset->non_match = non_match;
3693#endif /* RE_ENABLE_I18N */
3b0bdc72 3694
602c2f9d 3695 /* We don't care the syntax in this case. */
66b110e8 3696 ret = build_charclass (trans, sbcset,
c0a0f9a3 3697#ifdef RE_ENABLE_I18N
15a7d175 3698 mbcset, &alloc,
c0a0f9a3 3699#endif /* RE_ENABLE_I18N */
e2b6bfa3 3700 class_name, 0);
c0a0f9a3 3701
f4efbdfb 3702 if (__glibc_unlikely (ret != REG_NOERROR))
3b0bdc72
UD
3703 {
3704 re_free (sbcset);
c0a0f9a3 3705#ifdef RE_ENABLE_I18N
3b0bdc72 3706 free_charset (mbcset);
c0a0f9a3 3707#endif /* RE_ENABLE_I18N */
7b7b9e70 3708 *err = ret;
3b0bdc72
UD
3709 return NULL;
3710 }
434d3784 3711 /* \w match '_' also. */
e2b6bfa3
UD
3712 for (; *extra; extra++)
3713 bitset_set (sbcset, *extra);
3b0bdc72
UD
3714
3715 /* If it is non-matching list. */
c0a0f9a3 3716 if (non_match)
3b0bdc72
UD
3717 bitset_not (sbcset);
3718
65e6becf
UD
3719#ifdef RE_ENABLE_I18N
3720 /* Ensure only single byte characters are set. */
3721 if (dfa->mb_cur_max > 1)
3722 bitset_mask (sbcset, dfa->sb_char);
3723#endif
3724
3b0bdc72 3725 /* Build a tree for simple bracket. */
2a0356e1 3726 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
02f3550c 3727 tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3728 if (__glibc_unlikely (tree == NULL))
3b0bdc72
UD
3729 goto build_word_op_espace;
3730
c0a0f9a3 3731#ifdef RE_ENABLE_I18N
3c0fb574 3732 if (dfa->mb_cur_max > 1)
3b0bdc72 3733 {
3b0bdc72
UD
3734 bin_tree_t *mbc_tree;
3735 /* Build a tree for complex bracket. */
3736 br_token.type = COMPLEX_BRACKET;
3737 br_token.opr.mbcset = mbcset;
3738 dfa->has_mb_node = 1;
02f3550c 3739 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
f4efbdfb 3740 if (__glibc_unlikely (mbc_tree == NULL))
15a7d175 3741 goto build_word_op_espace;
3b0bdc72 3742 /* Then join them by ALT node. */
02f3550c 3743 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
f4efbdfb 3744 if (__glibc_likely (mbc_tree != NULL))
15a7d175 3745 return tree;
3b0bdc72
UD
3746 }
3747 else
3748 {
3749 free_charset (mbcset);
3750 return tree;
3751 }
c0a0f9a3
UD
3752#else /* not RE_ENABLE_I18N */
3753 return tree;
3754#endif /* not RE_ENABLE_I18N */
3755
3b0bdc72
UD
3756 build_word_op_espace:
3757 re_free (sbcset);
c0a0f9a3 3758#ifdef RE_ENABLE_I18N
3b0bdc72 3759 free_charset (mbcset);
c0a0f9a3 3760#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3761 *err = REG_ESPACE;
3762 return NULL;
3763}
3764
3765/* This is intended for the expressions like "a{1,3}".
eb04c213
AZ
3766 Fetch a number from 'input', and return the number.
3767 Return -1 if the number field is empty like "{,1}".
3768 Return RE_DUP_MAX + 1 if the number field is too large.
3769 Return -2 if an error occurred. */
3b0bdc72 3770
eb04c213 3771static Idx
0fd8ae9c 3772fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3b0bdc72 3773{
eb04c213 3774 Idx num = -1;
3b0bdc72
UD
3775 unsigned char c;
3776 while (1)
3777 {
f0d77aa8 3778 fetch_token (token, input, syntax);
3b0bdc72 3779 c = token->opr.c;
f4efbdfb 3780 if (__glibc_unlikely (token->type == END_OF_RE))
15a7d175 3781 return -2;
3b0bdc72 3782 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3783 break;
602c2f9d 3784 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
eb04c213
AZ
3785 ? -2
3786 : num == -1
3787 ? c - '0'
3788 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3b0bdc72 3789 }
3b0bdc72
UD
3790 return num;
3791}
3792\f
c0a0f9a3 3793#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3794static void
3795free_charset (re_charset_t *cset)
3796{
3797 re_free (cset->mbchars);
c0a0f9a3 3798# ifdef _LIBC
3b0bdc72
UD
3799 re_free (cset->coll_syms);
3800 re_free (cset->equiv_classes);
fa67ba06 3801# endif
3b0bdc72
UD
3802 re_free (cset->range_starts);
3803 re_free (cset->range_ends);
3804 re_free (cset->char_classes);
3805 re_free (cset);
3806}
c0a0f9a3 3807#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3808\f
3809/* Functions for binary tree operation. */
3810
ee70274a 3811/* Create a tree node. */
3b0bdc72
UD
3812
3813static bin_tree_t *
0fd8ae9c
UD
3814create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3815 re_token_type_t type)
02f3550c 3816{
2a0356e1 3817 re_token_t t = { .type = type };
02f3550c
UD
3818 return create_token_tree (dfa, left, right, &t);
3819}
3820
3821static bin_tree_t *
0fd8ae9c
UD
3822create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3823 const re_token_t *token)
3b0bdc72
UD
3824{
3825 bin_tree_t *tree;
f4efbdfb 3826 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3b0bdc72 3827 {
ee70274a
UD
3828 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829
3830 if (storage == NULL)
3831 return NULL;
3832 storage->next = dfa->str_tree_storage;
3833 dfa->str_tree_storage = storage;
3834 dfa->str_tree_storage_idx = 0;
3b0bdc72 3835 }
ee70274a
UD
3836 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837
3b0bdc72
UD
3838 tree->parent = NULL;
3839 tree->left = left;
3840 tree->right = right;
02f3550c
UD
3841 tree->token = *token;
3842 tree->token.duplicated = 0;
3843 tree->token.opt_subexp = 0;
3844 tree->first = NULL;
3845 tree->next = NULL;
3846 tree->node_idx = -1;
3b0bdc72
UD
3847
3848 if (left != NULL)
3849 left->parent = tree;
3850 if (right != NULL)
3851 right->parent = tree;
3852 return tree;
3853}
3854
02f3550c
UD
3855/* Mark the tree SRC as an optional subexpression.
3856 To be called from preorder or postorder. */
3b0bdc72 3857
02f3550c 3858static reg_errcode_t
0fd8ae9c 3859mark_opt_subexp (void *extra, bin_tree_t *node)
f0d77aa8 3860{
eb04c213 3861 Idx idx = (uintptr_t) extra;
02f3550c
UD
3862 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863 node->token.opt_subexp = 1;
a334319f 3864
02f3550c 3865 return REG_NOERROR;
3b0bdc72
UD
3866}
3867
02f3550c 3868/* Free the allocated memory inside NODE. */
6b6557e8
UD
3869
3870static void
02f3550c 3871free_token (re_token_t *node)
6b6557e8 3872{
02f3550c
UD
3873#ifdef RE_ENABLE_I18N
3874 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3875 free_charset (node->opr.mbcset);
3876 else
3877#endif /* RE_ENABLE_I18N */
3878 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3879 re_free (node->opr.sbcset);
6b6557e8
UD
3880}
3881
02f3550c
UD
3882/* Worker function for tree walking. Free the allocated memory inside NODE
3883 and its children. */
6b6557e8 3884
02f3550c
UD
3885static reg_errcode_t
3886free_tree (void *extra, bin_tree_t *node)
6b6557e8 3887{
02f3550c
UD
3888 free_token (&node->token);
3889 return REG_NOERROR;
6b6557e8
UD
3890}
3891
ee70274a 3892
02f3550c
UD
3893/* Duplicate the node SRC, and return new node. This is a preorder
3894 visit similar to the one implemented by the generic visitor, but
3895 we need more infrastructure to maintain two parallel trees --- so,
3896 it's easier to duplicate. */
3b0bdc72
UD
3897
3898static bin_tree_t *
0fd8ae9c 3899duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3b0bdc72 3900{
02f3550c
UD
3901 const bin_tree_t *node;
3902 bin_tree_t *dup_root;
3903 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3b0bdc72 3904
02f3550c 3905 for (node = root; ; )
3b0bdc72 3906 {
02f3550c
UD
3907 /* Create a new tree and link it back to the current parent. */
3908 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909 if (*p_new == NULL)
ee70274a 3910 return NULL;
02f3550c
UD
3911 (*p_new)->parent = dup_node;
3912 (*p_new)->token.duplicated = 1;
3913 dup_node = *p_new;
3b0bdc72 3914
02f3550c
UD
3915 /* Go to the left node, or up and to the right. */
3916 if (node->left)
3917 {
3918 node = node->left;
3919 p_new = &dup_node->left;
3920 }
3921 else
3922 {
3923 const bin_tree_t *prev = NULL;
3924 while (node->right == prev || node->right == NULL)
3925 {
3926 prev = node;
3927 node = node->parent;
3928 dup_node = dup_node->parent;
3929 if (!node)
21f5de55 3930 return dup_root;
02f3550c
UD
3931 }
3932 node = node->right;
3933 p_new = &dup_node->right;
3934 }
3b0bdc72 3935 }
3b0bdc72 3936}