]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
Replace FSF snail mail address with URLs.
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
9f115170 2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
3b0bdc72 19
3b0bdc72 20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
01ed6ceb 21 size_t length, reg_syntax_t syntax);
3b0bdc72 22static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
23 const re_dfastate_t *init_state,
24 char *fastmap);
01ed6ceb 25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
c0a0f9a3 26#ifdef RE_ENABLE_I18N
3b0bdc72 27static void free_charset (re_charset_t *cset);
c0a0f9a3 28#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
02f3550c 34static reg_errcode_t analyze (regex_t *preg);
02f3550c
UD
35static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
2d87db5b 48static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
01ed6ceb 49static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
a7d5c291 50 unsigned int constraint);
3b0bdc72 51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
15a7d175 53 int node, int root);
963d8d78 54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
3b0bdc72 55static int fetch_number (re_string_t *input, re_token_t *token,
15a7d175 56 reg_syntax_t syntax);
3b0bdc72 57static int peek_token (re_token_t *token, re_string_t *input,
9b0c3f38 58 reg_syntax_t syntax) internal_function;
3b0bdc72 59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 60 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
3b0bdc72 64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175
UD
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
3b0bdc72 67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175
UD
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
3b0bdc72 70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
3b0bdc72 73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
3b0bdc72 79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
78d8b07a
UD
83 reg_syntax_t syntax,
84 int accept_hyphen);
3b0bdc72 85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
86 re_string_t *regexp,
87 re_token_t *token);
c0a0f9a3 88#ifdef RE_ENABLE_I18N
2c05d33f 89static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175
UD
90 re_charset_t *mbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
997470b3 93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 94 bitset_t sbcset,
15a7d175
UD
95 re_charset_t *mbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
c0a0f9a3 99#else /* not RE_ENABLE_I18N */
2c05d33f 100static reg_errcode_t build_equiv_class (bitset_t sbcset,
15a7d175 101 const unsigned char *name);
997470b3 102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2c05d33f 103 bitset_t sbcset,
15a7d175
UD
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
c0a0f9a3 106#endif /* not RE_ENABLE_I18N */
86576b62 107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
997470b3 108 RE_TRANSLATE_TYPE trans,
06e8303a 109 const unsigned char *class_name,
10677727
UD
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
ee70274a
UD
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
02f3550c
UD
114 re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
3b0bdc72 118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
02f3550c
UD
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
3b0bdc72
UD
122\f
123/* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
6455d255 128const char __re_error_msgid[] attribute_hidden =
3b0bdc72
UD
129 {
130#define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
6455d255 182const size_t __re_error_msgid_idx[] attribute_hidden =
3b0bdc72
UD
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202\f
203/* Entry points for GNU code. */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
207 Returns 0 if the pattern was valid, otherwise an error string.
208
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
211
212const char *
213re_compile_pattern (pattern, length, bufp)
214 const char *pattern;
215 size_t length;
216 struct re_pattern_buffer *bufp;
217{
218 reg_errcode_t ret;
219
3b0bdc72
UD
220 /* And GNU code determines whether or not to get register information
221 by passing null for the REGS argument to re_match, etc., not by
c06a6956
UD
222 setting no_sub, unless RE_NO_SUB is set. */
223 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
3b0bdc72
UD
224
225 /* Match anchors at newline. */
226 bufp->newline_anchor = 1;
227
75e4a282 228 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
229
230 if (!ret)
231 return NULL;
6455d255 232 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
233}
234#ifdef _LIBC
235weak_alias (__re_compile_pattern, re_compile_pattern)
236#endif
237
238/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241/* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243reg_syntax_t re_syntax_options;
244
245
246/* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
249
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
252
253reg_syntax_t
254re_set_syntax (syntax)
255 reg_syntax_t syntax;
256{
257 reg_syntax_t ret = re_syntax_options;
258
259 re_syntax_options = syntax;
260 return ret;
261}
262#ifdef _LIBC
263weak_alias (__re_set_syntax, re_set_syntax)
264#endif
265
266int
267re_compile_fastmap (bufp)
268 struct re_pattern_buffer *bufp;
269{
270 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
271 char *fastmap = bufp->fastmap;
272
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
283}
284#ifdef _LIBC
285weak_alias (__re_compile_fastmap, re_compile_fastmap)
286#endif
287
1b2c2628 288static inline void
25337753 289__attribute ((always_inline))
1b2c2628
UD
290re_set_fastmap (char *fastmap, int icase, int ch)
291{
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
295}
296
3b0bdc72
UD
297/* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
299
300static void
0fd8ae9c
UD
301re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
3b0bdc72
UD
303{
304 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
305 int node_cnt;
3c0fb574 306 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308 {
309 int node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
311
312 if (type == CHARACTER)
74e12fbc
UD
313 {
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315#ifdef RE_ENABLE_I18N
14744156 316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 317 {
3c0fb574 318 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
74e12fbc
UD
319 wchar_t wc;
320 mbstate_t state;
321
322 p = buf;
323 *p++ = dfa->nodes[node].opr.c;
324 while (++node < dfa->nodes_len
325 && dfa->nodes[node].type == CHARACTER
326 && dfa->nodes[node].mb_partial)
327 *p++ = dfa->nodes[node].opr.c;
2c05d33f 328 memset (&state, '\0', sizeof (state));
b3918c7d
UD
329 if (__mbrtowc (&wc, (const char *) buf, p - buf,
330 &state) == p - buf
88764ae2
UD
331 && (__wcrtomb ((char *) buf, towlower (wc), &state)
332 != (size_t) -1))
74e12fbc
UD
333 re_set_fastmap (fastmap, 0, buf[0]);
334 }
335#endif
336 }
3b0bdc72 337 else if (type == SIMPLE_BRACKET)
15a7d175 338 {
2c05d33f
UD
339 int i, ch;
340 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
341 {
342 int j;
343 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
344 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
345 if (w & ((bitset_word_t) 1 << j))
346 re_set_fastmap (fastmap, icase, ch);
347 }
15a7d175 348 }
c0a0f9a3 349#ifdef RE_ENABLE_I18N
3b0bdc72 350 else if (type == COMPLEX_BRACKET)
15a7d175 351 {
15a7d175 352 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
bdb56bac
UD
353 int i;
354
c0a0f9a3 355# ifdef _LIBC
bdb56bac
UD
356 /* See if we have to try all bytes which start multiple collation
357 elements.
358 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359 collation element, and don't catch 'b' since 'b' is
360 the only collation element which starts from 'b' (and
361 it is caught by SIMPLE_BRACKET). */
362 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
363 && (cset->ncoll_syms || cset->nranges))
15a7d175 364 {
15a7d175
UD
365 const int32_t *table = (const int32_t *)
366 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2c05d33f
UD
367 for (i = 0; i < SBC_MAX; ++i)
368 if (table[i] < 0)
369 re_set_fastmap (fastmap, icase, i);
15a7d175 370 }
bdb56bac
UD
371# endif /* _LIBC */
372
373 /* See if we have to start the match at all multibyte characters,
374 i.e. where we would not find an invalid sequence. This only
375 applies to multibyte character sets; for single byte character
376 sets, the SIMPLE_BRACKET again suffices. */
377 if (dfa->mb_cur_max > 1
815d8147 378 && (cset->nchar_classes || cset->non_match || cset->nranges
bdb56bac
UD
379# ifdef _LIBC
380 || cset->nequiv_classes
381# endif /* _LIBC */
382 ))
15a7d175 383 {
bdb56bac
UD
384 unsigned char c = 0;
385 do
74e12fbc 386 {
bdb56bac
UD
387 mbstate_t mbs;
388 memset (&mbs, 0, sizeof (mbs));
389 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
390 re_set_fastmap (fastmap, false, (int) c);
74e12fbc 391 }
bdb56bac 392 while (++c != 0);
15a7d175 393 }
bdb56bac
UD
394
395 else
396 {
397 /* ... Else catch all bytes which can start the mbchars. */
398 for (i = 0; i < cset->nmbchars; ++i)
399 {
400 char buf[256];
401 mbstate_t state;
402 memset (&state, '\0', sizeof (state));
403 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
404 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
405 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
406 {
407 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
408 != (size_t) -1)
409 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
410 }
aa732e2b
UD
411 }
412 }
15a7d175 413 }
c0a0f9a3 414#endif /* RE_ENABLE_I18N */
c0d5034e
UD
415 else if (type == OP_PERIOD
416#ifdef RE_ENABLE_I18N
417 || type == OP_UTF8_PERIOD
418#endif /* RE_ENABLE_I18N */
419 || type == END_OF_RE)
15a7d175
UD
420 {
421 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422 if (type == END_OF_RE)
423 bufp->can_be_null = 1;
424 return;
425 }
3b0bdc72
UD
426 }
427}
428\f
429/* Entry point for POSIX code. */
430/* regcomp takes a regular expression as a string and compiles it.
431
432 PREG is a regex_t *. We do not expect any fields to be initialized,
433 since POSIX says we shouldn't. Thus, we set
434
435 `buffer' to the compiled pattern;
436 `used' to the length of the compiled pattern;
437 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438 REG_EXTENDED bit in CFLAGS is set; otherwise, to
439 RE_SYNTAX_POSIX_BASIC;
440 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441 `fastmap' to an allocated space for the fastmap;
442 `fastmap_accurate' to zero;
443 `re_nsub' to the number of subexpressions in PATTERN.
444
445 PATTERN is the address of the pattern string.
446
447 CFLAGS is a series of bits which affect compilation.
448
449 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450 use POSIX basic syntax.
451
452 If REG_NEWLINE is set, then . and [^...] don't match newline.
453 Also, regexec will try a match beginning after every newline.
454
455 If REG_ICASE is set, then we considers upper- and lowercase
456 versions of letters to be equivalent when matching.
457
458 If REG_NOSUB is set, then when PREG is passed to regexec, that
459 routine will report only success or failure, and nothing about the
460 registers.
461
462 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
463 the return codes and their meanings.) */
464
465int
466regcomp (preg, pattern, cflags)
75e4a282
UD
467 regex_t *__restrict preg;
468 const char *__restrict pattern;
3b0bdc72
UD
469 int cflags;
470{
471 reg_errcode_t ret;
472 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 473 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
474
475 preg->buffer = NULL;
476 preg->allocated = 0;
477 preg->used = 0;
478
479 /* Try to allocate space for the fastmap. */
480 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 481 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
482 return REG_ESPACE;
483
484 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485
486 /* If REG_NEWLINE is set, newlines are treated differently. */
487 if (cflags & REG_NEWLINE)
488 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
489 syntax &= ~RE_DOT_NEWLINE;
490 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
491 /* It also changes the matching behavior. */
492 preg->newline_anchor = 1;
493 }
494 else
495 preg->newline_anchor = 0;
496 preg->no_sub = !!(cflags & REG_NOSUB);
497 preg->translate = NULL;
498
499 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500
501 /* POSIX doesn't distinguish between an unmatched open-group and an
502 unmatched close-group: both are REG_EPAREN. */
503 if (ret == REG_ERPAREN)
504 ret = REG_EPAREN;
505
a9388965 506 /* We have already checked preg->fastmap != NULL. */
bc15410e 507 if (BE (ret == REG_NOERROR, 1))
71ccd330 508 /* Compute the fastmap now, since regexec cannot modify the pattern
86576b62 509 buffer. This function never fails in this implementation. */
83b038f2 510 (void) re_compile_fastmap (preg);
71ccd330 511 else
3b0bdc72 512 {
71ccd330
UD
513 /* Some error occurred while compiling the expression. */
514 re_free (preg->fastmap);
515 preg->fastmap = NULL;
3b0bdc72
UD
516 }
517
518 return (int) ret;
519}
520#ifdef _LIBC
521weak_alias (__regcomp, regcomp)
522#endif
523
524/* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
526
527size_t
528regerror (errcode, preg, errbuf, errbuf_size)
529 int errcode;
2d87db5b
UD
530 const regex_t *__restrict preg;
531 char *__restrict errbuf;
3b0bdc72
UD
532 size_t errbuf_size;
533{
534 const char *msg;
535 size_t msg_size;
536
bc15410e 537 if (BE (errcode < 0
15a7d175
UD
538 || errcode >= (int) (sizeof (__re_error_msgid_idx)
539 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
540 /* Only error codes returned by the rest of the code should be passed
541 to this routine. If we are given anything else, or if other regex
542 code generates an invalid error code, then the program has a bug.
543 Dump core so we can fix it. */
544 abort ();
545
6455d255 546 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
547
548 msg_size = strlen (msg) + 1; /* Includes the null. */
549
bc15410e 550 if (BE (errbuf_size != 0, 1))
3b0bdc72 551 {
bc15410e 552 if (BE (msg_size > errbuf_size, 0))
15a7d175 553 {
3b0bdc72
UD
554#if defined HAVE_MEMPCPY || defined _LIBC
555 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556#else
15a7d175
UD
557 memcpy (errbuf, msg, errbuf_size - 1);
558 errbuf[errbuf_size - 1] = 0;
3b0bdc72 559#endif
15a7d175 560 }
3b0bdc72 561 else
15a7d175 562 memcpy (errbuf, msg, msg_size);
3b0bdc72
UD
563 }
564
565 return msg_size;
566}
567#ifdef _LIBC
568weak_alias (__regerror, regerror)
569#endif
570
3b0bdc72 571
e40a38b3
UD
572#ifdef RE_ENABLE_I18N
573/* This static array is used for the map to single-byte characters when
574 UTF-8 is used. Otherwise we would allocate memory just to initialize
575 it the same all the time. UTF-8 is the preferred encoding so this is
576 a worthwhile optimization. */
2c05d33f 577static const bitset_t utf8_sb_map =
e40a38b3
UD
578{
579 /* Set the first 128 bits. */
2c05d33f 580 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
e40a38b3
UD
581};
582#endif
583
584
71ccd330
UD
585static void
586free_dfa_content (re_dfa_t *dfa)
3b0bdc72
UD
587{
588 int i, j;
3b0bdc72 589
ee70274a
UD
590 if (dfa->nodes)
591 for (i = 0; i < dfa->nodes_len; ++i)
02f3550c 592 free_token (dfa->nodes + i);
71ccd330
UD
593 re_free (dfa->nexts);
594 for (i = 0; i < dfa->nodes_len; ++i)
595 {
596 if (dfa->eclosures != NULL)
597 re_node_set_free (dfa->eclosures + i);
598 if (dfa->inveclosures != NULL)
599 re_node_set_free (dfa->inveclosures + i);
600 if (dfa->edests != NULL)
601 re_node_set_free (dfa->edests + i);
602 }
603 re_free (dfa->edests);
604 re_free (dfa->eclosures);
605 re_free (dfa->inveclosures);
606 re_free (dfa->nodes);
3b0bdc72 607
ee70274a
UD
608 if (dfa->state_table)
609 for (i = 0; i <= dfa->state_hash_mask; ++i)
610 {
611 struct re_state_table_entry *entry = dfa->state_table + i;
612 for (j = 0; j < entry->num; ++j)
613 {
614 re_dfastate_t *state = entry->array[j];
615 free_state (state);
616 }
21f5de55 617 re_free (entry->array);
ee70274a 618 }
71ccd330 619 re_free (dfa->state_table);
65e6becf 620#ifdef RE_ENABLE_I18N
e40a38b3
UD
621 if (dfa->sb_char != utf8_sb_map)
622 re_free (dfa->sb_char);
a96c63ed 623#endif
c06a6956 624 re_free (dfa->subexp_map);
0742e48e 625#ifdef DEBUG
71ccd330 626 re_free (dfa->re_str);
0742e48e 627#endif
71ccd330
UD
628
629 re_free (dfa);
630}
631
632
633/* Free dynamically allocated space used by PREG. */
634
635void
636regfree (preg)
637 regex_t *preg;
638{
639 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
640 if (BE (dfa != NULL, 1))
641 free_dfa_content (dfa);
86576b62
UD
642 preg->buffer = NULL;
643 preg->allocated = 0;
71ccd330 644
3b0bdc72 645 re_free (preg->fastmap);
86576b62
UD
646 preg->fastmap = NULL;
647
648 re_free (preg->translate);
649 preg->translate = NULL;
3b0bdc72
UD
650}
651#ifdef _LIBC
652weak_alias (__regfree, regfree)
653#endif
654\f
655/* Entry points compatible with 4.2 BSD regex library. We don't define
656 them unless specifically requested. */
657
658#if defined _REGEX_RE_COMP || defined _LIBC
659
660/* BSD has one and only one pattern buffer. */
661static struct re_pattern_buffer re_comp_buf;
662
663char *
664# ifdef _LIBC
665/* Make these definitions weak in libc, so POSIX programs can redefine
666 these names if they don't use our functions, and still use
667 regcomp/regexec above without link errors. */
668weak_function
669# endif
670re_comp (s)
671 const char *s;
672{
673 reg_errcode_t ret;
240e87c2 674 char *fastmap;
3b0bdc72
UD
675
676 if (!s)
677 {
678 if (!re_comp_buf.buffer)
679 return gettext ("No previous regular expression");
680 return 0;
681 }
682
240e87c2
RM
683 if (re_comp_buf.buffer)
684 {
685 fastmap = re_comp_buf.fastmap;
686 re_comp_buf.fastmap = NULL;
687 __regfree (&re_comp_buf);
1b2c2628 688 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
689 re_comp_buf.fastmap = fastmap;
690 }
691
692 if (re_comp_buf.fastmap == NULL)
3b0bdc72
UD
693 {
694 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
695 if (re_comp_buf.fastmap == NULL)
6455d255
UD
696 return (char *) gettext (__re_error_msgid
697 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
698 }
699
700 /* Since `re_exec' always passes NULL for the `regs' argument, we
701 don't need to initialize the pattern buffer fields which affect it. */
702
703 /* Match anchors at newlines. */
704 re_comp_buf.newline_anchor = 1;
705
706 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
707
708 if (!ret)
709 return NULL;
710
711 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455d255 712 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 713}
240e87c2
RM
714
715#ifdef _LIBC
c877418f 716libc_freeres_fn (free_mem)
240e87c2
RM
717{
718 __regfree (&re_comp_buf);
719}
240e87c2
RM
720#endif
721
3b0bdc72
UD
722#endif /* _REGEX_RE_COMP */
723\f
724/* Internal entry point.
725 Compile the regular expression PATTERN, whose length is LENGTH.
726 SYNTAX indicate regular expression's syntax. */
727
728static reg_errcode_t
0fd8ae9c
UD
729re_compile_internal (regex_t *preg, const char * pattern, size_t length,
730 reg_syntax_t syntax)
3b0bdc72
UD
731{
732 reg_errcode_t err = REG_NOERROR;
733 re_dfa_t *dfa;
734 re_string_t regexp;
735
736 /* Initialize the pattern buffer. */
737 preg->fastmap_accurate = 0;
738 preg->syntax = syntax;
739 preg->not_bol = preg->not_eol = 0;
740 preg->used = 0;
741 preg->re_nsub = 0;
1b2c2628
UD
742 preg->can_be_null = 0;
743 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
744
745 /* Initialize the dfa. */
746 dfa = (re_dfa_t *) preg->buffer;
951d6408 747 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3b0bdc72
UD
748 {
749 /* If zero allocated, but buffer is non-null, try to realloc
750 enough space. This loses if buffer's address is bogus, but
751 that is the user's responsibility. If ->buffer is NULL this
752 is a simple allocation. */
753 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
754 if (dfa == NULL)
755 return REG_ESPACE;
3b0bdc72 756 preg->allocated = sizeof (re_dfa_t);
951d6408 757 preg->buffer = (unsigned char *) dfa;
3b0bdc72 758 }
3b0bdc72
UD
759 preg->used = sizeof (re_dfa_t);
760
761 err = init_dfa (dfa, length);
bc15410e 762 if (BE (err != REG_NOERROR, 0))
3b0bdc72 763 {
ee70274a 764 free_dfa_content (dfa);
3b0bdc72 765 preg->buffer = NULL;
ca3c505e 766 preg->allocated = 0;
3b0bdc72
UD
767 return err;
768 }
0742e48e 769#ifdef DEBUG
01ed6ceb 770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
0742e48e
UD
771 dfa->re_str = re_malloc (char, length + 1);
772 strncpy (dfa->re_str, pattern, length + 1);
773#endif
3b0bdc72 774
96c91d1c
UD
775 __libc_lock_init (dfa->lock);
776
612546c6 777 err = re_string_construct (&regexp, pattern, length, preg->translate,
f0c7c524 778 syntax & RE_ICASE, dfa);
bc15410e 779 if (BE (err != REG_NOERROR, 0))
3b0bdc72 780 {
ee70274a
UD
781 re_compile_internal_free_return:
782 free_workarea_compile (preg);
783 re_string_destruct (&regexp);
784 free_dfa_content (dfa);
3b0bdc72 785 preg->buffer = NULL;
ca3c505e 786 preg->allocated = 0;
3b0bdc72
UD
787 return err;
788 }
789
790 /* Parse the regular expression, and build a structure tree. */
791 preg->re_nsub = 0;
792 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 793 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
794 goto re_compile_internal_free_return;
795
02f3550c
UD
796 /* Analyze the tree and create the nfa. */
797 err = analyze (preg);
798 if (BE (err != REG_NOERROR, 0))
799 goto re_compile_internal_free_return;
800
14744156
UD
801#ifdef RE_ENABLE_I18N
802 /* If possible, do searching in single byte encoding to speed things up. */
ad7f28c2 803 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
14744156
UD
804 optimize_utf8 (dfa);
805#endif
806
3b0bdc72
UD
807 /* Then create the initial state of the dfa. */
808 err = create_initial_state (dfa);
3b0bdc72 809
3b0bdc72
UD
810 /* Release work areas. */
811 free_workarea_compile (preg);
812 re_string_destruct (&regexp);
813
71ccd330
UD
814 if (BE (err != REG_NOERROR, 0))
815 {
71ccd330
UD
816 free_dfa_content (dfa);
817 preg->buffer = NULL;
ca3c505e 818 preg->allocated = 0;
71ccd330
UD
819 }
820
3b0bdc72
UD
821 return err;
822}
823
824/* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
826
827static reg_errcode_t
0fd8ae9c 828init_dfa (re_dfa_t *dfa, size_t pat_len)
3b0bdc72 829{
01ed6ceb 830 unsigned int table_size;
e40a38b3
UD
831#ifndef _LIBC
832 char *codeset_name;
833#endif
81c64d40
UD
834
835 memset (dfa, '\0', sizeof (re_dfa_t));
836
ee70274a
UD
837 /* Force allocation of str_tree_storage the first time. */
838 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
839
01ed6ceb
UD
840 /* Avoid overflows. */
841 if (pat_len == SIZE_MAX)
842 return REG_ESPACE;
843
3b0bdc72
UD
844 dfa->nodes_alloc = pat_len + 1;
845 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
846
3b0bdc72 847 /* table_size = 2 ^ ceil(log pat_len) */
01ed6ceb 848 for (table_size = 1; ; table_size <<= 1)
3b0bdc72
UD
849 if (table_size > pat_len)
850 break;
851
852 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
853 dfa->state_hash_mask = table_size - 1;
854
3c0fb574
UD
855 dfa->mb_cur_max = MB_CUR_MAX;
856#ifdef _LIBC
bb3f4825 857 if (dfa->mb_cur_max == 6
3c0fb574
UD
858 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
859 dfa->is_utf8 = 1;
f0c7c524
UD
860 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
861 != 0);
e40a38b3
UD
862#else
863# ifdef HAVE_LANGINFO_CODESET
864 codeset_name = nl_langinfo (CODESET);
865# else
866 codeset_name = getenv ("LC_ALL");
ea161661 867 if (codeset_name == NULL || codeset_name[0] == '\0')
e40a38b3 868 codeset_name = getenv ("LC_CTYPE");
ea161661 869 if (codeset_name == NULL || codeset_name[0] == '\0')
e40a38b3
UD
870 codeset_name = getenv ("LANG");
871 if (codeset_name == NULL)
872 codeset_name = "";
873 else if (strchr (codeset_name, '.') != NULL)
874 codeset_name = strchr (codeset_name, '.') + 1;
875# endif
876
877 if (strcasecmp (codeset_name, "UTF-8") == 0
878 || strcasecmp (codeset_name, "UTF8") == 0)
879 dfa->is_utf8 = 1;
880
881 /* We check exhaustively in the loop below if this charset is a
882 superset of ASCII. */
883 dfa->map_notascii = 0;
3c0fb574 884#endif
e40a38b3 885
65e6becf
UD
886#ifdef RE_ENABLE_I18N
887 if (dfa->mb_cur_max > 1)
888 {
65e6becf 889 if (dfa->is_utf8)
e40a38b3 890 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
65e6becf 891 else
e40a38b3
UD
892 {
893 int i, j, ch;
894
2c05d33f 895 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
e40a38b3
UD
896 if (BE (dfa->sb_char == NULL, 0))
897 return REG_ESPACE;
898
2c05d33f
UD
899 /* Set the bits corresponding to single byte chars. */
900 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
e40a38b3 902 {
2d87db5b 903 wint_t wch = __btowc (ch);
e40a38b3 904 if (wch != WEOF)
2c05d33f 905 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
e40a38b3 906# ifndef _LIBC
2d87db5b 907 if (isascii (ch) && wch != ch)
e40a38b3
UD
908 dfa->map_notascii = 1;
909# endif
910 }
911 }
65e6becf
UD
912 }
913#endif
3c0fb574 914
ae73c6c1 915 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
ee70274a 916 return REG_ESPACE;
3b0bdc72
UD
917 return REG_NOERROR;
918}
919
920/* Initialize WORD_CHAR table, which indicate which character is
921 "word". In this case "word" means that it is the word construction
922 character used by some operators like "\<", "\>", etc. */
923
56b168be 924static void
9b0c3f38 925internal_function
0fd8ae9c 926init_word_char (re_dfa_t *dfa)
3b0bdc72 927{
56b168be 928 dfa->word_ops_used = 1;
9f115170
UD
929 int i = 0;
930 int ch = 0;
931 if (BE (dfa->map_notascii == 0, 1))
932 {
933 if (sizeof (dfa->word_char[0]) == 8)
934 {
935 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
936 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
937 i = 2;
938 }
939 else if (sizeof (dfa->word_char[0]) == 4)
940 {
941 dfa->word_char[0] = UINT32_C (0x00000000);
942 dfa->word_char[1] = UINT32_C (0x03ff0000);
943 dfa->word_char[2] = UINT32_C (0x87fffffe);
944 dfa->word_char[3] = UINT32_C (0x07fffffe);
945 i = 4;
946 }
947 else
948 abort ();
949 ch = 128;
950
951 if (BE (dfa->is_utf8, 1))
952 {
953 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
954 return;
955 }
956 }
957
958 for (; i < BITSET_WORDS; ++i)
959 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3b0bdc72 960 if (isalnum (ch) || ch == '_')
2c05d33f 961 dfa->word_char[i] |= (bitset_word_t) 1 << j;
3b0bdc72
UD
962}
963
964/* Free the work area which are only used while compiling. */
965
966static void
0fd8ae9c 967free_workarea_compile (regex_t *preg)
3b0bdc72
UD
968{
969 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
ee70274a
UD
970 bin_tree_storage_t *storage, *next;
971 for (storage = dfa->str_tree_storage; storage; storage = next)
972 {
973 next = storage->next;
974 re_free (storage);
975 }
976 dfa->str_tree_storage = NULL;
977 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3b0bdc72 978 dfa->str_tree = NULL;
a7d5c291
UD
979 re_free (dfa->org_indices);
980 dfa->org_indices = NULL;
3b0bdc72
UD
981}
982
983/* Create initial states for all contexts. */
984
985static reg_errcode_t
0fd8ae9c 986create_initial_state (re_dfa_t *dfa)
3b0bdc72
UD
987{
988 int first, i;
989 reg_errcode_t err;
990 re_node_set init_nodes;
991
992 /* Initial states have the epsilon closure of the node which is
993 the first node of the regular expression. */
02f3550c 994 first = dfa->str_tree->first->node_idx;
3b0bdc72
UD
995 dfa->init_node = first;
996 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 997 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
998 return err;
999
1000 /* The back-references which are in initial states can epsilon transit,
1001 since in this case all of the subexpressions can be null.
1002 Then we add epsilon closures of the nodes which are the next nodes of
1003 the back-references. */
1004 if (dfa->nbackref > 0)
1005 for (i = 0; i < init_nodes.nelem; ++i)
1006 {
15a7d175
UD
1007 int node_idx = init_nodes.elems[i];
1008 re_token_type_t type = dfa->nodes[node_idx].type;
1009
1010 int clexp_idx;
1011 if (type != OP_BACK_REF)
1012 continue;
1013 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1014 {
1015 re_token_t *clexp_node;
1016 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1017 if (clexp_node->type == OP_CLOSE_SUBEXP
ae73c6c1 1018 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
15a7d175
UD
1019 break;
1020 }
1021 if (clexp_idx == init_nodes.nelem)
1022 continue;
1023
1024 if (type == OP_BACK_REF)
1025 {
1026 int dest_idx = dfa->edests[node_idx].elems[0];
1027 if (!re_node_set_contains (&init_nodes, dest_idx))
1028 {
2da42bc0
UD
1029 reg_errcode_t err = re_node_set_merge (&init_nodes,
1030 dfa->eclosures
1031 + dest_idx);
1032 if (err != REG_NOERROR)
1033 return err;
15a7d175
UD
1034 i = 0;
1035 }
1036 }
3b0bdc72
UD
1037 }
1038
1039 /* It must be the first time to invoke acquire_state. */
a9388965
UD
1040 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1041 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 1042 if (BE (dfa->init_state == NULL, 0))
a9388965 1043 return err;
3b0bdc72
UD
1044 if (dfa->init_state->has_constraint)
1045 {
a9388965 1046 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1047 CONTEXT_WORD);
a9388965 1048 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1049 CONTEXT_NEWLINE);
a9388965 1050 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
1051 &init_nodes,
1052 CONTEXT_NEWLINE
1053 | CONTEXT_BEGBUF);
bc15410e 1054 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
15a7d175
UD
1055 || dfa->init_state_begbuf == NULL, 0))
1056 return err;
3b0bdc72
UD
1057 }
1058 else
1059 dfa->init_state_word = dfa->init_state_nl
1060 = dfa->init_state_begbuf = dfa->init_state;
1061
3b0bdc72
UD
1062 re_node_set_free (&init_nodes);
1063 return REG_NOERROR;
1064}
1065\f
14744156
UD
1066#ifdef RE_ENABLE_I18N
1067/* If it is possible to do searching in single byte encoding instead of UTF-8
1068 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1069 DFA nodes where needed. */
1070
1071static void
0fd8ae9c 1072optimize_utf8 (re_dfa_t *dfa)
14744156 1073{
ad7f28c2 1074 int node, i, mb_chars = 0, has_period = 0;
14744156
UD
1075
1076 for (node = 0; node < dfa->nodes_len; ++node)
1077 switch (dfa->nodes[node].type)
1078 {
1079 case CHARACTER:
14744156 1080 if (dfa->nodes[node].opr.c >= 0x80)
5f93cd52 1081 mb_chars = 1;
14744156
UD
1082 break;
1083 case ANCHOR:
bc3e1c12 1084 switch (dfa->nodes[node].opr.ctx_type)
14744156
UD
1085 {
1086 case LINE_FIRST:
1087 case LINE_LAST:
1088 case BUF_FIRST:
1089 case BUF_LAST:
1090 break;
1091 default:
0caca71a
UD
1092 /* Word anchors etc. cannot be handled. It's okay to test
1093 opr.ctx_type since constraints (for all DFA nodes) are
1094 created by ORing one or more opr.ctx_type values. */
14744156
UD
1095 return;
1096 }
1097 break;
ad7f28c2 1098 case OP_PERIOD:
21f5de55
PE
1099 has_period = 1;
1100 break;
14744156
UD
1101 case OP_BACK_REF:
1102 case OP_ALT:
1103 case END_OF_RE:
14744156 1104 case OP_DUP_ASTERISK:
14744156
UD
1105 case OP_OPEN_SUBEXP:
1106 case OP_CLOSE_SUBEXP:
1107 break;
02f3550c
UD
1108 case COMPLEX_BRACKET:
1109 return;
a8067e8f 1110 case SIMPLE_BRACKET:
2c05d33f
UD
1111 /* Just double check. The non-ASCII range starts at 0x80. */
1112 assert (0x80 % BITSET_WORD_BITS == 0);
21f5de55 1113 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
a8067e8f
UD
1114 if (dfa->nodes[node].opr.sbcset[i])
1115 return;
1116 break;
14744156 1117 default:
02f3550c 1118 abort ();
14744156
UD
1119 }
1120
ad7f28c2 1121 if (mb_chars || has_period)
5f93cd52 1122 for (node = 0; node < dfa->nodes_len; ++node)
ad7f28c2
UD
1123 {
1124 if (dfa->nodes[node].type == CHARACTER
1125 && dfa->nodes[node].opr.c >= 0x80)
1126 dfa->nodes[node].mb_partial = 0;
1127 else if (dfa->nodes[node].type == OP_PERIOD)
1128 dfa->nodes[node].type = OP_UTF8_PERIOD;
1129 }
5f93cd52 1130
14744156
UD
1131 /* The search can be in single byte locale. */
1132 dfa->mb_cur_max = 1;
1133 dfa->is_utf8 = 0;
ad7f28c2 1134 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
14744156
UD
1135}
1136#endif
1137\f
3b0bdc72
UD
1138/* Analyze the structure tree, and calculate "first", "next", "edest",
1139 "eclosure", and "inveclosure". */
1140
1141static reg_errcode_t
0fd8ae9c 1142analyze (regex_t *preg)
3b0bdc72 1143{
02f3550c 1144 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3b0bdc72
UD
1145 reg_errcode_t ret;
1146
1147 /* Allocate arrays. */
3b0bdc72 1148 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
a7d5c291 1149 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3b0bdc72
UD
1150 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1151 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
a7d5c291 1152 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
963d8d78 1153 || dfa->eclosures == NULL, 0))
3b0bdc72 1154 return REG_ESPACE;
02f3550c
UD
1155
1156 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1157 if (dfa->subexp_map != NULL)
3b0bdc72 1158 {
02f3550c
UD
1159 int i;
1160 for (i = 0; i < preg->re_nsub; i++)
1161 dfa->subexp_map[i] = i;
1162 preorder (dfa->str_tree, optimize_subexps, dfa);
1163 for (i = 0; i < preg->re_nsub; i++)
1164 if (dfa->subexp_map[i] != i)
1165 break;
1166 if (i == preg->re_nsub)
1167 {
1168 free (dfa->subexp_map);
1169 dfa->subexp_map = NULL;
1170 }
3b0bdc72
UD
1171 }
1172
02f3550c
UD
1173 ret = postorder (dfa->str_tree, lower_subexps, preg);
1174 if (BE (ret != REG_NOERROR, 0))
1175 return ret;
1176 ret = postorder (dfa->str_tree, calc_first, dfa);
1177 if (BE (ret != REG_NOERROR, 0))
1178 return ret;
1179 preorder (dfa->str_tree, calc_next, dfa);
1180 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1181 if (BE (ret != REG_NOERROR, 0))
1182 return ret;
1183 ret = calc_eclosure (dfa);
1184 if (BE (ret != REG_NOERROR, 0))
1185 return ret;
963d8d78
UD
1186
1187 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1188 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1189 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1190 || dfa->nbackref)
1191 {
1192 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1193 if (BE (dfa->inveclosures == NULL, 0))
21f5de55 1194 return REG_ESPACE;
963d8d78
UD
1195 ret = calc_inveclosure (dfa);
1196 }
1197
02f3550c
UD
1198 return ret;
1199}
1200
1201/* Our parse trees are very unbalanced, so we cannot use a stack to
1202 implement parse tree visits. Instead, we use parent pointers and
1203 some hairy code in these two functions. */
1204static reg_errcode_t
0fd8ae9c
UD
1205postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1206 void *extra)
02f3550c
UD
1207{
1208 bin_tree_t *node, *prev;
1209
1210 for (node = root; ; )
3b0bdc72 1211 {
02f3550c
UD
1212 /* Descend down the tree, preferably to the left (or to the right
1213 if that's the only child). */
1214 while (node->left || node->right)
1215 if (node->left)
21f5de55
PE
1216 node = node->left;
1217 else
1218 node = node->right;
02f3550c
UD
1219
1220 do
1221 {
1222 reg_errcode_t err = fn (extra, node);
1223 if (BE (err != REG_NOERROR, 0))
1224 return err;
21f5de55 1225 if (node->parent == NULL)
02f3550c
UD
1226 return REG_NOERROR;
1227 prev = node;
1228 node = node->parent;
1229 }
1230 /* Go up while we have a node that is reached from the right. */
1231 while (node->right == prev || node->right == NULL);
1232 node = node->right;
3b0bdc72 1233 }
3b0bdc72
UD
1234}
1235
02f3550c 1236static reg_errcode_t
0fd8ae9c
UD
1237preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238 void *extra)
02f3550c
UD
1239{
1240 bin_tree_t *node;
1241
1242 for (node = root; ; )
1243 {
1244 reg_errcode_t err = fn (extra, node);
1245 if (BE (err != REG_NOERROR, 0))
1246 return err;
0ecb606c 1247
02f3550c
UD
1248 /* Go to the left node, or up and to the right. */
1249 if (node->left)
1250 node = node->left;
1251 else
1252 {
1253 bin_tree_t *prev = NULL;
1254 while (node->right == prev || node->right == NULL)
1255 {
1256 prev = node;
1257 node = node->parent;
1258 if (!node)
21f5de55 1259 return REG_NOERROR;
02f3550c
UD
1260 }
1261 node = node->right;
1262 }
1263 }
1264}
1265
1266/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1267 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1268 backreferences as well. Requires a preorder visit. */
0ecb606c 1269static reg_errcode_t
0fd8ae9c 1270optimize_subexps (void *extra, bin_tree_t *node)
0ecb606c 1271{
02f3550c 1272 re_dfa_t *dfa = (re_dfa_t *) extra;
0ecb606c 1273
02f3550c 1274 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3b0bdc72 1275 {
02f3550c
UD
1276 int idx = node->token.opr.idx;
1277 node->token.opr.idx = dfa->subexp_map[idx];
1278 dfa->used_bkref_map |= 1 << node->token.opr.idx;
0ecb606c 1279 }
02f3550c
UD
1280
1281 else if (node->token.type == SUBEXP
21f5de55 1282 && node->left && node->left->token.type == SUBEXP)
0ecb606c 1283 {
02f3550c
UD
1284 int other_idx = node->left->token.opr.idx;
1285
1286 node->left = node->left->left;
1287 if (node->left)
21f5de55 1288 node->left->parent = node;
02f3550c
UD
1289
1290 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
2c05d33f
UD
1291 if (other_idx < BITSET_WORD_BITS)
1292 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3b0bdc72 1293 }
02f3550c 1294
3b0bdc72
UD
1295 return REG_NOERROR;
1296}
1297
02f3550c
UD
1298/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1299 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1300static reg_errcode_t
0fd8ae9c 1301lower_subexps (void *extra, bin_tree_t *node)
3b0bdc72 1302{
02f3550c
UD
1303 regex_t *preg = (regex_t *) extra;
1304 reg_errcode_t err = REG_NOERROR;
3b0bdc72 1305
02f3550c 1306 if (node->left && node->left->token.type == SUBEXP)
0ecb606c 1307 {
02f3550c
UD
1308 node->left = lower_subexp (&err, preg, node->left);
1309 if (node->left)
1310 node->left->parent = node;
1311 }
1312 if (node->right && node->right->token.type == SUBEXP)
1313 {
1314 node->right = lower_subexp (&err, preg, node->right);
1315 if (node->right)
1316 node->right->parent = node;
3b0bdc72 1317 }
3b0bdc72 1318
02f3550c
UD
1319 return err;
1320}
3b0bdc72 1321
02f3550c 1322static bin_tree_t *
0fd8ae9c 1323lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
0ecb606c 1324{
02f3550c
UD
1325 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1326 bin_tree_t *body = node->left;
1327 bin_tree_t *op, *cls, *tree1, *tree;
1328
1329 if (preg->no_sub
744eb12b
UD
1330 /* We do not optimize empty subexpressions, because otherwise we may
1331 have bad CONCAT nodes with NULL children. This is obviously not
1332 very common, so we do not lose much. An example that triggers
1333 this case is the sed "script" /\(\)/x. */
1334 && node->left != NULL
2c05d33f
UD
1335 && (node->token.opr.idx >= BITSET_WORD_BITS
1336 || !(dfa->used_bkref_map
1337 & ((bitset_word_t) 1 << node->token.opr.idx))))
02f3550c
UD
1338 return node->left;
1339
1340 /* Convert the SUBEXP node to the concatenation of an
1341 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1342 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1343 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1344 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1345 tree = create_tree (dfa, op, tree1, CONCAT);
1346 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
0ecb606c 1347 {
02f3550c
UD
1348 *err = REG_ESPACE;
1349 return NULL;
0ecb606c 1350 }
3b0bdc72 1351
02f3550c
UD
1352 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1353 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1354 return tree;
1355}
a334319f 1356
02f3550c
UD
1357/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1358 nodes. Requires a postorder visit. */
1359static reg_errcode_t
0fd8ae9c 1360calc_first (void *extra, bin_tree_t *node)
02f3550c
UD
1361{
1362 re_dfa_t *dfa = (re_dfa_t *) extra;
1363 if (node->token.type == CONCAT)
1364 {
1365 node->first = node->left->first;
1366 node->node_idx = node->left->node_idx;
1367 }
1368 else
1369 {
1370 node->first = node;
1371 node->node_idx = re_dfa_add_node (dfa, node->token);
1372 if (BE (node->node_idx == -1, 0))
21f5de55 1373 return REG_ESPACE;
0caca71a 1374 if (node->token.type == ANCHOR)
21f5de55 1375 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
02f3550c
UD
1376 }
1377 return REG_NOERROR;
1378}
1379
1380/* Pass 2: compute NEXT on the tree. Preorder visit. */
1381static reg_errcode_t
0fd8ae9c 1382calc_next (void *extra, bin_tree_t *node)
02f3550c
UD
1383{
1384 switch (node->token.type)
3b0bdc72
UD
1385 {
1386 case OP_DUP_ASTERISK:
02f3550c 1387 node->left->next = node;
3b0bdc72
UD
1388 break;
1389 case CONCAT:
02f3550c
UD
1390 node->left->next = node->right->first;
1391 node->right->next = node->next;
1392 break;
3b0bdc72 1393 default:
02f3550c
UD
1394 if (node->left)
1395 node->left->next = node->next;
1396 if (node->right)
21f5de55 1397 node->right->next = node->next;
3b0bdc72
UD
1398 break;
1399 }
02f3550c 1400 return REG_NOERROR;
3b0bdc72
UD
1401}
1402
02f3550c
UD
1403/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1404static reg_errcode_t
0fd8ae9c 1405link_nfa_nodes (void *extra, bin_tree_t *node)
a334319f 1406{
02f3550c
UD
1407 re_dfa_t *dfa = (re_dfa_t *) extra;
1408 int idx = node->node_idx;
1409 reg_errcode_t err = REG_NOERROR;
1410
1411 switch (node->token.type)
3b0bdc72 1412 {
02f3550c
UD
1413 case CONCAT:
1414 break;
1415
1416 case END_OF_RE:
1417 assert (node->next == NULL);
1418 break;
1419
1420 case OP_DUP_ASTERISK:
1421 case OP_ALT:
1422 {
1423 int left, right;
1424 dfa->has_plural_match = 1;
1425 if (node->left != NULL)
1426 left = node->left->first->node_idx;
1427 else
1428 left = node->next->node_idx;
1429 if (node->right != NULL)
1430 right = node->right->first->node_idx;
1431 else
1432 right = node->next->node_idx;
1433 assert (left > -1);
1434 assert (right > -1);
1435 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436 }
1437 break;
1438
1439 case ANCHOR:
1440 case OP_OPEN_SUBEXP:
1441 case OP_CLOSE_SUBEXP:
1442 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1443 break;
1444
1445 case OP_BACK_REF:
1446 dfa->nexts[idx] = node->next->node_idx;
1447 if (node->token.type == OP_BACK_REF)
2da42bc0 1448 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
02f3550c
UD
1449 break;
1450
1451 default:
1452 assert (!IS_EPSILON_NODE (node->token.type));
1453 dfa->nexts[idx] = node->next->node_idx;
1454 break;
3b0bdc72 1455 }
02f3550c
UD
1456
1457 return err;
3b0bdc72
UD
1458}
1459
485d775d
UD
1460/* Duplicate the epsilon closure of the node ROOT_NODE.
1461 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462 to their own constraint. */
1463
1464static reg_errcode_t
9b0c3f38 1465internal_function
0fd8ae9c
UD
1466duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1467 int root_node, unsigned int init_constraint)
485d775d 1468{
485d775d
UD
1469 int org_node, clone_node, ret;
1470 unsigned int constraint = init_constraint;
1471 for (org_node = top_org_node, clone_node = top_clone_node;;)
1472 {
1473 int org_dest, clone_dest;
1474 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1475 {
485d775d
UD
1476 /* If the back reference epsilon-transit, its destination must
1477 also have the constraint. Then duplicate the epsilon closure
1478 of the destination of the back reference, and store it in
1479 edests of the back reference. */
15a7d175
UD
1480 org_dest = dfa->nexts[org_node];
1481 re_node_set_empty (dfa->edests + clone_node);
2d87db5b
UD
1482 clone_dest = duplicate_node (dfa, org_dest, constraint);
1483 if (BE (clone_dest == -1, 0))
1484 return REG_ESPACE;
15a7d175
UD
1485 dfa->nexts[clone_node] = dfa->nexts[org_node];
1486 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487 if (BE (ret < 0, 0))
1488 return REG_ESPACE;
1489 }
485d775d 1490 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1491 {
485d775d
UD
1492 /* In case of the node can't epsilon-transit, don't duplicate the
1493 destination and store the original destination as the
1494 destination of the node. */
15a7d175
UD
1495 dfa->nexts[clone_node] = dfa->nexts[org_node];
1496 break;
1497 }
485d775d 1498 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1499 {
485d775d
UD
1500 /* In case of the node can epsilon-transit, and it has only one
1501 destination. */
15a7d175
UD
1502 org_dest = dfa->edests[org_node].elems[0];
1503 re_node_set_empty (dfa->edests + clone_node);
0caca71a
UD
1504 /* If the node is root_node itself, it means the epsilon clsoure
1505 has a loop. Then tie it to the destination of the root_node. */
1506 if (org_node == root_node && clone_node != org_node)
15a7d175 1507 {
0caca71a
UD
1508 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1509 if (BE (ret < 0, 0))
1510 return REG_ESPACE;
1511 break;
15a7d175 1512 }
0caca71a
UD
1513 /* In case of the node has another constraint, add it. */
1514 constraint |= dfa->nodes[org_node].constraint;
2d87db5b
UD
1515 clone_dest = duplicate_node (dfa, org_dest, constraint);
1516 if (BE (clone_dest == -1, 0))
1517 return REG_ESPACE;
15a7d175
UD
1518 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (ret < 0, 0))
1520 return REG_ESPACE;
1521 }
485d775d 1522 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1523 {
485d775d 1524 /* In case of the node can epsilon-transit, and it has two
02f3550c 1525 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
15a7d175
UD
1526 org_dest = dfa->edests[org_node].elems[0];
1527 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1528 /* Search for a duplicated node which satisfies the constraint. */
1529 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530 if (clone_dest == -1)
1531 {
0caca71a 1532 /* There is no such duplicated node, create a new one. */
2d87db5b
UD
1533 reg_errcode_t err;
1534 clone_dest = duplicate_node (dfa, org_dest, constraint);
1535 if (BE (clone_dest == -1, 0))
1536 return REG_ESPACE;
7de66108
UD
1537 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538 if (BE (ret < 0, 0))
1539 return REG_ESPACE;
1540 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1541 root_node, constraint);
1542 if (BE (err != REG_NOERROR, 0))
1543 return err;
a7d5c291
UD
1544 }
1545 else
1546 {
0caca71a 1547 /* There is a duplicated node which satisfies the constraint,
a7d5c291
UD
1548 use it to avoid infinite loop. */
1549 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1550 if (BE (ret < 0, 0))
1551 return REG_ESPACE;
1552 }
15a7d175
UD
1553
1554 org_dest = dfa->edests[org_node].elems[1];
2d87db5b
UD
1555 clone_dest = duplicate_node (dfa, org_dest, constraint);
1556 if (BE (clone_dest == -1, 0))
1557 return REG_ESPACE;
15a7d175
UD
1558 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 if (BE (ret < 0, 0))
1560 return REG_ESPACE;
1561 }
485d775d
UD
1562 org_node = org_dest;
1563 clone_node = clone_dest;
1564 }
1565 return REG_NOERROR;
1566}
1567
a7d5c291
UD
1568/* Search for a node which is duplicated from the node ORG_NODE, and
1569 satisfies the constraint CONSTRAINT. */
1570
1571static int
0fd8ae9c
UD
1572search_duplicated_node (const re_dfa_t *dfa, int org_node,
1573 unsigned int constraint)
a7d5c291
UD
1574{
1575 int idx;
1576 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1577 {
1578 if (org_node == dfa->org_indices[idx]
1579 && constraint == dfa->nodes[idx].constraint)
1580 return idx; /* Found. */
1581 }
1582 return -1; /* Not found. */
1583}
1584
a9388965 1585/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
2d87db5b
UD
1586 Return the index of the new node, or -1 if insufficient storage is
1587 available. */
a9388965 1588
2d87db5b 1589static int
0fd8ae9c 1590duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
3b0bdc72 1591{
02f3550c 1592 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
2d87db5b
UD
1593 if (BE (dup_idx != -1, 1))
1594 {
1595 dfa->nodes[dup_idx].constraint = constraint;
0caca71a 1596 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
2d87db5b
UD
1597 dfa->nodes[dup_idx].duplicated = 1;
1598
1599 /* Store the index of the original node. */
1600 dfa->org_indices[dup_idx] = org_idx;
1601 }
1602 return dup_idx;
3b0bdc72
UD
1603}
1604
963d8d78 1605static reg_errcode_t
0fd8ae9c 1606calc_inveclosure (re_dfa_t *dfa)
3b0bdc72 1607{
963d8d78
UD
1608 int src, idx, ret;
1609 for (idx = 0; idx < dfa->nodes_len; ++idx)
1610 re_node_set_init_empty (dfa->inveclosures + idx);
1611
3b0bdc72
UD
1612 for (src = 0; src < dfa->nodes_len; ++src)
1613 {
963d8d78 1614 int *elems = dfa->eclosures[src].elems;
3b0bdc72 1615 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175 1616 {
963d8d78
UD
1617 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1618 if (BE (ret == -1, 0))
1619 return REG_ESPACE;
15a7d175 1620 }
3b0bdc72 1621 }
963d8d78
UD
1622
1623 return REG_NOERROR;
3b0bdc72
UD
1624}
1625
1626/* Calculate "eclosure" for all the node in DFA. */
1627
1628static reg_errcode_t
0fd8ae9c 1629calc_eclosure (re_dfa_t *dfa)
3b0bdc72 1630{
485d775d 1631 int node_idx, incomplete;
3b0bdc72
UD
1632#ifdef DEBUG
1633 assert (dfa->nodes_len > 0);
1634#endif
485d775d 1635 incomplete = 0;
3b0bdc72 1636 /* For each nodes, calculate epsilon closure. */
485d775d 1637 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1638 {
a9388965 1639 reg_errcode_t err;
3b0bdc72 1640 re_node_set eclosure_elem;
485d775d 1641 if (node_idx == dfa->nodes_len)
15a7d175
UD
1642 {
1643 if (!incomplete)
1644 break;
1645 incomplete = 0;
1646 node_idx = 0;
1647 }
3b0bdc72
UD
1648
1649#ifdef DEBUG
3b0bdc72
UD
1650 assert (dfa->eclosures[node_idx].nelem != -1);
1651#endif
c06a6956 1652
3b0bdc72
UD
1653 /* If we have already calculated, skip it. */
1654 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1655 continue;
3b0bdc72 1656 /* Calculate epsilon closure of `node_idx'. */
a9388965 1657 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
bc15410e 1658 if (BE (err != REG_NOERROR, 0))
15a7d175 1659 return err;
3b0bdc72
UD
1660
1661 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175
UD
1662 {
1663 incomplete = 1;
1664 re_node_set_free (&eclosure_elem);
1665 }
3b0bdc72 1666 }
3b0bdc72
UD
1667 return REG_NOERROR;
1668}
1669
1670/* Calculate epsilon closure of NODE. */
1671
a9388965 1672static reg_errcode_t
0fd8ae9c 1673calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
3b0bdc72 1674{
a9388965 1675 reg_errcode_t err;
21f5de55 1676 int i;
3b0bdc72 1677 re_node_set eclosure;
21f5de55
PE
1678 int ret;
1679 int incomplete = 0;
a9388965 1680 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1681 if (BE (err != REG_NOERROR, 0))
a9388965 1682 return err;
3b0bdc72
UD
1683
1684 /* This indicates that we are calculating this node now.
1685 We reference this value to avoid infinite loop. */
1686 dfa->eclosures[node].nelem = -1;
1687
0caca71a
UD
1688 /* If the current node has constraints, duplicate all nodes
1689 since they must inherit the constraints. */
1690 if (dfa->nodes[node].constraint
b4ae56bd
UD
1691 && dfa->edests[node].nelem
1692 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
485d775d 1693 {
0caca71a
UD
1694 err = duplicate_node_closure (dfa, node, node, node,
1695 dfa->nodes[node].constraint);
485d775d 1696 if (BE (err != REG_NOERROR, 0))
15a7d175 1697 return err;
485d775d 1698 }
3b0bdc72
UD
1699
1700 /* Expand each epsilon destination nodes. */
485d775d 1701 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1702 for (i = 0; i < dfa->edests[node].nelem; ++i)
1703 {
15a7d175
UD
1704 re_node_set eclosure_elem;
1705 int edest = dfa->edests[node].elems[i];
1706 /* If calculating the epsilon closure of `edest' is in progress,
1707 return intermediate result. */
1708 if (dfa->eclosures[edest].nelem == -1)
1709 {
1710 incomplete = 1;
1711 continue;
1712 }
1713 /* If we haven't calculated the epsilon closure of `edest' yet,
1714 calculate now. Otherwise use calculated epsilon closure. */
1715 if (dfa->eclosures[edest].nelem == 0)
1716 {
1717 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1718 if (BE (err != REG_NOERROR, 0))
1719 return err;
1720 }
1721 else
1722 eclosure_elem = dfa->eclosures[edest];
1723 /* Merge the epsilon closure of `edest'. */
2da42bc0
UD
1724 err = re_node_set_merge (&eclosure, &eclosure_elem);
1725 if (BE (err != REG_NOERROR, 0))
1726 return err;
15a7d175
UD
1727 /* If the epsilon closure of `edest' is incomplete,
1728 the epsilon closure of this node is also incomplete. */
1729 if (dfa->eclosures[edest].nelem == 0)
1730 {
1731 incomplete = 1;
1732 re_node_set_free (&eclosure_elem);
1733 }
3b0bdc72
UD
1734 }
1735
21f5de55
PE
1736 /* An epsilon closure includes itself. */
1737 ret = re_node_set_insert (&eclosure, node);
1738 if (BE (ret < 0, 0))
1739 return REG_ESPACE;
3b0bdc72
UD
1740 if (incomplete && !root)
1741 dfa->eclosures[node].nelem = 0;
1742 else
1743 dfa->eclosures[node] = eclosure;
a9388965
UD
1744 *new_set = eclosure;
1745 return REG_NOERROR;
3b0bdc72
UD
1746}
1747\f
1748/* Functions for token which are used in the parser. */
1749
1750/* Fetch a token from INPUT.
1751 We must not use this function inside bracket expressions. */
1752
f0d77aa8 1753static void
9b0c3f38 1754internal_function
0fd8ae9c 1755fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
3b0bdc72 1756{
f0d77aa8 1757 re_string_skip_bytes (input, peek_token (result, input, syntax));
3b0bdc72
UD
1758}
1759
1760/* Peek a token from INPUT, and return the length of the token.
1761 We must not use this function inside bracket expressions. */
1762
1763static int
9b0c3f38 1764internal_function
0fd8ae9c 1765peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
1766{
1767 unsigned char c;
1768
1769 if (re_string_eoi (input))
1770 {
1771 token->type = END_OF_RE;
1772 return 0;
1773 }
1774
1775 c = re_string_peek_byte (input, 0);
1776 token->opr.c = c;
1777
65e6becf 1778 token->word_char = 0;
3b0bdc72
UD
1779#ifdef RE_ENABLE_I18N
1780 token->mb_partial = 0;
3c0fb574 1781 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1782 !re_string_first_byte (input, re_string_cur_idx (input)))
1783 {
1784 token->type = CHARACTER;
1785 token->mb_partial = 1;
1786 return 1;
1787 }
1788#endif
1789 if (c == '\\')
1790 {
1791 unsigned char c2;
1792 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1793 {
1794 token->type = BACK_SLASH;
1795 return 1;
1796 }
3b0bdc72
UD
1797
1798 c2 = re_string_peek_byte_case (input, 1);
1799 token->opr.c = c2;
1800 token->type = CHARACTER;
65e6becf
UD
1801#ifdef RE_ENABLE_I18N
1802 if (input->mb_cur_max > 1)
1803 {
1804 wint_t wc = re_string_wchar_at (input,
1805 re_string_cur_idx (input) + 1);
1806 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1807 }
1808 else
1809#endif
1810 token->word_char = IS_WORD_CHAR (c2) != 0;
1811
3b0bdc72 1812 switch (c2)
15a7d175
UD
1813 {
1814 case '|':
1815 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1816 token->type = OP_ALT;
1817 break;
1818 case '1': case '2': case '3': case '4': case '5':
1819 case '6': case '7': case '8': case '9':
1820 if (!(syntax & RE_NO_BK_REFS))
1821 {
1822 token->type = OP_BACK_REF;
ae73c6c1 1823 token->opr.idx = c2 - '1';
15a7d175
UD
1824 }
1825 break;
1826 case '<':
1827 if (!(syntax & RE_NO_GNU_OPS))
1828 {
1829 token->type = ANCHOR;
bb3f4825 1830 token->opr.ctx_type = WORD_FIRST;
15a7d175
UD
1831 }
1832 break;
1833 case '>':
1834 if (!(syntax & RE_NO_GNU_OPS))
1835 {
1836 token->type = ANCHOR;
bb3f4825 1837 token->opr.ctx_type = WORD_LAST;
15a7d175
UD
1838 }
1839 break;
1840 case 'b':
1841 if (!(syntax & RE_NO_GNU_OPS))
1842 {
1843 token->type = ANCHOR;
bb3f4825 1844 token->opr.ctx_type = WORD_DELIM;
15a7d175
UD
1845 }
1846 break;
1847 case 'B':
1848 if (!(syntax & RE_NO_GNU_OPS))
1849 {
1850 token->type = ANCHOR;
24992143 1851 token->opr.ctx_type = NOT_WORD_DELIM;
15a7d175
UD
1852 }
1853 break;
1854 case 'w':
1855 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = OP_WORD;
1857 break;
1858 case 'W':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = OP_NOTWORD;
1861 break;
e2b6bfa3
UD
1862 case 's':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_SPACE;
1865 break;
1866 case 'S':
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = OP_NOTSPACE;
1869 break;
15a7d175
UD
1870 case '`':
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 {
1873 token->type = ANCHOR;
bb3f4825 1874 token->opr.ctx_type = BUF_FIRST;
15a7d175
UD
1875 }
1876 break;
1877 case '\'':
1878 if (!(syntax & RE_NO_GNU_OPS))
1879 {
1880 token->type = ANCHOR;
bb3f4825 1881 token->opr.ctx_type = BUF_LAST;
15a7d175
UD
1882 }
1883 break;
1884 case '(':
1885 if (!(syntax & RE_NO_BK_PARENS))
1886 token->type = OP_OPEN_SUBEXP;
1887 break;
1888 case ')':
1889 if (!(syntax & RE_NO_BK_PARENS))
1890 token->type = OP_CLOSE_SUBEXP;
1891 break;
1892 case '+':
1893 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894 token->type = OP_DUP_PLUS;
1895 break;
1896 case '?':
1897 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898 token->type = OP_DUP_QUESTION;
1899 break;
1900 case '{':
1901 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902 token->type = OP_OPEN_DUP_NUM;
1903 break;
1904 case '}':
1905 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906 token->type = OP_CLOSE_DUP_NUM;
1907 break;
1908 default:
1909 break;
1910 }
3b0bdc72
UD
1911 return 2;
1912 }
1913
1914 token->type = CHARACTER;
65e6becf
UD
1915#ifdef RE_ENABLE_I18N
1916 if (input->mb_cur_max > 1)
1917 {
1918 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1919 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1920 }
1921 else
1922#endif
1923 token->word_char = IS_WORD_CHAR (token->opr.c);
1924
3b0bdc72
UD
1925 switch (c)
1926 {
1927 case '\n':
1928 if (syntax & RE_NEWLINE_ALT)
15a7d175 1929 token->type = OP_ALT;
3b0bdc72
UD
1930 break;
1931 case '|':
1932 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1933 token->type = OP_ALT;
3b0bdc72
UD
1934 break;
1935 case '*':
1936 token->type = OP_DUP_ASTERISK;
1937 break;
1938 case '+':
1939 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1940 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1941 break;
1942 case '?':
1943 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1944 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1945 break;
1946 case '{':
1947 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1948 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1949 break;
1950 case '}':
1951 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1952 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1953 break;
1954 case '(':
1955 if (syntax & RE_NO_BK_PARENS)
15a7d175 1956 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1957 break;
1958 case ')':
1959 if (syntax & RE_NO_BK_PARENS)
15a7d175 1960 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1961 break;
1962 case '[':
1963 token->type = OP_OPEN_BRACKET;
1964 break;
1965 case '.':
1966 token->type = OP_PERIOD;
1967 break;
1968 case '^':
134abcb5 1969 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
15a7d175
UD
1970 re_string_cur_idx (input) != 0)
1971 {
1972 char prev = re_string_peek_byte (input, -1);
134abcb5 1973 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1974 break;
1975 }
3b0bdc72 1976 token->type = ANCHOR;
bb3f4825 1977 token->opr.ctx_type = LINE_FIRST;
3b0bdc72
UD
1978 break;
1979 case '$':
1980 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
15a7d175
UD
1981 re_string_cur_idx (input) + 1 != re_string_length (input))
1982 {
1983 re_token_t next;
1984 re_string_skip_bytes (input, 1);
1985 peek_token (&next, input, syntax);
1986 re_string_skip_bytes (input, -1);
1987 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1988 break;
1989 }
3b0bdc72 1990 token->type = ANCHOR;
bb3f4825 1991 token->opr.ctx_type = LINE_LAST;
3b0bdc72
UD
1992 break;
1993 default:
1994 break;
1995 }
1996 return 1;
1997}
1998
1999/* Peek a token from INPUT, and return the length of the token.
2000 We must not use this function out of bracket expressions. */
2001
2002static int
9b0c3f38 2003internal_function
0fd8ae9c 2004peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
3b0bdc72
UD
2005{
2006 unsigned char c;
2007 if (re_string_eoi (input))
2008 {
2009 token->type = END_OF_RE;
2010 return 0;
2011 }
2012 c = re_string_peek_byte (input, 0);
2013 token->opr.c = c;
2014
2015#ifdef RE_ENABLE_I18N
3c0fb574 2016 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
2017 !re_string_first_byte (input, re_string_cur_idx (input)))
2018 {
2019 token->type = CHARACTER;
2020 return 1;
2021 }
2022#endif /* RE_ENABLE_I18N */
2023
294b6bcc
UD
2024 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2025 && re_string_cur_idx (input) + 1 < re_string_length (input))
3b0bdc72
UD
2026 {
2027 /* In this case, '\' escape a character. */
2028 unsigned char c2;
82bbb29e
UD
2029 re_string_skip_bytes (input, 1);
2030 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
2031 token->opr.c = c2;
2032 token->type = CHARACTER;
2033 return 1;
2034 }
2035 if (c == '[') /* '[' is a special char in a bracket exps. */
2036 {
2037 unsigned char c2;
2038 int token_len;
294b6bcc
UD
2039 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2040 c2 = re_string_peek_byte (input, 1);
2041 else
2042 c2 = 0;
3b0bdc72
UD
2043 token->opr.c = c2;
2044 token_len = 2;
2045 switch (c2)
15a7d175
UD
2046 {
2047 case '.':
2048 token->type = OP_OPEN_COLL_ELEM;
2049 break;
2050 case '=':
2051 token->type = OP_OPEN_EQUIV_CLASS;
2052 break;
2053 case ':':
2054 if (syntax & RE_CHAR_CLASSES)
2055 {
2056 token->type = OP_OPEN_CHAR_CLASS;
2057 break;
2058 }
2059 /* else fall through. */
2060 default:
2061 token->type = CHARACTER;
2062 token->opr.c = c;
2063 token_len = 1;
2064 break;
2065 }
3b0bdc72
UD
2066 return token_len;
2067 }
2068 switch (c)
2069 {
2070 case '-':
2071 token->type = OP_CHARSET_RANGE;
2072 break;
2073 case ']':
2074 token->type = OP_CLOSE_BRACKET;
2075 break;
2076 case '^':
2077 token->type = OP_NON_MATCH_LIST;
2078 break;
2079 default:
2080 token->type = CHARACTER;
2081 }
2082 return 1;
2083}
2084\f
2085/* Functions for parser. */
2086
2087/* Entry point of the parser.
2088 Parse the regular expression REGEXP and return the structure tree.
2089 If an error is occured, ERR is set by error code, and return NULL.
2090 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
2091 CAT
2092 / \
2093 / \
3b0bdc72
UD
2094 <reg_exp> EOR
2095
2096 CAT means concatenation.
2097 EOR means end of regular expression. */
2098
2099static bin_tree_t *
0fd8ae9c
UD
2100parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2101 reg_errcode_t *err)
3b0bdc72
UD
2102{
2103 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2104 bin_tree_t *tree, *eor, *root;
2105 re_token_t current_token;
56b168be 2106 dfa->syntax = syntax;
f0d77aa8 2107 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2108 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 2109 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72 2110 return NULL;
02f3550c 2111 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
3b0bdc72 2112 if (tree != NULL)
02f3550c 2113 root = create_tree (dfa, tree, eor, CONCAT);
3b0bdc72
UD
2114 else
2115 root = eor;
ee70274a 2116 if (BE (eor == NULL || root == NULL, 0))
485d775d
UD
2117 {
2118 *err = REG_ESPACE;
2119 return NULL;
2120 }
3b0bdc72
UD
2121 return root;
2122}
2123
2124/* This function build the following tree, from regular expression
2125 <branch1>|<branch2>:
15a7d175
UD
2126 ALT
2127 / \
2128 / \
3b0bdc72
UD
2129 <branch1> <branch2>
2130
2131 ALT means alternative, which represents the operator `|'. */
2132
2133static bin_tree_t *
0fd8ae9c
UD
2134parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2135 reg_syntax_t syntax, int nest, reg_errcode_t *err)
3b0bdc72
UD
2136{
2137 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2138 bin_tree_t *tree, *branch = NULL;
3b0bdc72 2139 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 2140 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2141 return NULL;
2142
2143 while (token->type == OP_ALT)
2144 {
f0d77aa8 2145 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2146 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
2147 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2148 {
2149 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2150 if (BE (*err != REG_NOERROR && branch == NULL, 0))
ee70274a 2151 return NULL;
15a7d175 2152 }
9b88fc16
UD
2153 else
2154 branch = NULL;
02f3550c 2155 tree = create_tree (dfa, tree, branch, OP_ALT);
ee70274a 2156 if (BE (tree == NULL, 0))
15a7d175
UD
2157 {
2158 *err = REG_ESPACE;
2159 return NULL;
2160 }
3b0bdc72
UD
2161 }
2162 return tree;
2163}
2164
2165/* This function build the following tree, from regular expression
2166 <exp1><exp2>:
15a7d175
UD
2167 CAT
2168 / \
3b0bdc72
UD
2169 / \
2170 <exp1> <exp2>
2171
2172 CAT means concatenation. */
2173
2174static bin_tree_t *
0fd8ae9c
UD
2175parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2176 reg_syntax_t syntax, int nest, reg_errcode_t *err)
3b0bdc72
UD
2177{
2178 bin_tree_t *tree, *exp;
ee70274a 2179 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3b0bdc72 2180 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2181 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2182 return NULL;
2183
2184 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 2185 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72
UD
2186 {
2187 exp = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2188 if (BE (*err != REG_NOERROR && exp == NULL, 0))
15a7d175 2189 {
b833d51f
UD
2190 if (tree != NULL)
2191 postorder (tree, free_tree, NULL);
15a7d175
UD
2192 return NULL;
2193 }
3b0bdc72 2194 if (tree != NULL && exp != NULL)
15a7d175 2195 {
e9b9cbf5
UD
2196 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2197 if (newtree == NULL)
15a7d175 2198 {
e9b9cbf5
UD
2199 postorder (exp, free_tree, NULL);
2200 postorder (tree, free_tree, NULL);
15a7d175
UD
2201 *err = REG_ESPACE;
2202 return NULL;
2203 }
e9b9cbf5 2204 tree = newtree;
15a7d175 2205 }
3b0bdc72 2206 else if (tree == NULL)
15a7d175 2207 tree = exp;
3b0bdc72
UD
2208 /* Otherwise exp == NULL, we don't need to create new tree. */
2209 }
2210 return tree;
2211}
2212
2213/* This function build the following tree, from regular expression a*:
15a7d175
UD
2214 *
2215 |
2216 a
3b0bdc72
UD
2217*/
2218
2219static bin_tree_t *
0fd8ae9c
UD
2220parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2221 reg_syntax_t syntax, int nest, reg_errcode_t *err)
3b0bdc72
UD
2222{
2223 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2224 bin_tree_t *tree;
3b0bdc72
UD
2225 switch (token->type)
2226 {
2227 case CHARACTER:
02f3550c 2228 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2229 if (BE (tree == NULL, 0))
15a7d175
UD
2230 {
2231 *err = REG_ESPACE;
2232 return NULL;
2233 }
3b0bdc72 2234#ifdef RE_ENABLE_I18N
3c0fb574 2235 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2236 {
2237 while (!re_string_eoi (regexp)
2238 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2239 {
15a7d175 2240 bin_tree_t *mbc_remain;
f0d77aa8 2241 fetch_token (token, regexp, syntax);
02f3550c
UD
2242 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2243 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
ee70274a 2244 if (BE (mbc_remain == NULL || tree == NULL, 0))
54e1cabc
UD
2245 {
2246 *err = REG_ESPACE;
2247 return NULL;
2248 }
15a7d175 2249 }
3b0bdc72
UD
2250 }
2251#endif
2252 break;
2253 case OP_OPEN_SUBEXP:
2254 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 2255 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2256 return NULL;
3b0bdc72
UD
2257 break;
2258 case OP_OPEN_BRACKET:
2259 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 2260 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2261 return NULL;
3b0bdc72
UD
2262 break;
2263 case OP_BACK_REF:
ae73c6c1 2264 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
15a7d175
UD
2265 {
2266 *err = REG_ESUBREG;
2267 return NULL;
2268 }
ae73c6c1 2269 dfa->used_bkref_map |= 1 << token->opr.idx;
02f3550c 2270 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2271 if (BE (tree == NULL, 0))
15a7d175
UD
2272 {
2273 *err = REG_ESPACE;
2274 return NULL;
2275 }
3b0bdc72
UD
2276 ++dfa->nbackref;
2277 dfa->has_mb_node = 1;
2278 break;
06e8303a
UD
2279 case OP_OPEN_DUP_NUM:
2280 if (syntax & RE_CONTEXT_INVALID_DUP)
2281 {
2282 *err = REG_BADRPT;
2283 return NULL;
2284 }
2285 /* FALLTHROUGH */
3b0bdc72
UD
2286 case OP_DUP_ASTERISK:
2287 case OP_DUP_PLUS:
2288 case OP_DUP_QUESTION:
3b0bdc72 2289 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2290 {
2291 *err = REG_BADRPT;
2292 return NULL;
2293 }
3b0bdc72 2294 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175 2295 {
f0d77aa8 2296 fetch_token (token, regexp, syntax);
15a7d175
UD
2297 return parse_expression (regexp, preg, token, syntax, nest, err);
2298 }
3b0bdc72
UD
2299 /* else fall through */
2300 case OP_CLOSE_SUBEXP:
2301 if ((token->type == OP_CLOSE_SUBEXP) &&
15a7d175
UD
2302 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2303 {
2304 *err = REG_ERPAREN;
2305 return NULL;
2306 }
3b0bdc72
UD
2307 /* else fall through */
2308 case OP_CLOSE_DUP_NUM:
2309 /* We treat it as a normal character. */
2310
2311 /* Then we can these characters as normal characters. */
2312 token->type = CHARACTER;
65e6becf
UD
2313 /* mb_partial and word_char bits should be initialized already
2314 by peek_token. */
02f3550c 2315 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2316 if (BE (tree == NULL, 0))
15a7d175
UD
2317 {
2318 *err = REG_ESPACE;
2319 return NULL;
2320 }
3b0bdc72
UD
2321 break;
2322 case ANCHOR:
bb3f4825 2323 if ((token->opr.ctx_type
24992143 2324 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
56b168be
UD
2325 && dfa->word_ops_used == 0)
2326 init_word_char (dfa);
24992143 2327 if (token->opr.ctx_type == WORD_DELIM
21f5de55 2328 || token->opr.ctx_type == NOT_WORD_DELIM)
15a7d175
UD
2329 {
2330 bin_tree_t *tree_first, *tree_last;
24992143
UD
2331 if (token->opr.ctx_type == WORD_DELIM)
2332 {
2333 token->opr.ctx_type = WORD_FIRST;
02f3550c 2334 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2335 token->opr.ctx_type = WORD_LAST;
21f5de55
PE
2336 }
2337 else
2338 {
24992143 2339 token->opr.ctx_type = INSIDE_WORD;
02f3550c 2340 tree_first = create_token_tree (dfa, NULL, NULL, token);
24992143 2341 token->opr.ctx_type = INSIDE_NOTWORD;
21f5de55 2342 }
02f3550c
UD
2343 tree_last = create_token_tree (dfa, NULL, NULL, token);
2344 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
ee70274a 2345 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
15a7d175
UD
2346 {
2347 *err = REG_ESPACE;
2348 return NULL;
2349 }
2350 }
3b0bdc72 2351 else
15a7d175 2352 {
02f3550c 2353 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2354 if (BE (tree == NULL, 0))
54e1cabc
UD
2355 {
2356 *err = REG_ESPACE;
2357 return NULL;
2358 }
15a7d175 2359 }
3b0bdc72 2360 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2361 by repetition operators.
2362 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2363 it must not be "<ANCHOR(^)><REPEAT(*)>". */
f0d77aa8 2364 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2365 return tree;
2366 case OP_PERIOD:
02f3550c 2367 tree = create_token_tree (dfa, NULL, NULL, token);
ee70274a 2368 if (BE (tree == NULL, 0))
15a7d175
UD
2369 {
2370 *err = REG_ESPACE;
2371 return NULL;
2372 }
3c0fb574 2373 if (dfa->mb_cur_max > 1)
15a7d175 2374 dfa->has_mb_node = 1;
3b0bdc72
UD
2375 break;
2376 case OP_WORD:
3b0bdc72 2377 case OP_NOTWORD:
266c1f50
UD
2378 tree = build_charclass_op (dfa, regexp->trans,
2379 (const unsigned char *) "alnum",
2380 (const unsigned char *) "_",
2381 token->type == OP_NOTWORD, err);
e2b6bfa3
UD
2382 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2383 return NULL;
2384 break;
2385 case OP_SPACE:
e2b6bfa3 2386 case OP_NOTSPACE:
266c1f50
UD
2387 tree = build_charclass_op (dfa, regexp->trans,
2388 (const unsigned char *) "space",
2389 (const unsigned char *) "",
2390 token->type == OP_NOTSPACE, err);
bc15410e 2391 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2392 return NULL;
3b0bdc72
UD
2393 break;
2394 case OP_ALT:
2395 case END_OF_RE:
2396 return NULL;
2397 case BACK_SLASH:
2398 *err = REG_EESCAPE;
2399 return NULL;
2400 default:
2401 /* Must not happen? */
2402#ifdef DEBUG
2403 assert (0);
2404#endif
2405 return NULL;
2406 }
f0d77aa8 2407 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2408
2409 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2410 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72
UD
2411 {
2412 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
bc15410e 2413 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2414 return NULL;
c34bfc8d
UD
2415 /* In BRE consecutive duplications are not allowed. */
2416 if ((syntax & RE_CONTEXT_INVALID_DUP)
2417 && (token->type == OP_DUP_ASTERISK
2418 || token->type == OP_OPEN_DUP_NUM))
2419 {
2420 *err = REG_BADRPT;
2421 return NULL;
2422 }
3b0bdc72
UD
2423 }
2424
2425 return tree;
2426}
2427
2428/* This function build the following tree, from regular expression
2429 (<reg_exp>):
15a7d175
UD
2430 SUBEXP
2431 |
2432 <reg_exp>
3b0bdc72
UD
2433*/
2434
2435static bin_tree_t *
0fd8ae9c
UD
2436parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2437 reg_syntax_t syntax, int nest, reg_errcode_t *err)
3b0bdc72
UD
2438{
2439 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02f3550c 2440 bin_tree_t *tree;
3b0bdc72
UD
2441 size_t cur_nsub;
2442 cur_nsub = preg->re_nsub++;
81c64d40 2443
f0d77aa8 2444 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2445
2446 /* The subexpression may be a null string. */
2447 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2448 tree = NULL;
3b0bdc72
UD
2449 else
2450 {
2451 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
02f3550c 2452 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
a129c80d
UD
2453 {
2454 if (tree != NULL)
e9b9cbf5 2455 postorder (tree, free_tree, NULL);
a129c80d
UD
2456 *err = REG_EPAREN;
2457 }
02f3550c 2458 if (BE (*err != REG_NOERROR, 0))
15a7d175 2459 return NULL;
3b0bdc72 2460 }
01ed6ceb
UD
2461
2462 if (cur_nsub <= '9' - '1')
2463 dfa->completed_bkref_map |= 1 << cur_nsub;
02f3550c
UD
2464
2465 tree = create_tree (dfa, tree, NULL, SUBEXP);
2466 if (BE (tree == NULL, 0))
485d775d
UD
2467 {
2468 *err = REG_ESPACE;
2469 return NULL;
2470 }
02f3550c 2471 tree->token.opr.idx = cur_nsub;
3b0bdc72
UD
2472 return tree;
2473}
2474
2475/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2476
2477static bin_tree_t *
0fd8ae9c
UD
2478parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72 2480{
20dc2f79 2481 bin_tree_t *tree = NULL, *old_tree = NULL;
6b6557e8 2482 int i, start, end, start_idx = re_string_cur_idx (regexp);
3b0bdc72 2483 re_token_t start_token = *token;
6b6557e8 2484
3b0bdc72
UD
2485 if (token->type == OP_OPEN_DUP_NUM)
2486 {
6b6557e8
UD
2487 end = 0;
2488 start = fetch_number (regexp, token, syntax);
3b0bdc72 2489 if (start == -1)
15a7d175
UD
2490 {
2491 if (token->type == CHARACTER && token->opr.c == ',')
2492 start = 0; /* We treat "{,m}" as "{0,m}". */
2493 else
2494 {
2495 *err = REG_BADBR; /* <re>{} is invalid. */
2496 return NULL;
2497 }
2498 }
602c2f9d 2499 if (BE (start != -2, 1))
15a7d175
UD
2500 {
2501 /* We treat "{n}" as "{n,n}". */
2502 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503 : ((token->type == CHARACTER && token->opr.c == ',')
2504 ? fetch_number (regexp, token, syntax) : -2));
2505 }
602c2f9d 2506 if (BE (start == -2 || end == -2, 0))
15a7d175
UD
2507 {
2508 /* Invalid sequence. */
6b6557e8
UD
2509 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2510 {
2511 if (token->type == END_OF_RE)
2512 *err = REG_EBRACE;
2513 else
2514 *err = REG_BADBR;
2515
2516 return NULL;
2517 }
2518
2519 /* If the syntax bit is set, rollback. */
2520 re_string_set_index (regexp, start_idx);
2521 *token = start_token;
2522 token->type = CHARACTER;
2523 /* mb_partial and word_char bits should be already initialized by
2524 peek_token. */
2525 return elem;
15a7d175 2526 }
6b6557e8 2527
52db8039 2528 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
15a7d175 2529 {
6b6557e8
UD
2530 /* First number greater than second. */
2531 *err = REG_BADBR;
15a7d175
UD
2532 return NULL;
2533 }
6b6557e8
UD
2534 }
2535 else
2536 {
2537 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2539 }
2540
20dc2f79
UD
2541 fetch_token (token, regexp, syntax);
2542
02f3550c 2543 if (BE (elem == NULL, 0))
20dc2f79 2544 return NULL;
02f3550c
UD
2545 if (BE (start == 0 && end == 0, 0))
2546 {
2547 postorder (elem, free_tree, NULL);
2548 return NULL;
2549 }
602c2f9d 2550
6b6557e8 2551 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
20dc2f79 2552 if (BE (start > 0, 0))
6b6557e8
UD
2553 {
2554 tree = elem;
2555 for (i = 2; i <= start; ++i)
7de66108 2556 {
6b6557e8 2557 elem = duplicate_tree (elem, dfa);
02f3550c 2558 tree = create_tree (dfa, tree, elem, CONCAT);
6b6557e8 2559 if (BE (elem == NULL || tree == NULL, 0))
7de66108
UD
2560 goto parse_dup_op_espace;
2561 }
6b6557e8 2562
20dc2f79
UD
2563 if (start == end)
2564 return tree;
a96c63ed 2565
20dc2f79
UD
2566 /* Duplicate ELEM before it is marked optional. */
2567 elem = duplicate_tree (elem, dfa);
2568 old_tree = tree;
2569 }
2570 else
2571 old_tree = NULL;
2572
02f3550c
UD
2573 if (elem->token.type == SUBEXP)
2574 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575
2576 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
20dc2f79
UD
2577 if (BE (tree == NULL, 0))
2578 goto parse_dup_op_espace;
2579
2580 /* This loop is actually executed only when end != -1,
2581 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2582 already created the start+1-th copy. */
2583 for (i = start + 2; i <= end; ++i)
2584 {
2585 elem = duplicate_tree (elem, dfa);
02f3550c 2586 tree = create_tree (dfa, tree, elem, CONCAT);
6b6557e8 2587 if (BE (elem == NULL || tree == NULL, 0))
21f5de55 2588 goto parse_dup_op_espace;
6b6557e8 2589
02f3550c 2590 tree = create_tree (dfa, tree, NULL, OP_ALT);
20dc2f79 2591 if (BE (tree == NULL, 0))
21f5de55 2592 goto parse_dup_op_espace;
3b0bdc72 2593 }
6b6557e8 2594
20dc2f79 2595 if (old_tree)
02f3550c 2596 tree = create_tree (dfa, old_tree, tree, CONCAT);
20dc2f79 2597
3b0bdc72
UD
2598 return tree;
2599
2600 parse_dup_op_espace:
3b0bdc72
UD
2601 *err = REG_ESPACE;
2602 return NULL;
3b0bdc72
UD
2603}
2604
2605/* Size of the names for collating symbol/equivalence_class/character_class.
2606 I'm not sure, but maybe enough. */
2607#define BRACKET_NAME_BUF_SIZE 32
2608
434d3784
UD
2609#ifndef _LIBC
2610 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2611 Build the range expression which starts from START_ELEM, and ends
2612 at END_ELEM. The result are written to MBCSET and SBCSET.
2613 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2614 mbcset->range_ends, is a pointer argument sinse we may
2615 update it. */
2616
2617static reg_errcode_t
9b0c3f38 2618internal_function
c0a0f9a3 2619# ifdef RE_ENABLE_I18N
0fd8ae9c
UD
2620build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2621 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
c0a0f9a3 2622# else /* not RE_ENABLE_I18N */
0fd8ae9c
UD
2623build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2624 bracket_elem_t *end_elem)
c0a0f9a3 2625# endif /* not RE_ENABLE_I18N */
434d3784
UD
2626{
2627 unsigned int start_ch, end_ch;
2628 /* Equivalence Classes and Character Classes can't be a range start/end. */
2629 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2630 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2631 0))
434d3784
UD
2632 return REG_ERANGE;
2633
2634 /* We can handle no multi character collating elements without libc
2635 support. */
62439eac 2636 if (BE ((start_elem->type == COLL_SYM
15a7d175
UD
2637 && strlen ((char *) start_elem->opr.name) > 1)
2638 || (end_elem->type == COLL_SYM
2639 && strlen ((char *) end_elem->opr.name) > 1), 0))
434d3784
UD
2640 return REG_ECOLLATE;
2641
2642# ifdef RE_ENABLE_I18N
2643 {
2d87db5b
UD
2644 wchar_t wc;
2645 wint_t start_wc;
2646 wint_t end_wc;
434d3784
UD
2647 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2648
2649 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2650 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2651 : 0));
434d3784 2652 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2653 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2654 : 0));
434d3784 2655 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
15a7d175 2656 ? __btowc (start_ch) : start_elem->opr.wch);
434d3784 2657 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
15a7d175 2658 ? __btowc (end_ch) : end_elem->opr.wch);
4bb333cd
UD
2659 if (start_wc == WEOF || end_wc == WEOF)
2660 return REG_ECOLLATE;
434d3784
UD
2661 cmp_buf[0] = start_wc;
2662 cmp_buf[4] = end_wc;
2663 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2664 return REG_ERANGE;
2665
10677727
UD
2666 /* Got valid collation sequence values, add them as a new entry.
2667 However, for !_LIBC we have no collation elements: if the
2668 character set is single byte, the single byte character set
2669 that we build below suffices. parse_bracket_exp passes
2670 no MBCSET if dfa->mb_cur_max == 1. */
2671 if (mbcset)
434d3784 2672 {
21f5de55
PE
2673 /* Check the space of the arrays. */
2674 if (BE (*range_alloc == mbcset->nranges, 0))
2675 {
10677727
UD
2676 /* There is not enough space, need realloc. */
2677 wchar_t *new_array_start, *new_array_end;
2678 int new_nranges;
2679
2680 /* +1 in case of mbcset->nranges is 0. */
2681 new_nranges = 2 * mbcset->nranges + 1;
2682 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2683 are NULL if *range_alloc == 0. */
2684 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
21f5de55 2685 new_nranges);
10677727 2686 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
21f5de55 2687 new_nranges);
10677727
UD
2688
2689 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2690 return REG_ESPACE;
2691
2692 mbcset->range_starts = new_array_start;
2693 mbcset->range_ends = new_array_end;
2694 *range_alloc = new_nranges;
21f5de55 2695 }
10677727 2696
21f5de55
PE
2697 mbcset->range_starts[mbcset->nranges] = start_wc;
2698 mbcset->range_ends[mbcset->nranges++] = end_wc;
434d3784
UD
2699 }
2700
434d3784 2701 /* Build the table for single byte characters. */
1e7947dc 2702 for (wc = 0; wc < SBC_MAX; ++wc)
434d3784 2703 {
15a7d175
UD
2704 cmp_buf[2] = wc;
2705 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2706 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2707 bitset_set (sbcset, wc);
434d3784
UD
2708 }
2709 }
2710# else /* not RE_ENABLE_I18N */
2711 {
2712 unsigned int ch;
2713 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2714 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2715 : 0));
434d3784 2716 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2717 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2718 : 0));
434d3784
UD
2719 if (start_ch > end_ch)
2720 return REG_ERANGE;
2721 /* Build the table for single byte characters. */
1e7947dc 2722 for (ch = 0; ch < SBC_MAX; ++ch)
434d3784 2723 if (start_ch <= ch && ch <= end_ch)
15a7d175 2724 bitset_set (sbcset, ch);
434d3784
UD
2725 }
2726# endif /* not RE_ENABLE_I18N */
2727 return REG_NOERROR;
2728}
2729#endif /* not _LIBC */
2730
2731#ifndef _LIBC
2732/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2733 Build the collating element which is represented by NAME.
2734 The result are written to MBCSET and SBCSET.
2735 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2736 pointer argument since we may update it. */
2737
2738static reg_errcode_t
9b0c3f38 2739internal_function
c0a0f9a3 2740# ifdef RE_ENABLE_I18N
0fd8ae9c
UD
2741build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2742 int *coll_sym_alloc, const unsigned char *name)
c0a0f9a3 2743# else /* not RE_ENABLE_I18N */
0fd8ae9c 2744build_collating_symbol (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 2745# endif /* not RE_ENABLE_I18N */
434d3784 2746{
62439eac
UD
2747 size_t name_len = strlen ((const char *) name);
2748 if (BE (name_len != 1, 0))
434d3784
UD
2749 return REG_ECOLLATE;
2750 else
2751 {
62439eac 2752 bitset_set (sbcset, name[0]);
434d3784
UD
2753 return REG_NOERROR;
2754 }
2755}
2756#endif /* not _LIBC */
2757
3b0bdc72
UD
2758/* This function parse bracket expression like "[abc]", "[a-c]",
2759 "[[.a-a.]]" etc. */
2760
2761static bin_tree_t *
0fd8ae9c
UD
2762parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2763 reg_syntax_t syntax, reg_errcode_t *err)
3b0bdc72
UD
2764{
2765#ifdef _LIBC
62439eac
UD
2766 const unsigned char *collseqmb;
2767 const char *collseqwc;
3b0bdc72
UD
2768 uint32_t nrules;
2769 int32_t table_size;
2770 const int32_t *symb_table;
2771 const unsigned char *extra;
2772
434d3784 2773 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2774 Seek the collating symbol entry correspondings to NAME.
2775 Return the index of the symbol in the SYMB_TABLE. */
2776
7090d3ca 2777 auto inline int32_t
25337753 2778 __attribute ((always_inline))
3b0bdc72 2779 seek_collating_symbol_entry (name, name_len)
15a7d175
UD
2780 const unsigned char *name;
2781 size_t name_len;
3b0bdc72 2782 {
62439eac 2783 int32_t hash = elem_hash ((const char *) name, name_len);
3b0bdc72 2784 int32_t elem = hash % table_size;
7d4722e3 2785 if (symb_table[2 * elem] != 0)
a334319f 2786 {
7d4722e3
UD
2787 int32_t second = hash % (table_size - 2) + 1;
2788
2789 do
15a7d175 2790 {
7d4722e3
UD
2791 /* First compare the hashing value. */
2792 if (symb_table[2 * elem] == hash
2793 /* Compare the length of the name. */
2794 && name_len == extra[symb_table[2 * elem + 1]]
2795 /* Compare the name. */
2796 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2797 name_len) == 0)
2798 {
2799 /* Yep, this is the entry. */
2800 break;
2801 }
a334319f 2802
7d4722e3
UD
2803 /* Next entry. */
2804 elem += second;
2805 }
2806 while (symb_table[2 * elem] != 0);
15a7d175 2807 }
3b0bdc72
UD
2808 return elem;
2809 }
2810
a532a41d 2811 /* Local function for parse_bracket_exp used in _LIBC environment.
3b0bdc72
UD
2812 Look up the collation sequence value of BR_ELEM.
2813 Return the value if succeeded, UINT_MAX otherwise. */
2814
7090d3ca 2815 auto inline unsigned int
25337753 2816 __attribute ((always_inline))
3b0bdc72 2817 lookup_collation_sequence_value (br_elem)
15a7d175 2818 bracket_elem_t *br_elem;
3b0bdc72
UD
2819 {
2820 if (br_elem->type == SB_CHAR)
15a7d175
UD
2821 {
2822 /*
2823 if (MB_CUR_MAX == 1)
2824 */
2825 if (nrules == 0)
2826 return collseqmb[br_elem->opr.ch];
2827 else
2828 {
2829 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2830 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2831 }
2832 }
3b0bdc72 2833 else if (br_elem->type == MB_CHAR)
15a7d175 2834 {
a532a41d
UD
2835 if (nrules != 0)
2836 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2837 }
3b0bdc72 2838 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2839 {
2840 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2841 if (nrules != 0)
2842 {
2843 int32_t elem, idx;
2844 elem = seek_collating_symbol_entry (br_elem->opr.name,
2845 sym_name_len);
2846 if (symb_table[2 * elem] != 0)
2847 {
2848 /* We found the entry. */
2849 idx = symb_table[2 * elem + 1];
2850 /* Skip the name of collating element name. */
2851 idx += 1 + extra[idx];
2852 /* Skip the byte sequence of the collating element. */
2853 idx += 1 + extra[idx];
2854 /* Adjust for the alignment. */
2855 idx = (idx + 3) & ~3;
2856 /* Skip the multibyte collation sequence value. */
2857 idx += sizeof (unsigned int);
2858 /* Skip the wide char sequence of the collating element. */
2859 idx += sizeof (unsigned int) *
2860 (1 + *(unsigned int *) (extra + idx));
2861 /* Return the collation sequence value. */
2862 return *(unsigned int *) (extra + idx);
2863 }
2864 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2865 {
2866 /* No valid character. Match it as a single byte
2867 character. */
2868 return collseqmb[br_elem->opr.name[0]];
2869 }
2870 }
2871 else if (sym_name_len == 1)
2872 return collseqmb[br_elem->opr.name[0]];
2873 }
3b0bdc72
UD
2874 return UINT_MAX;
2875 }
2876
434d3784 2877 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2878 Build the range expression which starts from START_ELEM, and ends
2879 at END_ELEM. The result are written to MBCSET and SBCSET.
2880 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2881 mbcset->range_ends, is a pointer argument sinse we may
2882 update it. */
2883
7090d3ca 2884 auto inline reg_errcode_t
25337753 2885 __attribute ((always_inline))
c0a0f9a3 2886 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
15a7d175
UD
2887 re_charset_t *mbcset;
2888 int *range_alloc;
2c05d33f 2889 bitset_t sbcset;
15a7d175 2890 bracket_elem_t *start_elem, *end_elem;
3b0bdc72
UD
2891 {
2892 unsigned int ch;
2893 uint32_t start_collseq;
2894 uint32_t end_collseq;
2895
434d3784 2896 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2897 start/end. */
bc15410e 2898 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2899 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2900 0))
2901 return REG_ERANGE;
3b0bdc72
UD
2902
2903 start_collseq = lookup_collation_sequence_value (start_elem);
2904 end_collseq = lookup_collation_sequence_value (end_elem);
2905 /* Check start/end collation sequence values. */
bc15410e 2906 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
15a7d175 2907 return REG_ECOLLATE;
bc15410e 2908 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
15a7d175 2909 return REG_ERANGE;
3b0bdc72 2910
10677727
UD
2911 /* Got valid collation sequence values, add them as a new entry.
2912 However, if we have no collation elements, and the character set
2913 is single byte, the single byte character set that we
2914 build below suffices. */
2915 if (nrules > 0 || dfa->mb_cur_max > 1)
2916 {
21f5de55
PE
2917 /* Check the space of the arrays. */
2918 if (BE (*range_alloc == mbcset->nranges, 0))
10677727
UD
2919 {
2920 /* There is not enough space, need realloc. */
2921 uint32_t *new_array_start;
2922 uint32_t *new_array_end;
2923 int new_nranges;
2924
2925 /* +1 in case of mbcset->nranges is 0. */
2926 new_nranges = 2 * mbcset->nranges + 1;
2927 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2928 new_nranges);
2929 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
21f5de55 2930 new_nranges);
10677727
UD
2931
2932 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
21f5de55 2933 return REG_ESPACE;
10677727
UD
2934
2935 mbcset->range_starts = new_array_start;
2936 mbcset->range_ends = new_array_end;
2937 *range_alloc = new_nranges;
2938 }
2939
21f5de55
PE
2940 mbcset->range_starts[mbcset->nranges] = start_collseq;
2941 mbcset->range_ends[mbcset->nranges++] = end_collseq;
10677727 2942 }
3b0bdc72
UD
2943
2944 /* Build the table for single byte characters. */
1e7947dc 2945 for (ch = 0; ch < SBC_MAX; ch++)
15a7d175
UD
2946 {
2947 uint32_t ch_collseq;
2948 /*
2949 if (MB_CUR_MAX == 1)
2950 */
2951 if (nrules == 0)
2952 ch_collseq = collseqmb[ch];
2953 else
25337753 2954 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
2955 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2956 bitset_set (sbcset, ch);
2957 }
3b0bdc72
UD
2958 return REG_NOERROR;
2959 }
3b0bdc72 2960
434d3784 2961 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2962 Build the collating element which is represented by NAME.
2963 The result are written to MBCSET and SBCSET.
2964 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2965 pointer argument sinse we may update it. */
2966
7090d3ca 2967 auto inline reg_errcode_t
25337753 2968 __attribute ((always_inline))
c0a0f9a3 2969 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
15a7d175
UD
2970 re_charset_t *mbcset;
2971 int *coll_sym_alloc;
2c05d33f 2972 bitset_t sbcset;
15a7d175 2973 const unsigned char *name;
3b0bdc72 2974 {
3b0bdc72 2975 int32_t elem, idx;
62439eac 2976 size_t name_len = strlen ((const char *) name);
3b0bdc72 2977 if (nrules != 0)
15a7d175
UD
2978 {
2979 elem = seek_collating_symbol_entry (name, name_len);
2980 if (symb_table[2 * elem] != 0)
2981 {
2982 /* We found the entry. */
2983 idx = symb_table[2 * elem + 1];
2984 /* Skip the name of collating element name. */
2985 idx += 1 + extra[idx];
2986 }
2987 else if (symb_table[2 * elem] == 0 && name_len == 1)
2988 {
2989 /* No valid character, treat it as a normal
2990 character. */
2991 bitset_set (sbcset, name[0]);
2992 return REG_NOERROR;
2993 }
2994 else
2995 return REG_ECOLLATE;
3b0bdc72 2996
15a7d175
UD
2997 /* Got valid collation sequence, add it as a new entry. */
2998 /* Check the space of the arrays. */
951d6408 2999 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
15a7d175
UD
3000 {
3001 /* Not enough, realloc it. */
3002 /* +1 in case of mbcset->ncoll_syms is 0. */
951d6408 3003 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
15a7d175
UD
3004 /* Use realloc since mbcset->coll_syms is NULL
3005 if *alloc == 0. */
951d6408
UD
3006 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3007 new_coll_sym_alloc);
3008 if (BE (new_coll_syms == NULL, 0))
15a7d175 3009 return REG_ESPACE;
951d6408
UD
3010 mbcset->coll_syms = new_coll_syms;
3011 *coll_sym_alloc = new_coll_sym_alloc;
15a7d175
UD
3012 }
3013 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
15a7d175
UD
3014 return REG_NOERROR;
3015 }
3b0bdc72 3016 else
15a7d175
UD
3017 {
3018 if (BE (name_len != 1, 0))
3019 return REG_ECOLLATE;
3020 else
3021 {
3022 bitset_set (sbcset, name[0]);
3023 return REG_NOERROR;
3024 }
3025 }
3b0bdc72 3026 }
434d3784
UD
3027#endif
3028
3b0bdc72
UD
3029 re_token_t br_token;
3030 re_bitset_ptr_t sbcset;
c0a0f9a3 3031#ifdef RE_ENABLE_I18N
3b0bdc72 3032 re_charset_t *mbcset;
3b0bdc72
UD
3033 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3034 int equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3 3035#endif /* not RE_ENABLE_I18N */
10677727 3036 int non_match = 0;
c0a0f9a3 3037 bin_tree_t *work_tree;
ee70274a 3038 int token_len;
8cae99db 3039 int first_round = 1;
3b0bdc72 3040#ifdef _LIBC
62439eac
UD
3041 collseqmb = (const unsigned char *)
3042 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
3043 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3044 if (nrules)
3045 {
3046 /*
3047 if (MB_CUR_MAX > 1)
3048 */
7d4722e3 3049 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
3050 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3051 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3052 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 3053 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3054 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
3055 }
3056#endif
2c05d33f 3057 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
c0a0f9a3 3058#ifdef RE_ENABLE_I18N
3b0bdc72 3059 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3060#endif /* RE_ENABLE_I18N */
3061#ifdef RE_ENABLE_I18N
bc15410e 3062 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3063#else
3064 if (BE (sbcset == NULL, 0))
3065#endif /* RE_ENABLE_I18N */
3b0bdc72 3066 {
a129c80d
UD
3067 re_free (sbcset);
3068#ifdef RE_ENABLE_I18N
3069 re_free (mbcset);
3070#endif
3b0bdc72
UD
3071 *err = REG_ESPACE;
3072 return NULL;
3073 }
3074
3075 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3076 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 3077 {
3b0bdc72 3078 *err = REG_BADPAT;
434d3784 3079 goto parse_bracket_exp_free_return;
3b0bdc72
UD
3080 }
3081 if (token->type == OP_NON_MATCH_LIST)
3082 {
c0a0f9a3 3083#ifdef RE_ENABLE_I18N
3b0bdc72 3084 mbcset->non_match = 1;
c0a0f9a3 3085#endif /* not RE_ENABLE_I18N */
10677727 3086 non_match = 1;
3b0bdc72 3087 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
784aacea 3088 bitset_set (sbcset, '\n');
3b0bdc72
UD
3089 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3090 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3091 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
3092 {
3093 *err = REG_BADPAT;
3094 goto parse_bracket_exp_free_return;
3095 }
3b0bdc72
UD
3096 }
3097
3098 /* We treat the first ']' as a normal character. */
3099 if (token->type == OP_CLOSE_BRACKET)
3100 token->type = CHARACTER;
3101
3102 while (1)
3103 {
3104 bracket_elem_t start_elem, end_elem;
62439eac
UD
3105 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3106 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72
UD
3107 reg_errcode_t ret;
3108 int token_len2 = 0, is_range_exp = 0;
3109 re_token_t token2;
3110
3111 start_elem.opr.name = start_name_buf;
3112 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
78d8b07a 3113 syntax, first_round);
bc15410e 3114 if (BE (ret != REG_NOERROR, 0))
15a7d175
UD
3115 {
3116 *err = ret;
3117 goto parse_bracket_exp_free_return;
3118 }
78d8b07a 3119 first_round = 0;
3b0bdc72 3120
78d8b07a 3121 /* Get information about the next token. We need it in any case. */
3b0bdc72 3122 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a
UD
3123
3124 /* Do not check for ranges if we know they are not allowed. */
3125 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
15a7d175 3126 {
15a7d175
UD
3127 if (BE (token->type == END_OF_RE, 0))
3128 {
78d8b07a 3129 *err = REG_EBRACK;
15a7d175
UD
3130 goto parse_bracket_exp_free_return;
3131 }
78d8b07a 3132 if (token->type == OP_CHARSET_RANGE)
15a7d175 3133 {
78d8b07a
UD
3134 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3135 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3136 if (BE (token2.type == END_OF_RE, 0))
3137 {
3138 *err = REG_EBRACK;
3139 goto parse_bracket_exp_free_return;
3140 }
3141 if (token2.type == OP_CLOSE_BRACKET)
3142 {
3143 /* We treat the last '-' as a normal character. */
3144 re_string_skip_bytes (regexp, -token_len);
3145 token->type = CHARACTER;
3146 }
3147 else
3148 is_range_exp = 1;
15a7d175 3149 }
15a7d175 3150 }
3b0bdc72
UD
3151
3152 if (is_range_exp == 1)
15a7d175
UD
3153 {
3154 end_elem.opr.name = end_name_buf;
3155 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
78d8b07a 3156 dfa, syntax, 1);
15a7d175
UD
3157 if (BE (ret != REG_NOERROR, 0))
3158 {
3159 *err = ret;
3160 goto parse_bracket_exp_free_return;
3161 }
3162
3163 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a 3164
10677727
UD
3165#ifdef _LIBC
3166 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3167 &start_elem, &end_elem);
3168#else
3169# ifdef RE_ENABLE_I18N
15a7d175 3170 *err = build_range_exp (sbcset,
10677727
UD
3171 dfa->mb_cur_max > 1 ? mbcset : NULL,
3172 &range_alloc, &start_elem, &end_elem);
3173# else
3174 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3175# endif
c0a0f9a3 3176#endif /* RE_ENABLE_I18N */
15a7d175
UD
3177 if (BE (*err != REG_NOERROR, 0))
3178 goto parse_bracket_exp_free_return;
3179 }
3b0bdc72 3180 else
15a7d175
UD
3181 {
3182 switch (start_elem.type)
3183 {
3184 case SB_CHAR:
3185 bitset_set (sbcset, start_elem.opr.ch);
3186 break;
c0a0f9a3 3187#ifdef RE_ENABLE_I18N
15a7d175
UD
3188 case MB_CHAR:
3189 /* Check whether the array has enough space. */
951d6408 3190 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
15a7d175 3191 {
951d6408 3192 wchar_t *new_mbchars;
15a7d175
UD
3193 /* Not enough, realloc it. */
3194 /* +1 in case of mbcset->nmbchars is 0. */
3195 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3196 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3197 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3198 mbchar_alloc);
3199 if (BE (new_mbchars == NULL, 0))
15a7d175 3200 goto parse_bracket_exp_espace;
951d6408 3201 mbcset->mbchars = new_mbchars;
15a7d175
UD
3202 }
3203 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3204 break;
c0a0f9a3 3205#endif /* RE_ENABLE_I18N */
15a7d175
UD
3206 case EQUIV_CLASS:
3207 *err = build_equiv_class (sbcset,
c0a0f9a3 3208#ifdef RE_ENABLE_I18N
15a7d175 3209 mbcset, &equiv_class_alloc,
c0a0f9a3 3210#endif /* RE_ENABLE_I18N */
3b0bdc72 3211 start_elem.opr.name);
15a7d175
UD
3212 if (BE (*err != REG_NOERROR, 0))
3213 goto parse_bracket_exp_free_return;
3214 break;
3215 case COLL_SYM:
3216 *err = build_collating_symbol (sbcset,
c0a0f9a3 3217#ifdef RE_ENABLE_I18N
15a7d175 3218 mbcset, &coll_sym_alloc,
c0a0f9a3 3219#endif /* RE_ENABLE_I18N */
3b0bdc72 3220 start_elem.opr.name);
15a7d175
UD
3221 if (BE (*err != REG_NOERROR, 0))
3222 goto parse_bracket_exp_free_return;
3223 break;
3224 case CHAR_CLASS:
66b110e8 3225 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3226#ifdef RE_ENABLE_I18N
7b7b9e70 3227 mbcset, &char_class_alloc,
c0a0f9a3 3228#endif /* RE_ENABLE_I18N */
7b7b9e70
UD
3229 start_elem.opr.name, syntax);
3230 if (BE (*err != REG_NOERROR, 0))
3231 goto parse_bracket_exp_free_return;
15a7d175
UD
3232 break;
3233 default:
3234 assert (0);
3235 break;
3236 }
3237 }
78d8b07a
UD
3238 if (BE (token->type == END_OF_RE, 0))
3239 {
3240 *err = REG_EBRACK;
3241 goto parse_bracket_exp_free_return;
3242 }
3b0bdc72 3243 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3244 break;
3b0bdc72
UD
3245 }
3246
3247 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3248
3249 /* If it is non-matching list. */
c0a0f9a3 3250 if (non_match)
3b0bdc72 3251 bitset_not (sbcset);
10677727 3252
65e6becf
UD
3253#ifdef RE_ENABLE_I18N
3254 /* Ensure only single byte characters are set. */
3255 if (dfa->mb_cur_max > 1)
3256 bitset_mask (sbcset, dfa->sb_char);
3b0bdc72
UD
3257
3258 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3259 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3260 || mbcset->non_match)))
3b0bdc72 3261 {
3b0bdc72 3262 bin_tree_t *mbc_tree;
ad7f28c2 3263 int sbc_idx;
3b0bdc72 3264 /* Build a tree for complex bracket. */
ad7f28c2 3265 dfa->has_mb_node = 1;
02f3550c
UD
3266 br_token.type = COMPLEX_BRACKET;
3267 br_token.opr.mbcset = mbcset;
3268 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3269 if (BE (mbc_tree == NULL, 0))
3270 goto parse_bracket_exp_espace;
2c05d33f 3271 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
ad7f28c2
UD
3272 if (sbcset[sbc_idx])
3273 break;
3274 /* If there are no bits set in sbcset, there is no point
3275 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
2c05d33f 3276 if (sbc_idx < BITSET_WORDS)
02f3550c 3277 {
21f5de55
PE
3278 /* Build a tree for simple bracket. */
3279 br_token.type = SIMPLE_BRACKET;
3280 br_token.opr.sbcset = sbcset;
3281 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282 if (BE (work_tree == NULL, 0))
3283 goto parse_bracket_exp_espace;
02f3550c 3284
21f5de55
PE
3285 /* Then join them by ALT node. */
3286 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3287 if (BE (work_tree == NULL, 0))
3288 goto parse_bracket_exp_espace;
02f3550c
UD
3289 }
3290 else
ad7f28c2
UD
3291 {
3292 re_free (sbcset);
02f3550c 3293 work_tree = mbc_tree;
ad7f28c2 3294 }
3b0bdc72
UD
3295 }
3296 else
963d8d78 3297#endif /* not RE_ENABLE_I18N */
3b0bdc72 3298 {
963d8d78
UD
3299#ifdef RE_ENABLE_I18N
3300 free_charset (mbcset);
3301#endif
02f3550c
UD
3302 /* Build a tree for simple bracket. */
3303 br_token.type = SIMPLE_BRACKET;
3304 br_token.opr.sbcset = sbcset;
3305 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3306 if (BE (work_tree == NULL, 0))
21f5de55 3307 goto parse_bracket_exp_espace;
3b0bdc72 3308 }
02f3550c 3309 return work_tree;
3b0bdc72
UD
3310
3311 parse_bracket_exp_espace:
3b0bdc72 3312 *err = REG_ESPACE;
434d3784
UD
3313 parse_bracket_exp_free_return:
3314 re_free (sbcset);
c0a0f9a3 3315#ifdef RE_ENABLE_I18N
434d3784 3316 free_charset (mbcset);
c0a0f9a3 3317#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3318 return NULL;
3319}
3320
434d3784
UD
3321/* Parse an element in the bracket expression. */
3322
3b0bdc72 3323static reg_errcode_t
0fd8ae9c
UD
3324parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3325 re_token_t *token, int token_len, re_dfa_t *dfa,
3326 reg_syntax_t syntax, int accept_hyphen)
3b0bdc72
UD
3327{
3328#ifdef RE_ENABLE_I18N
3329 int cur_char_size;
3330 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3331 if (cur_char_size > 1)
3332 {
3333 elem->type = MB_CHAR;
3334 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3335 re_string_skip_bytes (regexp, cur_char_size);
3336 return REG_NOERROR;
3337 }
3338#endif /* RE_ENABLE_I18N */
3339 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3340 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3341 || token->type == OP_OPEN_EQUIV_CLASS)
3342 return parse_bracket_symbol (elem, regexp, token);
78d8b07a
UD
3343 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3344 {
3345 /* A '-' must only appear as anything but a range indicator before
3346 the closing bracket. Everything else is an error. */
3347 re_token_t token2;
3348 (void) peek_token_bracket (&token2, regexp, syntax);
3349 if (token2.type != OP_CLOSE_BRACKET)
3350 /* The actual error value is not standardized since this whole
3351 case is undefined. But ERANGE makes good sense. */
3352 return REG_ERANGE;
3353 }
3b0bdc72
UD
3354 elem->type = SB_CHAR;
3355 elem->opr.ch = token->opr.c;
3356 return REG_NOERROR;
3357}
3358
434d3784
UD
3359/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3360 such as [:<character_class>:], [.<collating_element>.], and
3361 [=<equivalent_class>=]. */
3362
3b0bdc72 3363static reg_errcode_t
0fd8ae9c
UD
3364parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3365 re_token_t *token)
3b0bdc72
UD
3366{
3367 unsigned char ch, delim = token->opr.c;
3368 int i = 0;
294b6bcc
UD
3369 if (re_string_eoi(regexp))
3370 return REG_EBRACK;
602c2f9d 3371 for (;; ++i)
3b0bdc72 3372 {
294b6bcc 3373 if (i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3374 return REG_EBRACK;
3b0bdc72 3375 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3376 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3377 else
15a7d175 3378 ch = re_string_fetch_byte (regexp);
294b6bcc
UD
3379 if (re_string_eoi(regexp))
3380 return REG_EBRACK;
3b0bdc72 3381 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3382 break;
3b0bdc72
UD
3383 elem->opr.name[i] = ch;
3384 }
3385 re_string_skip_bytes (regexp, 1);
3386 elem->opr.name[i] = '\0';
3387 switch (token->type)
3388 {
3389 case OP_OPEN_COLL_ELEM:
3390 elem->type = COLL_SYM;
3391 break;
3392 case OP_OPEN_EQUIV_CLASS:
3393 elem->type = EQUIV_CLASS;
3394 break;
3395 case OP_OPEN_CHAR_CLASS:
3396 elem->type = CHAR_CLASS;
3397 break;
3398 default:
3399 break;
3400 }
3401 return REG_NOERROR;
3402}
3403
3404 /* Helper function for parse_bracket_exp.
3405 Build the equivalence class which is represented by NAME.
3406 The result are written to MBCSET and SBCSET.
3407 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3408 is a pointer argument sinse we may update it. */
3409
3410static reg_errcode_t
c0a0f9a3 3411#ifdef RE_ENABLE_I18N
0fd8ae9c
UD
3412build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3413 int *equiv_class_alloc, const unsigned char *name)
c0a0f9a3 3414#else /* not RE_ENABLE_I18N */
0fd8ae9c 3415build_equiv_class (bitset_t sbcset, const unsigned char *name)
c0a0f9a3 3416#endif /* not RE_ENABLE_I18N */
3b0bdc72 3417{
0fd8ae9c 3418#ifdef _LIBC
3b0bdc72
UD
3419 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3420 if (nrules != 0)
3421 {
3422 const int32_t *table, *indirect;
3423 const unsigned char *weights, *extra, *cp;
62439eac 3424 unsigned char char_buf[2];
3b0bdc72
UD
3425 int32_t idx1, idx2;
3426 unsigned int ch;
3427 size_t len;
3428 /* This #include defines a local function! */
3429# include <locale/weight.h>
3430 /* Calculate the index for equivalence class. */
62439eac 3431 cp = name;
3b0bdc72
UD
3432 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3433 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3434 _NL_COLLATE_WEIGHTMB);
3435 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3436 _NL_COLLATE_EXTRAMB);
3b0bdc72 3437 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3438 _NL_COLLATE_INDIRECTMB);
f3a6cc0a
AS
3439 idx1 = findidx (&cp, -1);
3440 if (BE (idx1 == 0 || *cp != '\0', 0))
15a7d175
UD
3441 /* This isn't a valid character. */
3442 return REG_ECOLLATE;
3b0bdc72
UD
3443
3444 /* Build single byte matcing table for this equivalence class. */
b7d1c5fa 3445 len = weights[idx1 & 0xffffff];
3b0bdc72 3446 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3447 {
3448 char_buf[0] = ch;
3449 cp = char_buf;
f3a6cc0a 3450 idx2 = findidx (&cp, 1);
3b0bdc72 3451/*
15a7d175 3452 idx2 = table[ch];
3b0bdc72 3453*/
15a7d175
UD
3454 if (idx2 == 0)
3455 /* This isn't a valid character. */
3456 continue;
b7d1c5fa
UD
3457 /* Compare only if the length matches and the collation rule
3458 index is the same. */
3459 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
15a7d175
UD
3460 {
3461 int cnt = 0;
b7d1c5fa 3462
15a7d175 3463 while (cnt <= len &&
b7d1c5fa
UD
3464 weights[(idx1 & 0xffffff) + 1 + cnt]
3465 == weights[(idx2 & 0xffffff) + 1 + cnt])
15a7d175
UD
3466 ++cnt;
3467
3468 if (cnt > len)
3469 bitset_set (sbcset, ch);
3470 }
3471 }
a9388965 3472 /* Check whether the array has enough space. */
951d6408 3473 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
15a7d175
UD
3474 {
3475 /* Not enough, realloc it. */
3476 /* +1 in case of mbcset->nequiv_classes is 0. */
951d6408 3477 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
15a7d175 3478 /* Use realloc since the array is NULL if *alloc == 0. */
951d6408
UD
3479 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3480 int32_t,
3481 new_equiv_class_alloc);
3482 if (BE (new_equiv_classes == NULL, 0))
15a7d175 3483 return REG_ESPACE;
951d6408
UD
3484 mbcset->equiv_classes = new_equiv_classes;
3485 *equiv_class_alloc = new_equiv_class_alloc;
15a7d175 3486 }
3b0bdc72
UD
3487 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3488 }
3489 else
10677727 3490#endif /* _LIBC */
3b0bdc72 3491 {
62439eac 3492 if (BE (strlen ((const char *) name) != 1, 0))
15a7d175 3493 return REG_ECOLLATE;
62439eac 3494 bitset_set (sbcset, *name);
3b0bdc72
UD
3495 }
3496 return REG_NOERROR;
3497}
3498
3499 /* Helper function for parse_bracket_exp.
3500 Build the character class which is represented by NAME.
3501 The result are written to MBCSET and SBCSET.
3502 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3503 is a pointer argument sinse we may update it. */
3504
3505static reg_errcode_t
c0a0f9a3 3506#ifdef RE_ENABLE_I18N
0fd8ae9c
UD
3507build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3508 re_charset_t *mbcset, int *char_class_alloc,
3509 const unsigned char *class_name, reg_syntax_t syntax)
c0a0f9a3 3510#else /* not RE_ENABLE_I18N */
0fd8ae9c
UD
3511build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3512 const unsigned char *class_name, reg_syntax_t syntax)
c0a0f9a3 3513#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3514{
3515 int i;
62439eac 3516 const char *name = (const char *) class_name;
c0a0f9a3
UD
3517
3518 /* In case of REG_ICASE "upper" and "lower" match the both of
3519 upper and lower cases. */
3520 if ((syntax & RE_ICASE)
3521 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3522 name = "alpha";
3523
3524#ifdef RE_ENABLE_I18N
3b0bdc72 3525 /* Check the space of the arrays. */
951d6408 3526 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
a9388965
UD
3527 {
3528 /* Not enough, realloc it. */
3529 /* +1 in case of mbcset->nchar_classes is 0. */
951d6408 3530 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
a9388965 3531 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3532 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3533 new_char_class_alloc);
3534 if (BE (new_char_classes == NULL, 0))
15a7d175 3535 return REG_ESPACE;
951d6408
UD
3536 mbcset->char_classes = new_char_classes;
3537 *char_class_alloc = new_char_class_alloc;
a9388965 3538 }
3b0bdc72 3539 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3540#endif /* RE_ENABLE_I18N */
3b0bdc72 3541
66b110e8 3542#define BUILD_CHARCLASS_LOOP(ctype_func) \
513bbb25
UD
3543 do { \
3544 if (BE (trans != NULL, 0)) \
0ecb606c 3545 { \
513bbb25
UD
3546 for (i = 0; i < SBC_MAX; ++i) \
3547 if (ctype_func (i)) \
3548 bitset_set (sbcset, trans[i]); \
3549 } \
3550 else \
3551 { \
3552 for (i = 0; i < SBC_MAX; ++i) \
3553 if (ctype_func (i)) \
3554 bitset_set (sbcset, i); \
3555 } \
3556 } while (0)
3b0bdc72
UD
3557
3558 if (strcmp (name, "alnum") == 0)
513bbb25 3559 BUILD_CHARCLASS_LOOP (isalnum);
3b0bdc72 3560 else if (strcmp (name, "cntrl") == 0)
513bbb25 3561 BUILD_CHARCLASS_LOOP (iscntrl);
3b0bdc72 3562 else if (strcmp (name, "lower") == 0)
513bbb25 3563 BUILD_CHARCLASS_LOOP (islower);
3b0bdc72 3564 else if (strcmp (name, "space") == 0)
513bbb25 3565 BUILD_CHARCLASS_LOOP (isspace);
3b0bdc72 3566 else if (strcmp (name, "alpha") == 0)
513bbb25 3567 BUILD_CHARCLASS_LOOP (isalpha);
3b0bdc72 3568 else if (strcmp (name, "digit") == 0)
513bbb25 3569 BUILD_CHARCLASS_LOOP (isdigit);
3b0bdc72 3570 else if (strcmp (name, "print") == 0)
513bbb25 3571 BUILD_CHARCLASS_LOOP (isprint);
3b0bdc72 3572 else if (strcmp (name, "upper") == 0)
513bbb25 3573 BUILD_CHARCLASS_LOOP (isupper);
3b0bdc72 3574 else if (strcmp (name, "blank") == 0)
513bbb25 3575 BUILD_CHARCLASS_LOOP (isblank);
3b0bdc72 3576 else if (strcmp (name, "graph") == 0)
513bbb25 3577 BUILD_CHARCLASS_LOOP (isgraph);
3b0bdc72 3578 else if (strcmp (name, "punct") == 0)
513bbb25 3579 BUILD_CHARCLASS_LOOP (ispunct);
3b0bdc72 3580 else if (strcmp (name, "xdigit") == 0)
513bbb25 3581 BUILD_CHARCLASS_LOOP (isxdigit);
3b0bdc72
UD
3582 else
3583 return REG_ECTYPE;
3584
3585 return REG_NOERROR;
3586}
3587
3588static bin_tree_t *
0fd8ae9c
UD
3589build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3590 const unsigned char *class_name,
3591 const unsigned char *extra, int non_match,
3592 reg_errcode_t *err)
3b0bdc72
UD
3593{
3594 re_bitset_ptr_t sbcset;
c0a0f9a3 3595#ifdef RE_ENABLE_I18N
3b0bdc72 3596 re_charset_t *mbcset;
c0a0f9a3 3597 int alloc = 0;
c0a0f9a3 3598#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3599 reg_errcode_t ret;
3600 re_token_t br_token;
3601 bin_tree_t *tree;
3b0bdc72 3602
2c05d33f 3603 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
c0a0f9a3 3604#ifdef RE_ENABLE_I18N
3b0bdc72 3605 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3606#endif /* RE_ENABLE_I18N */
3607
3608#ifdef RE_ENABLE_I18N
bc15410e 3609 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3610#else /* not RE_ENABLE_I18N */
3611 if (BE (sbcset == NULL, 0))
3612#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3613 {
3614 *err = REG_ESPACE;
3615 return NULL;
3616 }
3617
10677727 3618 if (non_match)
3b0bdc72 3619 {
c0a0f9a3 3620#ifdef RE_ENABLE_I18N
c0a0f9a3 3621 mbcset->non_match = 1;
c0a0f9a3 3622#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3623 }
3624
602c2f9d 3625 /* We don't care the syntax in this case. */
66b110e8 3626 ret = build_charclass (trans, sbcset,
c0a0f9a3 3627#ifdef RE_ENABLE_I18N
15a7d175 3628 mbcset, &alloc,
c0a0f9a3 3629#endif /* RE_ENABLE_I18N */
e2b6bfa3 3630 class_name, 0);
c0a0f9a3 3631
bc15410e 3632 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3633 {
3634 re_free (sbcset);
c0a0f9a3 3635#ifdef RE_ENABLE_I18N
3b0bdc72 3636 free_charset (mbcset);
c0a0f9a3 3637#endif /* RE_ENABLE_I18N */
7b7b9e70 3638 *err = ret;
3b0bdc72
UD
3639 return NULL;
3640 }
434d3784 3641 /* \w match '_' also. */
e2b6bfa3
UD
3642 for (; *extra; extra++)
3643 bitset_set (sbcset, *extra);
3b0bdc72
UD
3644
3645 /* If it is non-matching list. */
c0a0f9a3 3646 if (non_match)
3b0bdc72
UD
3647 bitset_not (sbcset);
3648
65e6becf
UD
3649#ifdef RE_ENABLE_I18N
3650 /* Ensure only single byte characters are set. */
3651 if (dfa->mb_cur_max > 1)
3652 bitset_mask (sbcset, dfa->sb_char);
3653#endif
3654
3b0bdc72
UD
3655 /* Build a tree for simple bracket. */
3656 br_token.type = SIMPLE_BRACKET;
3657 br_token.opr.sbcset = sbcset;
02f3550c 3658 tree = create_token_tree (dfa, NULL, NULL, &br_token);
ee70274a 3659 if (BE (tree == NULL, 0))
3b0bdc72
UD
3660 goto build_word_op_espace;
3661
c0a0f9a3 3662#ifdef RE_ENABLE_I18N
3c0fb574 3663 if (dfa->mb_cur_max > 1)
3b0bdc72 3664 {
3b0bdc72
UD
3665 bin_tree_t *mbc_tree;
3666 /* Build a tree for complex bracket. */
3667 br_token.type = COMPLEX_BRACKET;
3668 br_token.opr.mbcset = mbcset;
3669 dfa->has_mb_node = 1;
02f3550c 3670 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
ee70274a 3671 if (BE (mbc_tree == NULL, 0))
15a7d175 3672 goto build_word_op_espace;
3b0bdc72 3673 /* Then join them by ALT node. */
02f3550c 3674 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
ee70274a 3675 if (BE (mbc_tree != NULL, 1))
15a7d175 3676 return tree;
3b0bdc72
UD
3677 }
3678 else
3679 {
3680 free_charset (mbcset);
3681 return tree;
3682 }
c0a0f9a3
UD
3683#else /* not RE_ENABLE_I18N */
3684 return tree;
3685#endif /* not RE_ENABLE_I18N */
3686
3b0bdc72
UD
3687 build_word_op_espace:
3688 re_free (sbcset);
c0a0f9a3 3689#ifdef RE_ENABLE_I18N
3b0bdc72 3690 free_charset (mbcset);
c0a0f9a3 3691#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3692 *err = REG_ESPACE;
3693 return NULL;
3694}
3695
3696/* This is intended for the expressions like "a{1,3}".
3697 Fetch a number from `input', and return the number.
3698 Return -1, if the number field is empty like "{,1}".
3699 Return -2, If an error is occured. */
3700
3701static int
0fd8ae9c 3702fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3b0bdc72
UD
3703{
3704 int num = -1;
3705 unsigned char c;
3706 while (1)
3707 {
f0d77aa8 3708 fetch_token (token, input, syntax);
3b0bdc72 3709 c = token->opr.c;
602c2f9d 3710 if (BE (token->type == END_OF_RE, 0))
15a7d175 3711 return -2;
3b0bdc72 3712 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3713 break;
602c2f9d 3714 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
15a7d175 3715 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
602c2f9d 3716 num = (num > RE_DUP_MAX) ? -2 : num;
3b0bdc72 3717 }
3b0bdc72
UD
3718 return num;
3719}
3720\f
c0a0f9a3 3721#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3722static void
3723free_charset (re_charset_t *cset)
3724{
3725 re_free (cset->mbchars);
c0a0f9a3 3726# ifdef _LIBC
3b0bdc72
UD
3727 re_free (cset->coll_syms);
3728 re_free (cset->equiv_classes);
3729 re_free (cset->range_starts);
3730 re_free (cset->range_ends);
c0a0f9a3 3731# endif
3b0bdc72
UD
3732 re_free (cset->char_classes);
3733 re_free (cset);
3734}
c0a0f9a3 3735#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3736\f
3737/* Functions for binary tree operation. */
3738
ee70274a 3739/* Create a tree node. */
3b0bdc72
UD
3740
3741static bin_tree_t *
0fd8ae9c
UD
3742create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3743 re_token_type_t type)
02f3550c
UD
3744{
3745 re_token_t t;
3746 t.type = type;
3747 return create_token_tree (dfa, left, right, &t);
3748}
3749
3750static bin_tree_t *
0fd8ae9c
UD
3751create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3752 const re_token_t *token)
3b0bdc72
UD
3753{
3754 bin_tree_t *tree;
ee70274a 3755 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3b0bdc72 3756 {
ee70274a
UD
3757 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3758
3759 if (storage == NULL)
3760 return NULL;
3761 storage->next = dfa->str_tree_storage;
3762 dfa->str_tree_storage = storage;
3763 dfa->str_tree_storage_idx = 0;
3b0bdc72 3764 }
ee70274a
UD
3765 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3766
3b0bdc72
UD
3767 tree->parent = NULL;
3768 tree->left = left;
3769 tree->right = right;
02f3550c
UD
3770 tree->token = *token;
3771 tree->token.duplicated = 0;
3772 tree->token.opt_subexp = 0;
3773 tree->first = NULL;
3774 tree->next = NULL;
3775 tree->node_idx = -1;
3b0bdc72
UD
3776
3777 if (left != NULL)
3778 left->parent = tree;
3779 if (right != NULL)
3780 right->parent = tree;
3781 return tree;
3782}
3783
02f3550c
UD
3784/* Mark the tree SRC as an optional subexpression.
3785 To be called from preorder or postorder. */
3b0bdc72 3786
02f3550c 3787static reg_errcode_t
0fd8ae9c 3788mark_opt_subexp (void *extra, bin_tree_t *node)
f0d77aa8 3789{
02f3550c
UD
3790 int idx = (int) (long) extra;
3791 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3792 node->token.opt_subexp = 1;
a334319f 3793
02f3550c 3794 return REG_NOERROR;
3b0bdc72
UD
3795}
3796
02f3550c 3797/* Free the allocated memory inside NODE. */
6b6557e8
UD
3798
3799static void
02f3550c 3800free_token (re_token_t *node)
6b6557e8 3801{
02f3550c
UD
3802#ifdef RE_ENABLE_I18N
3803 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3804 free_charset (node->opr.mbcset);
3805 else
3806#endif /* RE_ENABLE_I18N */
3807 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3808 re_free (node->opr.sbcset);
6b6557e8
UD
3809}
3810
02f3550c
UD
3811/* Worker function for tree walking. Free the allocated memory inside NODE
3812 and its children. */
6b6557e8 3813
02f3550c
UD
3814static reg_errcode_t
3815free_tree (void *extra, bin_tree_t *node)
6b6557e8 3816{
02f3550c
UD
3817 free_token (&node->token);
3818 return REG_NOERROR;
6b6557e8
UD
3819}
3820
ee70274a 3821
02f3550c
UD
3822/* Duplicate the node SRC, and return new node. This is a preorder
3823 visit similar to the one implemented by the generic visitor, but
3824 we need more infrastructure to maintain two parallel trees --- so,
3825 it's easier to duplicate. */
3b0bdc72
UD
3826
3827static bin_tree_t *
0fd8ae9c 3828duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3b0bdc72 3829{
02f3550c
UD
3830 const bin_tree_t *node;
3831 bin_tree_t *dup_root;
3832 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3b0bdc72 3833
02f3550c 3834 for (node = root; ; )
3b0bdc72 3835 {
02f3550c
UD
3836 /* Create a new tree and link it back to the current parent. */
3837 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3838 if (*p_new == NULL)
ee70274a 3839 return NULL;
02f3550c
UD
3840 (*p_new)->parent = dup_node;
3841 (*p_new)->token.duplicated = 1;
3842 dup_node = *p_new;
3b0bdc72 3843
02f3550c
UD
3844 /* Go to the left node, or up and to the right. */
3845 if (node->left)
3846 {
3847 node = node->left;
3848 p_new = &dup_node->left;
3849 }
3850 else
3851 {
3852 const bin_tree_t *prev = NULL;
3853 while (node->right == prev || node->right == NULL)
3854 {
3855 prev = node;
3856 node = node->parent;
3857 dup_node = dup_node->parent;
3858 if (!node)
21f5de55 3859 return dup_root;
02f3550c
UD
3860 }
3861 node = node->right;
3862 p_new = &dup_node->right;
3863 }
3b0bdc72 3864 }
3b0bdc72 3865}