]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
Update.
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
ca3c505e 2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
3b0bdc72 21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
15a7d175 22 int length, reg_syntax_t syntax);
3b0bdc72 23static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
24 const re_dfastate_t *init_state,
25 char *fastmap);
3b0bdc72 26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
a9388965 27static reg_errcode_t init_word_char (re_dfa_t *dfa);
c0a0f9a3 28#ifdef RE_ENABLE_I18N
3b0bdc72 29static void free_charset (re_charset_t *cset);
c0a0f9a3 30#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
3b0bdc72
UD
36static reg_errcode_t analyze (re_dfa_t *dfa);
37static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
485d775d 41static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
15a7d175
UD
42 int top_clone_node, int root_node,
43 unsigned int constraint);
a9388965 44static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
15a7d175 45 unsigned int constraint);
a7d5c291
UD
46static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
3b0bdc72 48static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 49static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
15a7d175 50 int node, int root);
3b0bdc72
UD
51static void calc_inveclosure (re_dfa_t *dfa);
52static int fetch_number (re_string_t *input, re_token_t *token,
15a7d175 53 reg_syntax_t syntax);
f0d77aa8
UD
54static void fetch_token (re_token_t *result, re_string_t *input,
55 reg_syntax_t syntax);
3b0bdc72 56static int peek_token (re_token_t *token, re_string_t *input,
15a7d175 57 reg_syntax_t syntax);
3b0bdc72 58static int peek_token_bracket (re_token_t *token, re_string_t *input,
15a7d175 59 reg_syntax_t syntax);
3b0bdc72 60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 61 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
3b0bdc72 65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175
UD
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
3b0bdc72 68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175
UD
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
3b0bdc72 71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
3b0bdc72 74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
3b0bdc72 80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
78d8b07a
UD
84 reg_syntax_t syntax,
85 int accept_hyphen);
3b0bdc72 86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
87 re_string_t *regexp,
88 re_token_t *token);
434d3784 89#ifndef _LIBC
c0a0f9a3
UD
90# ifdef RE_ENABLE_I18N
91static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
92 re_charset_t *mbcset, int *range_alloc,
93 bracket_elem_t *start_elem,
94 bracket_elem_t *end_elem);
c0a0f9a3 95static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175
UD
96 re_charset_t *mbcset,
97 int *coll_sym_alloc,
98 const unsigned char *name);
c0a0f9a3
UD
99# else /* not RE_ENABLE_I18N */
100static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
c0a0f9a3 103static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175 104 const unsigned char *name);
c0a0f9a3 105# endif /* not RE_ENABLE_I18N */
434d3784 106#endif /* not _LIBC */
c0a0f9a3
UD
107#ifdef RE_ENABLE_I18N
108static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175
UD
109 re_charset_t *mbcset,
110 int *equiv_class_alloc,
111 const unsigned char *name);
66b110e8
UD
112static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
113 re_bitset_ptr_t sbcset,
15a7d175
UD
114 re_charset_t *mbcset,
115 int *char_class_alloc,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
c0a0f9a3
UD
118#else /* not RE_ENABLE_I18N */
119static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175 120 const unsigned char *name);
66b110e8
UD
121static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
122 re_bitset_ptr_t sbcset,
15a7d175
UD
123 const unsigned char *class_name,
124 reg_syntax_t syntax);
c0a0f9a3 125#endif /* not RE_ENABLE_I18N */
e2b6bfa3 126static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
06e8303a 127 const unsigned char *class_name,
e2b6bfa3
UD
128 const unsigned char *extra, int not,
129 reg_errcode_t *err);
ee70274a
UD
130static bin_tree_t *create_tree (re_dfa_t *dfa,
131 bin_tree_t *left, bin_tree_t *right,
15a7d175 132 re_token_type_t type, int index);
ee70274a
UD
133static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
134 bin_tree_t *left, bin_tree_t *right,
c5c9c929 135 const re_token_t *token)
ee70274a 136 __attribute ((noinline));
3b0bdc72 137static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
6b6557e8
UD
138static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
139static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
3b0bdc72
UD
140\f
141/* This table gives an error message for each of the error codes listed
142 in regex.h. Obviously the order here has to be same as there.
143 POSIX doesn't require that we do anything for REG_NOERROR,
144 but why not be nice? */
145
6455d255 146const char __re_error_msgid[] attribute_hidden =
3b0bdc72
UD
147 {
148#define REG_NOERROR_IDX 0
149 gettext_noop ("Success") /* REG_NOERROR */
150 "\0"
151#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
152 gettext_noop ("No match") /* REG_NOMATCH */
153 "\0"
154#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
155 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
156 "\0"
157#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
158 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
159 "\0"
160#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
161 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
162 "\0"
163#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
164 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
165 "\0"
166#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
167 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
168 "\0"
169#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
170 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
171 "\0"
172#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
173 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
174 "\0"
175#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
176 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
177 "\0"
178#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
179 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
180 "\0"
181#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
182 gettext_noop ("Invalid range end") /* REG_ERANGE */
183 "\0"
184#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
185 gettext_noop ("Memory exhausted") /* REG_ESPACE */
186 "\0"
187#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
188 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
189 "\0"
190#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
191 gettext_noop ("Premature end of regular expression") /* REG_EEND */
192 "\0"
193#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
194 gettext_noop ("Regular expression too big") /* REG_ESIZE */
195 "\0"
196#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
197 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
198 };
199
6455d255 200const size_t __re_error_msgid_idx[] attribute_hidden =
3b0bdc72
UD
201 {
202 REG_NOERROR_IDX,
203 REG_NOMATCH_IDX,
204 REG_BADPAT_IDX,
205 REG_ECOLLATE_IDX,
206 REG_ECTYPE_IDX,
207 REG_EESCAPE_IDX,
208 REG_ESUBREG_IDX,
209 REG_EBRACK_IDX,
210 REG_EPAREN_IDX,
211 REG_EBRACE_IDX,
212 REG_BADBR_IDX,
213 REG_ERANGE_IDX,
214 REG_ESPACE_IDX,
215 REG_BADRPT_IDX,
216 REG_EEND_IDX,
217 REG_ESIZE_IDX,
218 REG_ERPAREN_IDX
219 };
220\f
221/* Entry points for GNU code. */
222
223/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 224 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
225 Returns 0 if the pattern was valid, otherwise an error string.
226
227 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
228 are set in BUFP on entry. */
229
230const char *
231re_compile_pattern (pattern, length, bufp)
232 const char *pattern;
233 size_t length;
234 struct re_pattern_buffer *bufp;
235{
236 reg_errcode_t ret;
237
3b0bdc72
UD
238 /* And GNU code determines whether or not to get register information
239 by passing null for the REGS argument to re_match, etc., not by
240 setting no_sub. */
241 bufp->no_sub = 0;
242
243 /* Match anchors at newline. */
244 bufp->newline_anchor = 1;
245
75e4a282 246 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
247
248 if (!ret)
249 return NULL;
6455d255 250 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
251}
252#ifdef _LIBC
253weak_alias (__re_compile_pattern, re_compile_pattern)
254#endif
255
256/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
257 also be assigned to arbitrarily: each pattern buffer stores its own
258 syntax, so it can be changed between regex compilations. */
259/* This has no initializer because initialized variables in Emacs
260 become read-only after dumping. */
261reg_syntax_t re_syntax_options;
262
263
264/* Specify the precise syntax of regexps for compilation. This provides
265 for compatibility for various utilities which historically have
266 different, incompatible syntaxes.
267
268 The argument SYNTAX is a bit mask comprised of the various bits
269 defined in regex.h. We return the old syntax. */
270
271reg_syntax_t
272re_set_syntax (syntax)
273 reg_syntax_t syntax;
274{
275 reg_syntax_t ret = re_syntax_options;
276
277 re_syntax_options = syntax;
278 return ret;
279}
280#ifdef _LIBC
281weak_alias (__re_set_syntax, re_set_syntax)
282#endif
283
284int
285re_compile_fastmap (bufp)
286 struct re_pattern_buffer *bufp;
287{
288 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
289 char *fastmap = bufp->fastmap;
290
291 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
292 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
293 if (dfa->init_state != dfa->init_state_word)
294 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
295 if (dfa->init_state != dfa->init_state_nl)
296 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
297 if (dfa->init_state != dfa->init_state_begbuf)
298 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
299 bufp->fastmap_accurate = 1;
300 return 0;
301}
302#ifdef _LIBC
303weak_alias (__re_compile_fastmap, re_compile_fastmap)
304#endif
305
1b2c2628 306static inline void
25337753 307__attribute ((always_inline))
1b2c2628
UD
308re_set_fastmap (char *fastmap, int icase, int ch)
309{
310 fastmap[ch] = 1;
311 if (icase)
312 fastmap[tolower (ch)] = 1;
313}
314
3b0bdc72
UD
315/* Helper function for re_compile_fastmap.
316 Compile fastmap for the initial_state INIT_STATE. */
317
318static void
319re_compile_fastmap_iter (bufp, init_state, fastmap)
320 regex_t *bufp;
321 const re_dfastate_t *init_state;
322 char *fastmap;
323{
324 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325 int node_cnt;
3c0fb574 326 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
327 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
328 {
329 int node = init_state->nodes.elems[node_cnt];
330 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
331
332 if (type == CHARACTER)
74e12fbc
UD
333 {
334 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
335#ifdef RE_ENABLE_I18N
14744156 336 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 337 {
3c0fb574 338 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
74e12fbc
UD
339 wchar_t wc;
340 mbstate_t state;
341
342 p = buf;
343 *p++ = dfa->nodes[node].opr.c;
344 while (++node < dfa->nodes_len
345 && dfa->nodes[node].type == CHARACTER
346 && dfa->nodes[node].mb_partial)
347 *p++ = dfa->nodes[node].opr.c;
348 memset (&state, 0, sizeof (state));
349 if (mbrtowc (&wc, (const char *) buf, p - buf,
350 &state) == p - buf
351 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
352 re_set_fastmap (fastmap, 0, buf[0]);
353 }
354#endif
355 }
3b0bdc72 356 else if (type == SIMPLE_BRACKET)
15a7d175
UD
357 {
358 int i, j, ch;
359 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
360 for (j = 0; j < UINT_BITS; ++j, ++ch)
361 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
362 re_set_fastmap (fastmap, icase, ch);
363 }
c0a0f9a3 364#ifdef RE_ENABLE_I18N
3b0bdc72 365 else if (type == COMPLEX_BRACKET)
15a7d175
UD
366 {
367 int i;
368 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
369 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
370 || cset->nranges || cset->nchar_classes)
371 {
c0a0f9a3 372# ifdef _LIBC
15a7d175
UD
373 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
374 {
375 /* In this case we want to catch the bytes which are
376 the first byte of any collation elements.
377 e.g. In da_DK, we want to catch 'a' since "aa"
378 is a valid collation element, and don't catch
379 'b' since 'b' is the only collation element
380 which starts from 'b'. */
381 int j, ch;
382 const int32_t *table = (const int32_t *)
383 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
384 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
385 for (j = 0; j < UINT_BITS; ++j, ++ch)
386 if (table[ch] < 0)
387 re_set_fastmap (fastmap, icase, ch);
388 }
c0a0f9a3 389# else
3c0fb574 390 if (dfa->mb_cur_max > 1)
15a7d175
UD
391 for (i = 0; i < SBC_MAX; ++i)
392 if (__btowc (i) == WEOF)
393 re_set_fastmap (fastmap, icase, i);
c0a0f9a3 394# endif /* not _LIBC */
15a7d175
UD
395 }
396 for (i = 0; i < cset->nmbchars; ++i)
397 {
398 char buf[256];
399 mbstate_t state;
400 memset (&state, '\0', sizeof (state));
401 __wcrtomb (buf, cset->mbchars[i], &state);
402 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
14744156 403 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc
UD
404 {
405 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
406 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
407 }
15a7d175
UD
408 }
409 }
c0a0f9a3 410#endif /* RE_ENABLE_I18N */
c0d5034e
UD
411 else if (type == OP_PERIOD
412#ifdef RE_ENABLE_I18N
413 || type == OP_UTF8_PERIOD
414#endif /* RE_ENABLE_I18N */
415 || type == END_OF_RE)
15a7d175
UD
416 {
417 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
418 if (type == END_OF_RE)
419 bufp->can_be_null = 1;
420 return;
421 }
3b0bdc72
UD
422 }
423}
424\f
425/* Entry point for POSIX code. */
426/* regcomp takes a regular expression as a string and compiles it.
427
428 PREG is a regex_t *. We do not expect any fields to be initialized,
429 since POSIX says we shouldn't. Thus, we set
430
431 `buffer' to the compiled pattern;
432 `used' to the length of the compiled pattern;
433 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
434 REG_EXTENDED bit in CFLAGS is set; otherwise, to
435 RE_SYNTAX_POSIX_BASIC;
436 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
437 `fastmap' to an allocated space for the fastmap;
438 `fastmap_accurate' to zero;
439 `re_nsub' to the number of subexpressions in PATTERN.
440
441 PATTERN is the address of the pattern string.
442
443 CFLAGS is a series of bits which affect compilation.
444
445 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
446 use POSIX basic syntax.
447
448 If REG_NEWLINE is set, then . and [^...] don't match newline.
449 Also, regexec will try a match beginning after every newline.
450
451 If REG_ICASE is set, then we considers upper- and lowercase
452 versions of letters to be equivalent when matching.
453
454 If REG_NOSUB is set, then when PREG is passed to regexec, that
455 routine will report only success or failure, and nothing about the
456 registers.
457
458 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
459 the return codes and their meanings.) */
460
461int
462regcomp (preg, pattern, cflags)
75e4a282
UD
463 regex_t *__restrict preg;
464 const char *__restrict pattern;
3b0bdc72
UD
465 int cflags;
466{
467 reg_errcode_t ret;
468 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 469 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
470
471 preg->buffer = NULL;
472 preg->allocated = 0;
473 preg->used = 0;
474
475 /* Try to allocate space for the fastmap. */
476 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 477 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
478 return REG_ESPACE;
479
480 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
481
482 /* If REG_NEWLINE is set, newlines are treated differently. */
483 if (cflags & REG_NEWLINE)
484 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
485 syntax &= ~RE_DOT_NEWLINE;
486 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
487 /* It also changes the matching behavior. */
488 preg->newline_anchor = 1;
489 }
490 else
491 preg->newline_anchor = 0;
492 preg->no_sub = !!(cflags & REG_NOSUB);
493 preg->translate = NULL;
494
495 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
496
497 /* POSIX doesn't distinguish between an unmatched open-group and an
498 unmatched close-group: both are REG_EPAREN. */
499 if (ret == REG_ERPAREN)
500 ret = REG_EPAREN;
501
a9388965 502 /* We have already checked preg->fastmap != NULL. */
bc15410e 503 if (BE (ret == REG_NOERROR, 1))
71ccd330
UD
504 /* Compute the fastmap now, since regexec cannot modify the pattern
505 buffer. This function nevers fails in this implementation. */
83b038f2 506 (void) re_compile_fastmap (preg);
71ccd330 507 else
3b0bdc72 508 {
71ccd330
UD
509 /* Some error occurred while compiling the expression. */
510 re_free (preg->fastmap);
511 preg->fastmap = NULL;
3b0bdc72
UD
512 }
513
514 return (int) ret;
515}
516#ifdef _LIBC
517weak_alias (__regcomp, regcomp)
518#endif
519
520/* Returns a message corresponding to an error code, ERRCODE, returned
521 from either regcomp or regexec. We don't use PREG here. */
522
523size_t
524regerror (errcode, preg, errbuf, errbuf_size)
525 int errcode;
526 const regex_t *preg;
527 char *errbuf;
528 size_t errbuf_size;
529{
530 const char *msg;
531 size_t msg_size;
532
bc15410e 533 if (BE (errcode < 0
15a7d175
UD
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
541
6455d255 542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
543
544 msg_size = strlen (msg) + 1; /* Includes the null. */
545
bc15410e 546 if (BE (errbuf_size != 0, 1))
3b0bdc72 547 {
bc15410e 548 if (BE (msg_size > errbuf_size, 0))
15a7d175 549 {
3b0bdc72
UD
550#if defined HAVE_MEMPCPY || defined _LIBC
551 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
552#else
15a7d175
UD
553 memcpy (errbuf, msg, errbuf_size - 1);
554 errbuf[errbuf_size - 1] = 0;
3b0bdc72 555#endif
15a7d175 556 }
3b0bdc72 557 else
15a7d175 558 memcpy (errbuf, msg, msg_size);
3b0bdc72
UD
559 }
560
561 return msg_size;
562}
563#ifdef _LIBC
564weak_alias (__regerror, regerror)
565#endif
566
3b0bdc72 567
71ccd330
UD
568static void
569free_dfa_content (re_dfa_t *dfa)
3b0bdc72
UD
570{
571 int i, j;
3b0bdc72 572
71ccd330
UD
573 re_free (dfa->subexps);
574
ee70274a
UD
575 if (dfa->nodes)
576 for (i = 0; i < dfa->nodes_len; ++i)
577 {
578 re_token_t *node = dfa->nodes + i;
c0a0f9a3 579#ifdef RE_ENABLE_I18N
ee70274a
UD
580 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
581 free_charset (node->opr.mbcset);
582 else
c0a0f9a3 583#endif /* RE_ENABLE_I18N */
ee70274a
UD
584 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
585 re_free (node->opr.sbcset);
586 }
71ccd330
UD
587 re_free (dfa->nexts);
588 for (i = 0; i < dfa->nodes_len; ++i)
589 {
590 if (dfa->eclosures != NULL)
591 re_node_set_free (dfa->eclosures + i);
592 if (dfa->inveclosures != NULL)
593 re_node_set_free (dfa->inveclosures + i);
594 if (dfa->edests != NULL)
595 re_node_set_free (dfa->edests + i);
596 }
597 re_free (dfa->edests);
598 re_free (dfa->eclosures);
599 re_free (dfa->inveclosures);
600 re_free (dfa->nodes);
3b0bdc72 601
ee70274a
UD
602 if (dfa->state_table)
603 for (i = 0; i <= dfa->state_hash_mask; ++i)
604 {
605 struct re_state_table_entry *entry = dfa->state_table + i;
606 for (j = 0; j < entry->num; ++j)
607 {
608 re_dfastate_t *state = entry->array[j];
609 free_state (state);
610 }
611 re_free (entry->array);
612 }
71ccd330 613 re_free (dfa->state_table);
ee70274a 614 re_free (dfa->word_char);
65e6becf
UD
615#ifdef RE_ENABLE_I18N
616 re_free (dfa->sb_char);
617#endif
0742e48e 618#ifdef DEBUG
71ccd330 619 re_free (dfa->re_str);
0742e48e 620#endif
71ccd330
UD
621
622 re_free (dfa);
623}
624
625
626/* Free dynamically allocated space used by PREG. */
627
628void
629regfree (preg)
630 regex_t *preg;
631{
632 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
633 if (BE (dfa != NULL, 1))
634 free_dfa_content (dfa);
635
3b0bdc72
UD
636 re_free (preg->fastmap);
637}
638#ifdef _LIBC
639weak_alias (__regfree, regfree)
640#endif
641\f
642/* Entry points compatible with 4.2 BSD regex library. We don't define
643 them unless specifically requested. */
644
645#if defined _REGEX_RE_COMP || defined _LIBC
646
647/* BSD has one and only one pattern buffer. */
648static struct re_pattern_buffer re_comp_buf;
649
650char *
651# ifdef _LIBC
652/* Make these definitions weak in libc, so POSIX programs can redefine
653 these names if they don't use our functions, and still use
654 regcomp/regexec above without link errors. */
655weak_function
656# endif
657re_comp (s)
658 const char *s;
659{
660 reg_errcode_t ret;
240e87c2 661 char *fastmap;
3b0bdc72
UD
662
663 if (!s)
664 {
665 if (!re_comp_buf.buffer)
666 return gettext ("No previous regular expression");
667 return 0;
668 }
669
240e87c2
RM
670 if (re_comp_buf.buffer)
671 {
672 fastmap = re_comp_buf.fastmap;
673 re_comp_buf.fastmap = NULL;
674 __regfree (&re_comp_buf);
1b2c2628 675 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
676 re_comp_buf.fastmap = fastmap;
677 }
678
679 if (re_comp_buf.fastmap == NULL)
3b0bdc72
UD
680 {
681 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
682 if (re_comp_buf.fastmap == NULL)
6455d255
UD
683 return (char *) gettext (__re_error_msgid
684 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
685 }
686
687 /* Since `re_exec' always passes NULL for the `regs' argument, we
688 don't need to initialize the pattern buffer fields which affect it. */
689
690 /* Match anchors at newlines. */
691 re_comp_buf.newline_anchor = 1;
692
693 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
694
695 if (!ret)
696 return NULL;
697
698 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455d255 699 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 700}
240e87c2
RM
701
702#ifdef _LIBC
c877418f 703libc_freeres_fn (free_mem)
240e87c2
RM
704{
705 __regfree (&re_comp_buf);
706}
240e87c2
RM
707#endif
708
3b0bdc72
UD
709#endif /* _REGEX_RE_COMP */
710\f
711/* Internal entry point.
712 Compile the regular expression PATTERN, whose length is LENGTH.
713 SYNTAX indicate regular expression's syntax. */
714
715static reg_errcode_t
716re_compile_internal (preg, pattern, length, syntax)
717 regex_t *preg;
718 const char * pattern;
719 int length;
720 reg_syntax_t syntax;
721{
722 reg_errcode_t err = REG_NOERROR;
723 re_dfa_t *dfa;
724 re_string_t regexp;
725
726 /* Initialize the pattern buffer. */
727 preg->fastmap_accurate = 0;
728 preg->syntax = syntax;
729 preg->not_bol = preg->not_eol = 0;
730 preg->used = 0;
731 preg->re_nsub = 0;
1b2c2628
UD
732 preg->can_be_null = 0;
733 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
734
735 /* Initialize the dfa. */
736 dfa = (re_dfa_t *) preg->buffer;
951d6408 737 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3b0bdc72
UD
738 {
739 /* If zero allocated, but buffer is non-null, try to realloc
740 enough space. This loses if buffer's address is bogus, but
741 that is the user's responsibility. If ->buffer is NULL this
742 is a simple allocation. */
743 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
744 if (dfa == NULL)
745 return REG_ESPACE;
3b0bdc72 746 preg->allocated = sizeof (re_dfa_t);
951d6408 747 preg->buffer = (unsigned char *) dfa;
3b0bdc72 748 }
3b0bdc72
UD
749 preg->used = sizeof (re_dfa_t);
750
751 err = init_dfa (dfa, length);
bc15410e 752 if (BE (err != REG_NOERROR, 0))
3b0bdc72 753 {
ee70274a 754 free_dfa_content (dfa);
3b0bdc72 755 preg->buffer = NULL;
ca3c505e 756 preg->allocated = 0;
3b0bdc72
UD
757 return err;
758 }
0742e48e
UD
759#ifdef DEBUG
760 dfa->re_str = re_malloc (char, length + 1);
761 strncpy (dfa->re_str, pattern, length + 1);
762#endif
3b0bdc72 763
612546c6 764 err = re_string_construct (&regexp, pattern, length, preg->translate,
f0c7c524 765 syntax & RE_ICASE, dfa);
bc15410e 766 if (BE (err != REG_NOERROR, 0))
3b0bdc72 767 {
ee70274a
UD
768 re_compile_internal_free_return:
769 free_workarea_compile (preg);
770 re_string_destruct (&regexp);
771 free_dfa_content (dfa);
3b0bdc72 772 preg->buffer = NULL;
ca3c505e 773 preg->allocated = 0;
3b0bdc72
UD
774 return err;
775 }
776
777 /* Parse the regular expression, and build a structure tree. */
778 preg->re_nsub = 0;
779 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 780 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
781 goto re_compile_internal_free_return;
782
14744156
UD
783#ifdef RE_ENABLE_I18N
784 /* If possible, do searching in single byte encoding to speed things up. */
ad7f28c2 785 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
14744156
UD
786 optimize_utf8 (dfa);
787#endif
788
3b0bdc72
UD
789 /* Analyze the tree and collect information which is necessary to
790 create the dfa. */
791 err = analyze (dfa);
bc15410e 792 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
793 goto re_compile_internal_free_return;
794
795 /* Then create the initial state of the dfa. */
796 err = create_initial_state (dfa);
3b0bdc72 797
3b0bdc72
UD
798 /* Release work areas. */
799 free_workarea_compile (preg);
800 re_string_destruct (&regexp);
801
71ccd330
UD
802 if (BE (err != REG_NOERROR, 0))
803 {
71ccd330
UD
804 free_dfa_content (dfa);
805 preg->buffer = NULL;
ca3c505e 806 preg->allocated = 0;
71ccd330
UD
807 }
808
3b0bdc72
UD
809 return err;
810}
811
812/* Initialize DFA. We use the length of the regular expression PAT_LEN
813 as the initial length of some arrays. */
814
815static reg_errcode_t
816init_dfa (dfa, pat_len)
817 re_dfa_t *dfa;
818 int pat_len;
819{
820 int table_size;
81c64d40
UD
821
822 memset (dfa, '\0', sizeof (re_dfa_t));
823
ee70274a
UD
824 /* Force allocation of str_tree_storage the first time. */
825 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
826
3b0bdc72
UD
827 dfa->nodes_alloc = pat_len + 1;
828 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
829
830 dfa->states_alloc = pat_len + 1;
831
832 /* table_size = 2 ^ ceil(log pat_len) */
833 for (table_size = 1; table_size > 0; table_size <<= 1)
834 if (table_size > pat_len)
835 break;
836
837 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
838 dfa->state_hash_mask = table_size - 1;
839
840 dfa->subexps_alloc = 1;
841 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
65e6becf 842 /* dfa->word_char = NULL; */
3b0bdc72 843
3c0fb574
UD
844 dfa->mb_cur_max = MB_CUR_MAX;
845#ifdef _LIBC
bb3f4825 846 if (dfa->mb_cur_max == 6
3c0fb574
UD
847 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
848 dfa->is_utf8 = 1;
f0c7c524
UD
849 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
850 != 0);
3c0fb574 851#endif
65e6becf
UD
852#ifdef RE_ENABLE_I18N
853 if (dfa->mb_cur_max > 1)
854 {
855 int i, j, ch;
856
857 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
858 if (BE (dfa->sb_char == NULL, 0))
859 return REG_ESPACE;
860#ifdef _LIBC
861 if (dfa->is_utf8)
862 memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
863 else
864#endif
865 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
866 for (j = 0; j < UINT_BITS; ++j, ++ch)
867 if (btowc (ch) != WEOF)
868 dfa->sb_char[i] |= 1 << j;
869 }
870#endif
3c0fb574 871
bc15410e 872 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
15a7d175 873 || dfa->subexps == NULL, 0))
ee70274a 874 return REG_ESPACE;
3b0bdc72
UD
875 return REG_NOERROR;
876}
877
878/* Initialize WORD_CHAR table, which indicate which character is
879 "word". In this case "word" means that it is the word construction
880 character used by some operators like "\<", "\>", etc. */
881
a9388965 882static reg_errcode_t
3b0bdc72
UD
883init_word_char (dfa)
884 re_dfa_t *dfa;
885{
886 int i, j, ch;
887 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
bc15410e 888 if (BE (dfa->word_char == NULL, 0))
a9388965 889 return REG_ESPACE;
3b0bdc72
UD
890 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
891 for (j = 0; j < UINT_BITS; ++j, ++ch)
892 if (isalnum (ch) || ch == '_')
15a7d175 893 dfa->word_char[i] |= 1 << j;
a9388965 894 return REG_NOERROR;
3b0bdc72
UD
895}
896
897/* Free the work area which are only used while compiling. */
898
899static void
900free_workarea_compile (preg)
901 regex_t *preg;
902{
903 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
ee70274a
UD
904 bin_tree_storage_t *storage, *next;
905 for (storage = dfa->str_tree_storage; storage; storage = next)
906 {
907 next = storage->next;
908 re_free (storage);
909 }
910 dfa->str_tree_storage = NULL;
911 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3b0bdc72 912 dfa->str_tree = NULL;
a7d5c291
UD
913 re_free (dfa->org_indices);
914 dfa->org_indices = NULL;
3b0bdc72
UD
915}
916
917/* Create initial states for all contexts. */
918
919static reg_errcode_t
920create_initial_state (dfa)
921 re_dfa_t *dfa;
922{
923 int first, i;
924 reg_errcode_t err;
925 re_node_set init_nodes;
926
927 /* Initial states have the epsilon closure of the node which is
928 the first node of the regular expression. */
929 first = dfa->str_tree->first;
930 dfa->init_node = first;
931 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 932 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
933 return err;
934
935 /* The back-references which are in initial states can epsilon transit,
936 since in this case all of the subexpressions can be null.
937 Then we add epsilon closures of the nodes which are the next nodes of
938 the back-references. */
939 if (dfa->nbackref > 0)
940 for (i = 0; i < init_nodes.nelem; ++i)
941 {
15a7d175
UD
942 int node_idx = init_nodes.elems[i];
943 re_token_type_t type = dfa->nodes[node_idx].type;
944
945 int clexp_idx;
946 if (type != OP_BACK_REF)
947 continue;
948 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
949 {
950 re_token_t *clexp_node;
951 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
952 if (clexp_node->type == OP_CLOSE_SUBEXP
953 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
954 break;
955 }
956 if (clexp_idx == init_nodes.nelem)
957 continue;
958
959 if (type == OP_BACK_REF)
960 {
961 int dest_idx = dfa->edests[node_idx].elems[0];
962 if (!re_node_set_contains (&init_nodes, dest_idx))
963 {
964 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
965 i = 0;
966 }
967 }
3b0bdc72
UD
968 }
969
970 /* It must be the first time to invoke acquire_state. */
a9388965
UD
971 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
972 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 973 if (BE (dfa->init_state == NULL, 0))
a9388965 974 return err;
3b0bdc72
UD
975 if (dfa->init_state->has_constraint)
976 {
a9388965 977 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 978 CONTEXT_WORD);
a9388965 979 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 980 CONTEXT_NEWLINE);
a9388965 981 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
982 &init_nodes,
983 CONTEXT_NEWLINE
984 | CONTEXT_BEGBUF);
bc15410e 985 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
15a7d175
UD
986 || dfa->init_state_begbuf == NULL, 0))
987 return err;
3b0bdc72
UD
988 }
989 else
990 dfa->init_state_word = dfa->init_state_nl
991 = dfa->init_state_begbuf = dfa->init_state;
992
3b0bdc72
UD
993 re_node_set_free (&init_nodes);
994 return REG_NOERROR;
995}
996\f
14744156
UD
997#ifdef RE_ENABLE_I18N
998/* If it is possible to do searching in single byte encoding instead of UTF-8
999 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1000 DFA nodes where needed. */
1001
1002static void
1003optimize_utf8 (dfa)
1004 re_dfa_t *dfa;
1005{
ad7f28c2 1006 int node, i, mb_chars = 0, has_period = 0;
14744156
UD
1007
1008 for (node = 0; node < dfa->nodes_len; ++node)
1009 switch (dfa->nodes[node].type)
1010 {
1011 case CHARACTER:
14744156 1012 if (dfa->nodes[node].opr.c >= 0x80)
5f93cd52 1013 mb_chars = 1;
14744156
UD
1014 break;
1015 case ANCHOR:
1016 switch (dfa->nodes[node].opr.idx)
1017 {
1018 case LINE_FIRST:
1019 case LINE_LAST:
1020 case BUF_FIRST:
1021 case BUF_LAST:
1022 break;
1023 default:
1024 /* Word anchors etc. cannot be handled. */
1025 return;
1026 }
1027 break;
ad7f28c2
UD
1028 case OP_PERIOD:
1029 has_period = 1;
1030 break;
14744156
UD
1031 case OP_BACK_REF:
1032 case OP_ALT:
1033 case END_OF_RE:
14744156
UD
1034 case OP_DUP_ASTERISK:
1035 case OP_DUP_QUESTION:
14744156
UD
1036 case OP_OPEN_SUBEXP:
1037 case OP_CLOSE_SUBEXP:
1038 break;
a8067e8f
UD
1039 case SIMPLE_BRACKET:
1040 /* Just double check. */
1041 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1042 if (dfa->nodes[node].opr.sbcset[i])
1043 return;
1044 break;
14744156
UD
1045 default:
1046 return;
1047 }
1048
ad7f28c2 1049 if (mb_chars || has_period)
5f93cd52 1050 for (node = 0; node < dfa->nodes_len; ++node)
ad7f28c2
UD
1051 {
1052 if (dfa->nodes[node].type == CHARACTER
1053 && dfa->nodes[node].opr.c >= 0x80)
1054 dfa->nodes[node].mb_partial = 0;
1055 else if (dfa->nodes[node].type == OP_PERIOD)
1056 dfa->nodes[node].type = OP_UTF8_PERIOD;
1057 }
5f93cd52 1058
14744156
UD
1059 /* The search can be in single byte locale. */
1060 dfa->mb_cur_max = 1;
1061 dfa->is_utf8 = 0;
ad7f28c2 1062 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
14744156
UD
1063}
1064#endif
1065\f
3b0bdc72
UD
1066/* Analyze the structure tree, and calculate "first", "next", "edest",
1067 "eclosure", and "inveclosure". */
1068
1069static reg_errcode_t
1070analyze (dfa)
1071 re_dfa_t *dfa;
1072{
1073 int i;
1074 reg_errcode_t ret;
1075
1076 /* Allocate arrays. */
3b0bdc72 1077 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
a7d5c291 1078 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3b0bdc72
UD
1079 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1080 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1081 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
a7d5c291 1082 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
15a7d175 1083 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
3b0bdc72
UD
1084 return REG_ESPACE;
1085 /* Initialize them. */
1086 for (i = 0; i < dfa->nodes_len; ++i)
1087 {
3b0bdc72
UD
1088 dfa->nexts[i] = -1;
1089 re_node_set_init_empty (dfa->edests + i);
1090 re_node_set_init_empty (dfa->eclosures + i);
1091 re_node_set_init_empty (dfa->inveclosures + i);
1092 }
1093
1094 ret = analyze_tree (dfa, dfa->str_tree);
bc15410e 1095 if (BE (ret == REG_NOERROR, 1))
3b0bdc72
UD
1096 {
1097 ret = calc_eclosure (dfa);
1098 if (ret == REG_NOERROR)
1099 calc_inveclosure (dfa);
1100 }
1101 return ret;
1102}
1103
1104/* Helper functions for analyze.
1105 This function calculate "first", "next", and "edest" for the subtree
1106 whose root is NODE. */
1107
1108static reg_errcode_t
1109analyze_tree (dfa, node)
1110 re_dfa_t *dfa;
1111 bin_tree_t *node;
1112{
1113 reg_errcode_t ret;
1114 if (node->first == -1)
1115 calc_first (dfa, node);
1116 if (node->next == -1)
1117 calc_next (dfa, node);
1118 if (node->eclosure.nelem == 0)
1119 calc_epsdest (dfa, node);
1120 /* Calculate "first" etc. for the left child. */
1121 if (node->left != NULL)
1122 {
1123 ret = analyze_tree (dfa, node->left);
bc15410e 1124 if (BE (ret != REG_NOERROR, 0))
15a7d175 1125 return ret;
3b0bdc72
UD
1126 }
1127 /* Calculate "first" etc. for the right child. */
1128 if (node->right != NULL)
1129 {
1130 ret = analyze_tree (dfa, node->right);
bc15410e 1131 if (BE (ret != REG_NOERROR, 0))
15a7d175 1132 return ret;
3b0bdc72
UD
1133 }
1134 return REG_NOERROR;
1135}
1136
1137/* Calculate "first" for the node NODE. */
1138static void
1139calc_first (dfa, node)
1140 re_dfa_t *dfa;
1141 bin_tree_t *node;
1142{
1143 int idx, type;
1144 idx = node->node_idx;
1145 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1146
1147 switch (type)
1148 {
1149#ifdef DEBUG
3b0bdc72
UD
1150 case OP_OPEN_BRACKET:
1151 case OP_CLOSE_BRACKET:
1152 case OP_OPEN_DUP_NUM:
1153 case OP_CLOSE_DUP_NUM:
6b6557e8 1154 case OP_DUP_PLUS:
3b0bdc72
UD
1155 case OP_NON_MATCH_LIST:
1156 case OP_OPEN_COLL_ELEM:
1157 case OP_CLOSE_COLL_ELEM:
1158 case OP_OPEN_EQUIV_CLASS:
1159 case OP_CLOSE_EQUIV_CLASS:
1160 case OP_OPEN_CHAR_CLASS:
1161 case OP_CLOSE_CHAR_CLASS:
7de66108 1162 /* These must not appear here. */
3b0bdc72
UD
1163 assert (0);
1164#endif
1165 case END_OF_RE:
1166 case CHARACTER:
1167 case OP_PERIOD:
1168 case OP_DUP_ASTERISK:
1169 case OP_DUP_QUESTION:
c0a0f9a3 1170#ifdef RE_ENABLE_I18N
ad7f28c2 1171 case OP_UTF8_PERIOD:
3b0bdc72 1172 case COMPLEX_BRACKET:
c0a0f9a3 1173#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1174 case SIMPLE_BRACKET:
1175 case OP_BACK_REF:
1176 case ANCHOR:
81c64d40
UD
1177 case OP_OPEN_SUBEXP:
1178 case OP_CLOSE_SUBEXP:
3b0bdc72
UD
1179 node->first = idx;
1180 break;
3b0bdc72
UD
1181 case OP_ALT:
1182 node->first = idx;
1183 break;
3b0bdc72
UD
1184 /* else fall through */
1185 default:
1186#ifdef DEBUG
1187 assert (node->left != NULL);
1188#endif
1189 if (node->left->first == -1)
15a7d175 1190 calc_first (dfa, node->left);
3b0bdc72
UD
1191 node->first = node->left->first;
1192 break;
1193 }
3b0bdc72
UD
1194}
1195
1196/* Calculate "next" for the node NODE. */
1197
1198static void
1199calc_next (dfa, node)
1200 re_dfa_t *dfa;
1201 bin_tree_t *node;
1202{
1203 int idx, type;
1204 bin_tree_t *parent = node->parent;
1205 if (parent == NULL)
1206 {
1207 node->next = -1;
1208 idx = node->node_idx;
1209 if (node->type == 0)
15a7d175 1210 dfa->nexts[idx] = node->next;
3b0bdc72
UD
1211 return;
1212 }
1213
1214 idx = parent->node_idx;
1215 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1216
1217 switch (type)
1218 {
1219 case OP_DUP_ASTERISK:
3b0bdc72
UD
1220 node->next = idx;
1221 break;
1222 case CONCAT:
1223 if (parent->left == node)
15a7d175
UD
1224 {
1225 if (parent->right->first == -1)
1226 calc_first (dfa, parent->right);
1227 node->next = parent->right->first;
1228 break;
1229 }
3b0bdc72
UD
1230 /* else fall through */
1231 default:
1232 if (parent->next == -1)
15a7d175 1233 calc_next (dfa, parent);
3b0bdc72
UD
1234 node->next = parent->next;
1235 break;
1236 }
1237 idx = node->node_idx;
1238 if (node->type == 0)
1239 dfa->nexts[idx] = node->next;
1240}
1241
1242/* Calculate "edest" for the node NODE. */
1243
1244static void
1245calc_epsdest (dfa, node)
1246 re_dfa_t *dfa;
1247 bin_tree_t *node;
1248{
1249 int idx;
1250 idx = node->node_idx;
1251 if (node->type == 0)
1252 {
1253 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
15a7d175
UD
1254 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1255 {
1256 if (node->left->first == -1)
1257 calc_first (dfa, node->left);
1258 if (node->next == -1)
1259 calc_next (dfa, node);
1260 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1261 node->next);
1262 }
3b0bdc72 1263 else if (dfa->nodes[idx].type == OP_ALT)
15a7d175
UD
1264 {
1265 int left, right;
1266 if (node->left != NULL)
1267 {
1268 if (node->left->first == -1)
1269 calc_first (dfa, node->left);
1270 left = node->left->first;
1271 }
1272 else
1273 {
1274 if (node->next == -1)
1275 calc_next (dfa, node);
1276 left = node->next;
1277 }
1278 if (node->right != NULL)
1279 {
1280 if (node->right->first == -1)
1281 calc_first (dfa, node->right);
1282 right = node->right->first;
1283 }
1284 else
1285 {
1286 if (node->next == -1)
1287 calc_next (dfa, node);
1288 right = node->next;
1289 }
1290 re_node_set_init_2 (dfa->edests + idx, left, right);
1291 }
81c64d40 1292 else if (dfa->nodes[idx].type == ANCHOR
15a7d175
UD
1293 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1294 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1295 || dfa->nodes[idx].type == OP_BACK_REF)
1296 re_node_set_init_1 (dfa->edests + idx, node->next);
1769a73f
UD
1297 else
1298 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
3b0bdc72
UD
1299 }
1300}
1301
485d775d
UD
1302/* Duplicate the epsilon closure of the node ROOT_NODE.
1303 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1304 to their own constraint. */
1305
1306static reg_errcode_t
1307duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
15a7d175 1308 init_constraint)
485d775d
UD
1309 re_dfa_t *dfa;
1310 int top_org_node, top_clone_node, root_node;
1311 unsigned int init_constraint;
1312{
1313 reg_errcode_t err;
1314 int org_node, clone_node, ret;
1315 unsigned int constraint = init_constraint;
1316 for (org_node = top_org_node, clone_node = top_clone_node;;)
1317 {
1318 int org_dest, clone_dest;
1319 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1320 {
485d775d
UD
1321 /* If the back reference epsilon-transit, its destination must
1322 also have the constraint. Then duplicate the epsilon closure
1323 of the destination of the back reference, and store it in
1324 edests of the back reference. */
15a7d175
UD
1325 org_dest = dfa->nexts[org_node];
1326 re_node_set_empty (dfa->edests + clone_node);
1327 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1328 if (BE (err != REG_NOERROR, 0))
1329 return err;
1330 dfa->nexts[clone_node] = dfa->nexts[org_node];
1331 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1332 if (BE (ret < 0, 0))
1333 return REG_ESPACE;
1334 }
485d775d 1335 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1336 {
485d775d
UD
1337 /* In case of the node can't epsilon-transit, don't duplicate the
1338 destination and store the original destination as the
1339 destination of the node. */
15a7d175
UD
1340 dfa->nexts[clone_node] = dfa->nexts[org_node];
1341 break;
1342 }
485d775d 1343 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1344 {
485d775d
UD
1345 /* In case of the node can epsilon-transit, and it has only one
1346 destination. */
15a7d175
UD
1347 org_dest = dfa->edests[org_node].elems[0];
1348 re_node_set_empty (dfa->edests + clone_node);
1349 if (dfa->nodes[org_node].type == ANCHOR)
1350 {
485d775d 1351 /* In case of the node has another constraint, append it. */
15a7d175
UD
1352 if (org_node == root_node && clone_node != org_node)
1353 {
485d775d
UD
1354 /* ...but if the node is root_node itself, it means the
1355 epsilon closure have a loop, then tie it to the
1356 destination of the root_node. */
15a7d175
UD
1357 ret = re_node_set_insert (dfa->edests + clone_node,
1358 org_dest);
1359 if (BE (ret < 0, 0))
1360 return REG_ESPACE;
1361 break;
1362 }
1363 constraint |= dfa->nodes[org_node].opr.ctx_type;
1364 }
1365 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1366 if (BE (err != REG_NOERROR, 0))
1367 return err;
1368 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1369 if (BE (ret < 0, 0))
1370 return REG_ESPACE;
1371 }
485d775d 1372 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1373 {
485d775d 1374 /* In case of the node can epsilon-transit, and it has two
a7d5c291 1375 destinations. E.g. '|', '*', '+', '?'. */
15a7d175
UD
1376 org_dest = dfa->edests[org_node].elems[0];
1377 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1378 /* Search for a duplicated node which satisfies the constraint. */
1379 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1380 if (clone_dest == -1)
1381 {
1382 /* There are no such a duplicated node, create a new one. */
1383 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1384 if (BE (err != REG_NOERROR, 0))
1385 return err;
7de66108
UD
1386 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1387 if (BE (ret < 0, 0))
1388 return REG_ESPACE;
1389 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1390 root_node, constraint);
1391 if (BE (err != REG_NOERROR, 0))
1392 return err;
a7d5c291
UD
1393 }
1394 else
1395 {
1396 /* There are a duplicated node which satisfy the constraint,
1397 use it to avoid infinite loop. */
1398 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1399 if (BE (ret < 0, 0))
1400 return REG_ESPACE;
1401 }
15a7d175
UD
1402
1403 org_dest = dfa->edests[org_node].elems[1];
1404 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1405 if (BE (err != REG_NOERROR, 0))
1406 return err;
1407 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1408 if (BE (ret < 0, 0))
1409 return REG_ESPACE;
1410 }
485d775d
UD
1411 org_node = org_dest;
1412 clone_node = clone_dest;
1413 }
1414 return REG_NOERROR;
1415}
1416
a7d5c291
UD
1417/* Search for a node which is duplicated from the node ORG_NODE, and
1418 satisfies the constraint CONSTRAINT. */
1419
1420static int
1421search_duplicated_node (dfa, org_node, constraint)
1422 re_dfa_t *dfa;
1423 int org_node;
1424 unsigned int constraint;
1425{
1426 int idx;
1427 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1428 {
1429 if (org_node == dfa->org_indices[idx]
1430 && constraint == dfa->nodes[idx].constraint)
1431 return idx; /* Found. */
1432 }
1433 return -1; /* Not found. */
1434}
1435
a9388965
UD
1436/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1437 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1438 otherwise return the error code. */
1439
1440static reg_errcode_t
1441duplicate_node (new_idx, dfa, org_idx, constraint)
3b0bdc72 1442 re_dfa_t *dfa;
a9388965 1443 int *new_idx, org_idx;
3b0bdc72
UD
1444 unsigned int constraint;
1445{
7de66108 1446 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
485d775d 1447 if (BE (dup_idx == -1, 0))
a9388965 1448 return REG_ESPACE;
485d775d
UD
1449 dfa->nodes[dup_idx].constraint = constraint;
1450 if (dfa->nodes[org_idx].type == ANCHOR)
1451 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
a9388965 1452 dfa->nodes[dup_idx].duplicated = 1;
485d775d
UD
1453 re_node_set_init_empty (dfa->edests + dup_idx);
1454 re_node_set_init_empty (dfa->eclosures + dup_idx);
1455 re_node_set_init_empty (dfa->inveclosures + dup_idx);
3b0bdc72 1456
a7d5c291
UD
1457 /* Store the index of the original node. */
1458 dfa->org_indices[dup_idx] = org_idx;
a9388965
UD
1459 *new_idx = dup_idx;
1460 return REG_NOERROR;
3b0bdc72
UD
1461}
1462
1463static void
1464calc_inveclosure (dfa)
1465 re_dfa_t *dfa;
1466{
485d775d 1467 int src, idx, dest;
3b0bdc72
UD
1468 for (src = 0; src < dfa->nodes_len; ++src)
1469 {
1470 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175
UD
1471 {
1472 dest = dfa->eclosures[src].elems[idx];
1473 re_node_set_insert (dfa->inveclosures + dest, src);
1474 }
3b0bdc72
UD
1475 }
1476}
1477
1478/* Calculate "eclosure" for all the node in DFA. */
1479
1480static reg_errcode_t
1481calc_eclosure (dfa)
1482 re_dfa_t *dfa;
1483{
485d775d 1484 int node_idx, incomplete;
3b0bdc72
UD
1485#ifdef DEBUG
1486 assert (dfa->nodes_len > 0);
1487#endif
485d775d 1488 incomplete = 0;
3b0bdc72 1489 /* For each nodes, calculate epsilon closure. */
485d775d 1490 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1491 {
a9388965 1492 reg_errcode_t err;
3b0bdc72 1493 re_node_set eclosure_elem;
485d775d 1494 if (node_idx == dfa->nodes_len)
15a7d175
UD
1495 {
1496 if (!incomplete)
1497 break;
1498 incomplete = 0;
1499 node_idx = 0;
1500 }
3b0bdc72
UD
1501
1502#ifdef DEBUG
3b0bdc72
UD
1503 assert (dfa->eclosures[node_idx].nelem != -1);
1504#endif
1505 /* If we have already calculated, skip it. */
1506 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1507 continue;
3b0bdc72 1508 /* Calculate epsilon closure of `node_idx'. */
a9388965 1509 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
bc15410e 1510 if (BE (err != REG_NOERROR, 0))
15a7d175 1511 return err;
3b0bdc72
UD
1512
1513 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175
UD
1514 {
1515 incomplete = 1;
1516 re_node_set_free (&eclosure_elem);
1517 }
3b0bdc72 1518 }
3b0bdc72
UD
1519 return REG_NOERROR;
1520}
1521
1522/* Calculate epsilon closure of NODE. */
1523
a9388965
UD
1524static reg_errcode_t
1525calc_eclosure_iter (new_set, dfa, node, root)
1526 re_node_set *new_set;
3b0bdc72
UD
1527 re_dfa_t *dfa;
1528 int node, root;
1529{
a9388965 1530 reg_errcode_t err;
3b0bdc72 1531 unsigned int constraint;
485d775d 1532 int i, incomplete;
3b0bdc72 1533 re_node_set eclosure;
485d775d 1534 incomplete = 0;
a9388965 1535 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1536 if (BE (err != REG_NOERROR, 0))
a9388965 1537 return err;
3b0bdc72
UD
1538
1539 /* This indicates that we are calculating this node now.
1540 We reference this value to avoid infinite loop. */
1541 dfa->eclosures[node].nelem = -1;
1542
1543 constraint = ((dfa->nodes[node].type == ANCHOR)
15a7d175 1544 ? dfa->nodes[node].opr.ctx_type : 0);
485d775d
UD
1545 /* If the current node has constraints, duplicate all nodes.
1546 Since they must inherit the constraints. */
1547 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1548 {
1549 int org_node, cur_node;
1550 org_node = cur_node = node;
1551 err = duplicate_node_closure (dfa, node, node, node, constraint);
1552 if (BE (err != REG_NOERROR, 0))
15a7d175 1553 return err;
485d775d 1554 }
3b0bdc72
UD
1555
1556 /* Expand each epsilon destination nodes. */
485d775d 1557 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1558 for (i = 0; i < dfa->edests[node].nelem; ++i)
1559 {
15a7d175
UD
1560 re_node_set eclosure_elem;
1561 int edest = dfa->edests[node].elems[i];
1562 /* If calculating the epsilon closure of `edest' is in progress,
1563 return intermediate result. */
1564 if (dfa->eclosures[edest].nelem == -1)
1565 {
1566 incomplete = 1;
1567 continue;
1568 }
1569 /* If we haven't calculated the epsilon closure of `edest' yet,
1570 calculate now. Otherwise use calculated epsilon closure. */
1571 if (dfa->eclosures[edest].nelem == 0)
1572 {
1573 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1574 if (BE (err != REG_NOERROR, 0))
1575 return err;
1576 }
1577 else
1578 eclosure_elem = dfa->eclosures[edest];
1579 /* Merge the epsilon closure of `edest'. */
1580 re_node_set_merge (&eclosure, &eclosure_elem);
1581 /* If the epsilon closure of `edest' is incomplete,
1582 the epsilon closure of this node is also incomplete. */
1583 if (dfa->eclosures[edest].nelem == 0)
1584 {
1585 incomplete = 1;
1586 re_node_set_free (&eclosure_elem);
1587 }
3b0bdc72
UD
1588 }
1589
3b0bdc72
UD
1590 /* Epsilon closures include itself. */
1591 re_node_set_insert (&eclosure, node);
1592 if (incomplete && !root)
1593 dfa->eclosures[node].nelem = 0;
1594 else
1595 dfa->eclosures[node] = eclosure;
a9388965
UD
1596 *new_set = eclosure;
1597 return REG_NOERROR;
3b0bdc72
UD
1598}
1599\f
1600/* Functions for token which are used in the parser. */
1601
1602/* Fetch a token from INPUT.
1603 We must not use this function inside bracket expressions. */
1604
f0d77aa8
UD
1605static void
1606fetch_token (result, input, syntax)
1607 re_token_t *result;
3b0bdc72
UD
1608 re_string_t *input;
1609 reg_syntax_t syntax;
1610{
f0d77aa8 1611 re_string_skip_bytes (input, peek_token (result, input, syntax));
3b0bdc72
UD
1612}
1613
1614/* Peek a token from INPUT, and return the length of the token.
1615 We must not use this function inside bracket expressions. */
1616
1617static int
1618peek_token (token, input, syntax)
1619 re_token_t *token;
1620 re_string_t *input;
1621 reg_syntax_t syntax;
1622{
1623 unsigned char c;
1624
1625 if (re_string_eoi (input))
1626 {
1627 token->type = END_OF_RE;
1628 return 0;
1629 }
1630
1631 c = re_string_peek_byte (input, 0);
1632 token->opr.c = c;
1633
65e6becf 1634 token->word_char = 0;
3b0bdc72
UD
1635#ifdef RE_ENABLE_I18N
1636 token->mb_partial = 0;
3c0fb574 1637 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1638 !re_string_first_byte (input, re_string_cur_idx (input)))
1639 {
1640 token->type = CHARACTER;
1641 token->mb_partial = 1;
1642 return 1;
1643 }
1644#endif
1645 if (c == '\\')
1646 {
1647 unsigned char c2;
1648 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1649 {
1650 token->type = BACK_SLASH;
1651 return 1;
1652 }
3b0bdc72
UD
1653
1654 c2 = re_string_peek_byte_case (input, 1);
1655 token->opr.c = c2;
1656 token->type = CHARACTER;
65e6becf
UD
1657#ifdef RE_ENABLE_I18N
1658 if (input->mb_cur_max > 1)
1659 {
1660 wint_t wc = re_string_wchar_at (input,
1661 re_string_cur_idx (input) + 1);
1662 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1663 }
1664 else
1665#endif
1666 token->word_char = IS_WORD_CHAR (c2) != 0;
1667
3b0bdc72 1668 switch (c2)
15a7d175
UD
1669 {
1670 case '|':
1671 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1672 token->type = OP_ALT;
1673 break;
1674 case '1': case '2': case '3': case '4': case '5':
1675 case '6': case '7': case '8': case '9':
1676 if (!(syntax & RE_NO_BK_REFS))
1677 {
1678 token->type = OP_BACK_REF;
1679 token->opr.idx = c2 - '0';
1680 }
1681 break;
1682 case '<':
1683 if (!(syntax & RE_NO_GNU_OPS))
1684 {
1685 token->type = ANCHOR;
bb3f4825 1686 token->opr.ctx_type = WORD_FIRST;
15a7d175
UD
1687 }
1688 break;
1689 case '>':
1690 if (!(syntax & RE_NO_GNU_OPS))
1691 {
1692 token->type = ANCHOR;
bb3f4825 1693 token->opr.ctx_type = WORD_LAST;
15a7d175
UD
1694 }
1695 break;
1696 case 'b':
1697 if (!(syntax & RE_NO_GNU_OPS))
1698 {
1699 token->type = ANCHOR;
bb3f4825 1700 token->opr.ctx_type = WORD_DELIM;
15a7d175
UD
1701 }
1702 break;
1703 case 'B':
1704 if (!(syntax & RE_NO_GNU_OPS))
1705 {
1706 token->type = ANCHOR;
bb3f4825 1707 token->opr.ctx_type = INSIDE_WORD;
15a7d175
UD
1708 }
1709 break;
1710 case 'w':
1711 if (!(syntax & RE_NO_GNU_OPS))
1712 token->type = OP_WORD;
1713 break;
1714 case 'W':
1715 if (!(syntax & RE_NO_GNU_OPS))
1716 token->type = OP_NOTWORD;
1717 break;
e2b6bfa3
UD
1718 case 's':
1719 if (!(syntax & RE_NO_GNU_OPS))
1720 token->type = OP_SPACE;
1721 break;
1722 case 'S':
1723 if (!(syntax & RE_NO_GNU_OPS))
1724 token->type = OP_NOTSPACE;
1725 break;
15a7d175
UD
1726 case '`':
1727 if (!(syntax & RE_NO_GNU_OPS))
1728 {
1729 token->type = ANCHOR;
bb3f4825 1730 token->opr.ctx_type = BUF_FIRST;
15a7d175
UD
1731 }
1732 break;
1733 case '\'':
1734 if (!(syntax & RE_NO_GNU_OPS))
1735 {
1736 token->type = ANCHOR;
bb3f4825 1737 token->opr.ctx_type = BUF_LAST;
15a7d175
UD
1738 }
1739 break;
1740 case '(':
1741 if (!(syntax & RE_NO_BK_PARENS))
1742 token->type = OP_OPEN_SUBEXP;
1743 break;
1744 case ')':
1745 if (!(syntax & RE_NO_BK_PARENS))
1746 token->type = OP_CLOSE_SUBEXP;
1747 break;
1748 case '+':
1749 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1750 token->type = OP_DUP_PLUS;
1751 break;
1752 case '?':
1753 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1754 token->type = OP_DUP_QUESTION;
1755 break;
1756 case '{':
1757 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1758 token->type = OP_OPEN_DUP_NUM;
1759 break;
1760 case '}':
1761 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1762 token->type = OP_CLOSE_DUP_NUM;
1763 break;
1764 default:
1765 break;
1766 }
3b0bdc72
UD
1767 return 2;
1768 }
1769
1770 token->type = CHARACTER;
65e6becf
UD
1771#ifdef RE_ENABLE_I18N
1772 if (input->mb_cur_max > 1)
1773 {
1774 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1775 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1776 }
1777 else
1778#endif
1779 token->word_char = IS_WORD_CHAR (token->opr.c);
1780
3b0bdc72
UD
1781 switch (c)
1782 {
1783 case '\n':
1784 if (syntax & RE_NEWLINE_ALT)
15a7d175 1785 token->type = OP_ALT;
3b0bdc72
UD
1786 break;
1787 case '|':
1788 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1789 token->type = OP_ALT;
3b0bdc72
UD
1790 break;
1791 case '*':
1792 token->type = OP_DUP_ASTERISK;
1793 break;
1794 case '+':
1795 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1796 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1797 break;
1798 case '?':
1799 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1800 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1801 break;
1802 case '{':
1803 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1804 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1805 break;
1806 case '}':
1807 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1808 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1809 break;
1810 case '(':
1811 if (syntax & RE_NO_BK_PARENS)
15a7d175 1812 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1813 break;
1814 case ')':
1815 if (syntax & RE_NO_BK_PARENS)
15a7d175 1816 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1817 break;
1818 case '[':
1819 token->type = OP_OPEN_BRACKET;
1820 break;
1821 case '.':
1822 token->type = OP_PERIOD;
1823 break;
1824 case '^':
134abcb5 1825 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
15a7d175
UD
1826 re_string_cur_idx (input) != 0)
1827 {
1828 char prev = re_string_peek_byte (input, -1);
134abcb5 1829 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1830 break;
1831 }
3b0bdc72 1832 token->type = ANCHOR;
bb3f4825 1833 token->opr.ctx_type = LINE_FIRST;
3b0bdc72
UD
1834 break;
1835 case '$':
1836 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
15a7d175
UD
1837 re_string_cur_idx (input) + 1 != re_string_length (input))
1838 {
1839 re_token_t next;
1840 re_string_skip_bytes (input, 1);
1841 peek_token (&next, input, syntax);
1842 re_string_skip_bytes (input, -1);
1843 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1844 break;
1845 }
3b0bdc72 1846 token->type = ANCHOR;
bb3f4825 1847 token->opr.ctx_type = LINE_LAST;
3b0bdc72
UD
1848 break;
1849 default:
1850 break;
1851 }
1852 return 1;
1853}
1854
1855/* Peek a token from INPUT, and return the length of the token.
1856 We must not use this function out of bracket expressions. */
1857
1858static int
1859peek_token_bracket (token, input, syntax)
1860 re_token_t *token;
1861 re_string_t *input;
1862 reg_syntax_t syntax;
1863{
1864 unsigned char c;
1865 if (re_string_eoi (input))
1866 {
1867 token->type = END_OF_RE;
1868 return 0;
1869 }
1870 c = re_string_peek_byte (input, 0);
1871 token->opr.c = c;
1872
1873#ifdef RE_ENABLE_I18N
3c0fb574 1874 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1875 !re_string_first_byte (input, re_string_cur_idx (input)))
1876 {
1877 token->type = CHARACTER;
1878 return 1;
1879 }
1880#endif /* RE_ENABLE_I18N */
1881
1882 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1883 {
1884 /* In this case, '\' escape a character. */
1885 unsigned char c2;
82bbb29e
UD
1886 re_string_skip_bytes (input, 1);
1887 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
1888 token->opr.c = c2;
1889 token->type = CHARACTER;
1890 return 1;
1891 }
1892 if (c == '[') /* '[' is a special char in a bracket exps. */
1893 {
1894 unsigned char c2;
1895 int token_len;
1896 c2 = re_string_peek_byte (input, 1);
1897 token->opr.c = c2;
1898 token_len = 2;
1899 switch (c2)
15a7d175
UD
1900 {
1901 case '.':
1902 token->type = OP_OPEN_COLL_ELEM;
1903 break;
1904 case '=':
1905 token->type = OP_OPEN_EQUIV_CLASS;
1906 break;
1907 case ':':
1908 if (syntax & RE_CHAR_CLASSES)
1909 {
1910 token->type = OP_OPEN_CHAR_CLASS;
1911 break;
1912 }
1913 /* else fall through. */
1914 default:
1915 token->type = CHARACTER;
1916 token->opr.c = c;
1917 token_len = 1;
1918 break;
1919 }
3b0bdc72
UD
1920 return token_len;
1921 }
1922 switch (c)
1923 {
1924 case '-':
1925 token->type = OP_CHARSET_RANGE;
1926 break;
1927 case ']':
1928 token->type = OP_CLOSE_BRACKET;
1929 break;
1930 case '^':
1931 token->type = OP_NON_MATCH_LIST;
1932 break;
1933 default:
1934 token->type = CHARACTER;
1935 }
1936 return 1;
1937}
1938\f
1939/* Functions for parser. */
1940
1941/* Entry point of the parser.
1942 Parse the regular expression REGEXP and return the structure tree.
1943 If an error is occured, ERR is set by error code, and return NULL.
1944 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
1945 CAT
1946 / \
1947 / \
3b0bdc72
UD
1948 <reg_exp> EOR
1949
1950 CAT means concatenation.
1951 EOR means end of regular expression. */
1952
1953static bin_tree_t *
1954parse (regexp, preg, syntax, err)
1955 re_string_t *regexp;
1956 regex_t *preg;
1957 reg_syntax_t syntax;
1958 reg_errcode_t *err;
1959{
1960 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1961 bin_tree_t *tree, *eor, *root;
1962 re_token_t current_token;
f0d77aa8 1963 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 1964 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 1965 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72 1966 return NULL;
c5c9c929 1967 eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
3b0bdc72 1968 if (tree != NULL)
ee70274a 1969 root = create_tree (dfa, tree, eor, CONCAT, 0);
3b0bdc72
UD
1970 else
1971 root = eor;
ee70274a 1972 if (BE (eor == NULL || root == NULL, 0))
485d775d
UD
1973 {
1974 *err = REG_ESPACE;
1975 return NULL;
1976 }
3b0bdc72
UD
1977 return root;
1978}
1979
1980/* This function build the following tree, from regular expression
1981 <branch1>|<branch2>:
15a7d175
UD
1982 ALT
1983 / \
1984 / \
3b0bdc72
UD
1985 <branch1> <branch2>
1986
1987 ALT means alternative, which represents the operator `|'. */
1988
1989static bin_tree_t *
1990parse_reg_exp (regexp, preg, token, syntax, nest, err)
1991 re_string_t *regexp;
1992 regex_t *preg;
1993 re_token_t *token;
1994 reg_syntax_t syntax;
1995 int nest;
1996 reg_errcode_t *err;
1997{
1998 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1999 bin_tree_t *tree, *branch = NULL;
3b0bdc72 2000 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 2001 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2002 return NULL;
2003
2004 while (token->type == OP_ALT)
2005 {
2006 re_token_t alt_token = *token;
f0d77aa8 2007 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2008 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
2009 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2010 {
2011 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2012 if (BE (*err != REG_NOERROR && branch == NULL, 0))
ee70274a 2013 return NULL;
15a7d175 2014 }
9b88fc16
UD
2015 else
2016 branch = NULL;
c5c9c929 2017 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
ee70274a 2018 if (BE (tree == NULL, 0))
15a7d175
UD
2019 {
2020 *err = REG_ESPACE;
2021 return NULL;
2022 }
0742e48e 2023 dfa->has_plural_match = 1;
3b0bdc72
UD
2024 }
2025 return tree;
2026}
2027
2028/* This function build the following tree, from regular expression
2029 <exp1><exp2>:
15a7d175
UD
2030 CAT
2031 / \
3b0bdc72
UD
2032 / \
2033 <exp1> <exp2>
2034
2035 CAT means concatenation. */
2036
2037static bin_tree_t *
2038parse_branch (regexp, preg, token, syntax, nest, err)
2039 re_string_t *regexp;
2040 regex_t *preg;
2041 re_token_t *token;
2042 reg_syntax_t syntax;
2043 int nest;
2044 reg_errcode_t *err;
2045{
2046 bin_tree_t *tree, *exp;
ee70274a 2047 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3b0bdc72 2048 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2049 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2050 return NULL;
2051
2052 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 2053 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72
UD
2054 {
2055 exp = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2056 if (BE (*err != REG_NOERROR && exp == NULL, 0))
15a7d175 2057 {
15a7d175
UD
2058 return NULL;
2059 }
3b0bdc72 2060 if (tree != NULL && exp != NULL)
15a7d175 2061 {
ee70274a 2062 tree = create_tree (dfa, tree, exp, CONCAT, 0);
15a7d175
UD
2063 if (tree == NULL)
2064 {
2065 *err = REG_ESPACE;
2066 return NULL;
2067 }
2068 }
3b0bdc72 2069 else if (tree == NULL)
15a7d175 2070 tree = exp;
3b0bdc72
UD
2071 /* Otherwise exp == NULL, we don't need to create new tree. */
2072 }
2073 return tree;
2074}
2075
2076/* This function build the following tree, from regular expression a*:
15a7d175
UD
2077 *
2078 |
2079 a
3b0bdc72
UD
2080*/
2081
2082static bin_tree_t *
2083parse_expression (regexp, preg, token, syntax, nest, err)
2084 re_string_t *regexp;
2085 regex_t *preg;
2086 re_token_t *token;
2087 reg_syntax_t syntax;
2088 int nest;
2089 reg_errcode_t *err;
2090{
2091 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2092 bin_tree_t *tree;
3b0bdc72
UD
2093 switch (token->type)
2094 {
2095 case CHARACTER:
c5c9c929 2096 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2097 if (BE (tree == NULL, 0))
15a7d175
UD
2098 {
2099 *err = REG_ESPACE;
2100 return NULL;
2101 }
3b0bdc72 2102#ifdef RE_ENABLE_I18N
3c0fb574 2103 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2104 {
2105 while (!re_string_eoi (regexp)
2106 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2107 {
15a7d175 2108 bin_tree_t *mbc_remain;
f0d77aa8 2109 fetch_token (token, regexp, syntax);
c5c9c929 2110 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a
UD
2111 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2112 if (BE (mbc_remain == NULL || tree == NULL, 0))
54e1cabc
UD
2113 {
2114 *err = REG_ESPACE;
2115 return NULL;
2116 }
15a7d175 2117 }
3b0bdc72
UD
2118 }
2119#endif
2120 break;
2121 case OP_OPEN_SUBEXP:
2122 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 2123 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2124 return NULL;
3b0bdc72
UD
2125 break;
2126 case OP_OPEN_BRACKET:
2127 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 2128 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2129 return NULL;
3b0bdc72
UD
2130 break;
2131 case OP_BACK_REF:
bc15410e 2132 if (BE (preg->re_nsub < token->opr.idx
15a7d175
UD
2133 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2134 {
2135 *err = REG_ESUBREG;
2136 return NULL;
2137 }
6291ee3c 2138 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
c5c9c929 2139 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2140 if (BE (tree == NULL, 0))
15a7d175
UD
2141 {
2142 *err = REG_ESPACE;
2143 return NULL;
2144 }
3b0bdc72
UD
2145 ++dfa->nbackref;
2146 dfa->has_mb_node = 1;
2147 break;
06e8303a
UD
2148 case OP_OPEN_DUP_NUM:
2149 if (syntax & RE_CONTEXT_INVALID_DUP)
2150 {
2151 *err = REG_BADRPT;
2152 return NULL;
2153 }
2154 /* FALLTHROUGH */
3b0bdc72
UD
2155 case OP_DUP_ASTERISK:
2156 case OP_DUP_PLUS:
2157 case OP_DUP_QUESTION:
3b0bdc72 2158 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2159 {
2160 *err = REG_BADRPT;
2161 return NULL;
2162 }
3b0bdc72 2163 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175 2164 {
f0d77aa8 2165 fetch_token (token, regexp, syntax);
15a7d175
UD
2166 return parse_expression (regexp, preg, token, syntax, nest, err);
2167 }
3b0bdc72
UD
2168 /* else fall through */
2169 case OP_CLOSE_SUBEXP:
2170 if ((token->type == OP_CLOSE_SUBEXP) &&
15a7d175
UD
2171 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2172 {
2173 *err = REG_ERPAREN;
2174 return NULL;
2175 }
3b0bdc72
UD
2176 /* else fall through */
2177 case OP_CLOSE_DUP_NUM:
2178 /* We treat it as a normal character. */
2179
2180 /* Then we can these characters as normal characters. */
2181 token->type = CHARACTER;
65e6becf
UD
2182 /* mb_partial and word_char bits should be initialized already
2183 by peek_token. */
c5c9c929 2184 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2185 if (BE (tree == NULL, 0))
15a7d175
UD
2186 {
2187 *err = REG_ESPACE;
2188 return NULL;
2189 }
3b0bdc72
UD
2190 break;
2191 case ANCHOR:
bb3f4825
UD
2192 if ((token->opr.ctx_type
2193 & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2194 && dfa->word_char == NULL)
15a7d175
UD
2195 {
2196 *err = init_word_char (dfa);
2197 if (BE (*err != REG_NOERROR, 0))
2198 return NULL;
2199 }
3b0bdc72 2200 if (token->opr.ctx_type == WORD_DELIM)
15a7d175
UD
2201 {
2202 bin_tree_t *tree_first, *tree_last;
15a7d175 2203 token->opr.ctx_type = WORD_FIRST;
c5c9c929 2204 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
15a7d175 2205 token->opr.ctx_type = WORD_LAST;
c5c9c929 2206 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
15a7d175 2207 token->type = OP_ALT;
c5c9c929 2208 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
ee70274a 2209 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
15a7d175
UD
2210 {
2211 *err = REG_ESPACE;
2212 return NULL;
2213 }
2214 }
3b0bdc72 2215 else
15a7d175 2216 {
c5c9c929 2217 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2218 if (BE (tree == NULL, 0))
54e1cabc
UD
2219 {
2220 *err = REG_ESPACE;
2221 return NULL;
2222 }
15a7d175 2223 }
3b0bdc72 2224 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2225 by repetition operators.
2226 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2227 it must not be "<ANCHOR(^)><REPEAT(*)>". */
f0d77aa8 2228 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2229 return tree;
2230 case OP_PERIOD:
c5c9c929 2231 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2232 if (BE (tree == NULL, 0))
15a7d175
UD
2233 {
2234 *err = REG_ESPACE;
2235 return NULL;
2236 }
3c0fb574 2237 if (dfa->mb_cur_max > 1)
15a7d175 2238 dfa->has_mb_node = 1;
3b0bdc72
UD
2239 break;
2240 case OP_WORD:
e2b6bfa3 2241 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
bc15410e 2242 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2243 return NULL;
3b0bdc72
UD
2244 break;
2245 case OP_NOTWORD:
e2b6bfa3
UD
2246 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2247 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2248 return NULL;
2249 break;
2250 case OP_SPACE:
2251 tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2252 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2253 return NULL;
2254 break;
2255 case OP_NOTSPACE:
2256 tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
bc15410e 2257 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2258 return NULL;
3b0bdc72
UD
2259 break;
2260 case OP_ALT:
2261 case END_OF_RE:
2262 return NULL;
2263 case BACK_SLASH:
2264 *err = REG_EESCAPE;
2265 return NULL;
2266 default:
2267 /* Must not happen? */
2268#ifdef DEBUG
2269 assert (0);
2270#endif
2271 return NULL;
2272 }
f0d77aa8 2273 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2274
2275 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2276 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72
UD
2277 {
2278 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
bc15410e 2279 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2280 return NULL;
c34bfc8d
UD
2281 /* In BRE consecutive duplications are not allowed. */
2282 if ((syntax & RE_CONTEXT_INVALID_DUP)
2283 && (token->type == OP_DUP_ASTERISK
2284 || token->type == OP_OPEN_DUP_NUM))
2285 {
2286 *err = REG_BADRPT;
2287 return NULL;
2288 }
0742e48e 2289 dfa->has_plural_match = 1;
3b0bdc72
UD
2290 }
2291
2292 return tree;
2293}
2294
2295/* This function build the following tree, from regular expression
2296 (<reg_exp>):
15a7d175
UD
2297 SUBEXP
2298 |
2299 <reg_exp>
3b0bdc72
UD
2300*/
2301
2302static bin_tree_t *
2303parse_sub_exp (regexp, preg, token, syntax, nest, err)
2304 re_string_t *regexp;
2305 regex_t *preg;
2306 re_token_t *token;
2307 reg_syntax_t syntax;
2308 int nest;
2309 reg_errcode_t *err;
2310{
2311 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
81c64d40 2312 bin_tree_t *tree, *left_par, *right_par;
3b0bdc72
UD
2313 size_t cur_nsub;
2314 cur_nsub = preg->re_nsub++;
951d6408 2315 if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
3b0bdc72
UD
2316 {
2317 re_subexp_t *new_array;
2318 dfa->subexps_alloc *= 2;
2319 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
bc15410e 2320 if (BE (new_array == NULL, 0))
3b0bdc72
UD
2321 {
2322 dfa->subexps_alloc /= 2;
2323 *err = REG_ESPACE;
2324 return NULL;
2325 }
2326 dfa->subexps = new_array;
2327 }
2328 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2329 dfa->subexps[cur_nsub].end = -1;
81c64d40 2330
c5c9c929 2331 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2332 if (BE (left_par == NULL, 0))
485d775d
UD
2333 {
2334 *err = REG_ESPACE;
2335 return NULL;
2336 }
ee70274a 2337 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
f0d77aa8 2338 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2339
2340 /* The subexpression may be a null string. */
2341 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2342 tree = NULL;
3b0bdc72
UD
2343 else
2344 {
2345 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
bc15410e 2346 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2347 return NULL;
3b0bdc72 2348 }
81c64d40
UD
2349 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2350 {
06e8303a 2351 *err = REG_EPAREN;
81c64d40
UD
2352 return NULL;
2353 }
c5c9c929 2354 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
81c64d40 2355 dfa->subexps[cur_nsub].end = dfa->nodes_len;
81c64d40 2356 tree = ((tree == NULL) ? right_par
ee70274a
UD
2357 : create_tree (dfa, tree, right_par, CONCAT, 0));
2358 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2359 if (BE (right_par == NULL || tree == NULL, 0))
485d775d
UD
2360 {
2361 *err = REG_ESPACE;
2362 return NULL;
2363 }
ee70274a 2364 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
81c64d40 2365
3b0bdc72
UD
2366 return tree;
2367}
2368
2369/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2370
2371static bin_tree_t *
6b6557e8
UD
2372parse_dup_op (elem, regexp, dfa, token, syntax, err)
2373 bin_tree_t *elem;
3b0bdc72
UD
2374 re_string_t *regexp;
2375 re_dfa_t *dfa;
2376 re_token_t *token;
2377 reg_syntax_t syntax;
2378 reg_errcode_t *err;
2379{
2380 re_token_t dup_token;
6b6557e8
UD
2381 bin_tree_t *tree = NULL;
2382 int i, start, end, start_idx = re_string_cur_idx (regexp);
3b0bdc72 2383 re_token_t start_token = *token;
6b6557e8 2384
3b0bdc72
UD
2385 if (token->type == OP_OPEN_DUP_NUM)
2386 {
6b6557e8
UD
2387 end = 0;
2388 start = fetch_number (regexp, token, syntax);
3b0bdc72 2389 if (start == -1)
15a7d175
UD
2390 {
2391 if (token->type == CHARACTER && token->opr.c == ',')
2392 start = 0; /* We treat "{,m}" as "{0,m}". */
2393 else
2394 {
2395 *err = REG_BADBR; /* <re>{} is invalid. */
2396 return NULL;
2397 }
2398 }
602c2f9d 2399 if (BE (start != -2, 1))
15a7d175
UD
2400 {
2401 /* We treat "{n}" as "{n,n}". */
2402 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2403 : ((token->type == CHARACTER && token->opr.c == ',')
2404 ? fetch_number (regexp, token, syntax) : -2));
2405 }
602c2f9d 2406 if (BE (start == -2 || end == -2, 0))
15a7d175
UD
2407 {
2408 /* Invalid sequence. */
6b6557e8
UD
2409 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2410 {
2411 if (token->type == END_OF_RE)
2412 *err = REG_EBRACE;
2413 else
2414 *err = REG_BADBR;
2415
2416 return NULL;
2417 }
2418
2419 /* If the syntax bit is set, rollback. */
2420 re_string_set_index (regexp, start_idx);
2421 *token = start_token;
2422 token->type = CHARACTER;
2423 /* mb_partial and word_char bits should be already initialized by
2424 peek_token. */
2425 return elem;
15a7d175 2426 }
6b6557e8
UD
2427
2428 if (BE (end != -1 && start > end, 0))
15a7d175 2429 {
6b6557e8
UD
2430 /* First number greater than second. */
2431 *err = REG_BADBR;
15a7d175
UD
2432 return NULL;
2433 }
6b6557e8
UD
2434 }
2435 else
2436 {
2437 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2438 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2439 }
2440
2441 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2442 if (BE (elem == NULL, 0))
2443 start = end = 0;
602c2f9d 2444
6b6557e8
UD
2445 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2446 else if (BE (start > 0, 0))
2447 {
2448 tree = elem;
2449 for (i = 2; i <= start; ++i)
7de66108 2450 {
6b6557e8
UD
2451 elem = duplicate_tree (elem, dfa);
2452 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2453 if (BE (elem == NULL || tree == NULL, 0))
7de66108
UD
2454 goto parse_dup_op_espace;
2455 }
6b6557e8 2456 }
3b0bdc72 2457
6b6557e8
UD
2458 if (BE (end != start, 1))
2459 {
2460 dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2461 if (BE (start > 0, 0))
deeb4aae 2462 {
6b6557e8
UD
2463 elem = duplicate_tree (elem, dfa);
2464 if (BE (elem == NULL, 0))
2465 goto parse_dup_op_espace;
2466
2467 /* This subexpression will be marked as optional, so that
2468 empty matches do not touch the registers. */
2469 mark_opt_subexp (elem, dfa);
2470
2471 /* Prepare the tree with the modifier. */
2472 elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2473 tree = create_tree (dfa, tree, elem, CONCAT, 0);
deeb4aae 2474 }
6b6557e8 2475 else
15a7d175 2476 {
6b6557e8
UD
2477 /* We do not need to duplicate the tree because we have not
2478 created it yet. */
2479 mark_opt_subexp (elem, dfa);
2480 tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2481 }
2482
2483 if (BE (elem == NULL || tree == NULL, 0))
2484 goto parse_dup_op_espace;
2485
2486 /* This loop is actually executed only when end != -1,
2487 to rewrite <re>{0,n} as <re>?<re>?<re>?... We have
2488 already created the start+1-th copy. */
2489 for (i = start + 2; i <= end; ++i)
2490 {
2491 elem = duplicate_tree (elem, dfa);
2492 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2493 if (BE (elem == NULL || tree == NULL, 0))
15a7d175 2494 {
6b6557e8
UD
2495 *err = REG_ESPACE;
2496 return NULL;
15a7d175 2497 }
6b6557e8 2498 }
3b0bdc72 2499 }
6b6557e8 2500
f0d77aa8 2501 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2502 return tree;
2503
2504 parse_dup_op_espace:
3b0bdc72
UD
2505 *err = REG_ESPACE;
2506 return NULL;
3b0bdc72
UD
2507}
2508
2509/* Size of the names for collating symbol/equivalence_class/character_class.
2510 I'm not sure, but maybe enough. */
2511#define BRACKET_NAME_BUF_SIZE 32
2512
434d3784
UD
2513#ifndef _LIBC
2514 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2515 Build the range expression which starts from START_ELEM, and ends
2516 at END_ELEM. The result are written to MBCSET and SBCSET.
2517 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2518 mbcset->range_ends, is a pointer argument sinse we may
2519 update it. */
2520
2521static reg_errcode_t
c0a0f9a3
UD
2522# ifdef RE_ENABLE_I18N
2523build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
434d3784 2524 re_charset_t *mbcset;
434d3784 2525 int *range_alloc;
c0a0f9a3
UD
2526# else /* not RE_ENABLE_I18N */
2527build_range_exp (sbcset, start_elem, end_elem)
2528# endif /* not RE_ENABLE_I18N */
2529 re_bitset_ptr_t sbcset;
434d3784
UD
2530 bracket_elem_t *start_elem, *end_elem;
2531{
2532 unsigned int start_ch, end_ch;
2533 /* Equivalence Classes and Character Classes can't be a range start/end. */
2534 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2535 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2536 0))
434d3784
UD
2537 return REG_ERANGE;
2538
2539 /* We can handle no multi character collating elements without libc
2540 support. */
62439eac 2541 if (BE ((start_elem->type == COLL_SYM
15a7d175
UD
2542 && strlen ((char *) start_elem->opr.name) > 1)
2543 || (end_elem->type == COLL_SYM
2544 && strlen ((char *) end_elem->opr.name) > 1), 0))
434d3784
UD
2545 return REG_ECOLLATE;
2546
2547# ifdef RE_ENABLE_I18N
2548 {
2549 wchar_t wc, start_wc, end_wc;
2550 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2551
2552 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2553 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2554 : 0));
434d3784 2555 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2556 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2557 : 0));
434d3784 2558 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
15a7d175 2559 ? __btowc (start_ch) : start_elem->opr.wch);
434d3784 2560 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
15a7d175 2561 ? __btowc (end_ch) : end_elem->opr.wch);
434d3784
UD
2562 cmp_buf[0] = start_wc;
2563 cmp_buf[4] = end_wc;
2564 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2565 return REG_ERANGE;
2566
2567 /* Check the space of the arrays. */
951d6408 2568 if (BE (*range_alloc == mbcset->nranges, 0))
434d3784 2569 {
15a7d175
UD
2570 /* There are not enough space, need realloc. */
2571 wchar_t *new_array_start, *new_array_end;
2572 int new_nranges;
2573
2574 /* +1 in case of mbcset->nranges is 0. */
2575 new_nranges = 2 * mbcset->nranges + 1;
2576 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2577 are NULL if *range_alloc == 0. */
2578 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2579 new_nranges);
2580 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2581 new_nranges);
2582
2583 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2584 return REG_ESPACE;
2585
2586 mbcset->range_starts = new_array_start;
2587 mbcset->range_ends = new_array_end;
2588 *range_alloc = new_nranges;
434d3784
UD
2589 }
2590
2591 mbcset->range_starts[mbcset->nranges] = start_wc;
2592 mbcset->range_ends[mbcset->nranges++] = end_wc;
2593
2594 /* Build the table for single byte characters. */
2595 for (wc = 0; wc <= SBC_MAX; ++wc)
2596 {
15a7d175
UD
2597 cmp_buf[2] = wc;
2598 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2599 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2600 bitset_set (sbcset, wc);
434d3784
UD
2601 }
2602 }
2603# else /* not RE_ENABLE_I18N */
2604 {
2605 unsigned int ch;
2606 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2607 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2608 : 0));
434d3784 2609 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2610 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2611 : 0));
434d3784
UD
2612 if (start_ch > end_ch)
2613 return REG_ERANGE;
2614 /* Build the table for single byte characters. */
2615 for (ch = 0; ch <= SBC_MAX; ++ch)
2616 if (start_ch <= ch && ch <= end_ch)
15a7d175 2617 bitset_set (sbcset, ch);
434d3784
UD
2618 }
2619# endif /* not RE_ENABLE_I18N */
2620 return REG_NOERROR;
2621}
2622#endif /* not _LIBC */
2623
2624#ifndef _LIBC
2625/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2626 Build the collating element which is represented by NAME.
2627 The result are written to MBCSET and SBCSET.
2628 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2629 pointer argument since we may update it. */
2630
2631static reg_errcode_t
c0a0f9a3
UD
2632# ifdef RE_ENABLE_I18N
2633build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
434d3784 2634 re_charset_t *mbcset;
434d3784 2635 int *coll_sym_alloc;
c0a0f9a3
UD
2636# else /* not RE_ENABLE_I18N */
2637build_collating_symbol (sbcset, name)
2638# endif /* not RE_ENABLE_I18N */
2639 re_bitset_ptr_t sbcset;
62439eac 2640 const unsigned char *name;
434d3784 2641{
62439eac
UD
2642 size_t name_len = strlen ((const char *) name);
2643 if (BE (name_len != 1, 0))
434d3784
UD
2644 return REG_ECOLLATE;
2645 else
2646 {
62439eac 2647 bitset_set (sbcset, name[0]);
434d3784
UD
2648 return REG_NOERROR;
2649 }
2650}
2651#endif /* not _LIBC */
2652
3b0bdc72
UD
2653/* This function parse bracket expression like "[abc]", "[a-c]",
2654 "[[.a-a.]]" etc. */
2655
2656static bin_tree_t *
2657parse_bracket_exp (regexp, dfa, token, syntax, err)
2658 re_string_t *regexp;
2659 re_dfa_t *dfa;
2660 re_token_t *token;
2661 reg_syntax_t syntax;
2662 reg_errcode_t *err;
2663{
2664#ifdef _LIBC
62439eac
UD
2665 const unsigned char *collseqmb;
2666 const char *collseqwc;
3b0bdc72
UD
2667 uint32_t nrules;
2668 int32_t table_size;
2669 const int32_t *symb_table;
2670 const unsigned char *extra;
2671
434d3784 2672 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2673 Seek the collating symbol entry correspondings to NAME.
2674 Return the index of the symbol in the SYMB_TABLE. */
2675
2676 static inline int32_t
25337753 2677 __attribute ((always_inline))
3b0bdc72 2678 seek_collating_symbol_entry (name, name_len)
15a7d175
UD
2679 const unsigned char *name;
2680 size_t name_len;
3b0bdc72 2681 {
62439eac 2682 int32_t hash = elem_hash ((const char *) name, name_len);
3b0bdc72
UD
2683 int32_t elem = hash % table_size;
2684 int32_t second = hash % (table_size - 2);
2685 while (symb_table[2 * elem] != 0)
15a7d175
UD
2686 {
2687 /* First compare the hashing value. */
2688 if (symb_table[2 * elem] == hash
2689 /* Compare the length of the name. */
2690 && name_len == extra[symb_table[2 * elem + 1]]
2691 /* Compare the name. */
2692 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2693 name_len) == 0)
2694 {
2695 /* Yep, this is the entry. */
2696 break;
2697 }
2698
2699 /* Next entry. */
2700 elem += second;
2701 }
3b0bdc72
UD
2702 return elem;
2703 }
2704
434d3784 2705 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2706 Look up the collation sequence value of BR_ELEM.
2707 Return the value if succeeded, UINT_MAX otherwise. */
2708
2709 static inline unsigned int
25337753 2710 __attribute ((always_inline))
3b0bdc72 2711 lookup_collation_sequence_value (br_elem)
15a7d175 2712 bracket_elem_t *br_elem;
3b0bdc72
UD
2713 {
2714 if (br_elem->type == SB_CHAR)
15a7d175
UD
2715 {
2716 /*
2717 if (MB_CUR_MAX == 1)
2718 */
2719 if (nrules == 0)
2720 return collseqmb[br_elem->opr.ch];
2721 else
2722 {
2723 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2724 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2725 }
2726 }
3b0bdc72 2727 else if (br_elem->type == MB_CHAR)
15a7d175 2728 {
25337753 2729 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2730 }
3b0bdc72 2731 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2732 {
2733 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2734 if (nrules != 0)
2735 {
2736 int32_t elem, idx;
2737 elem = seek_collating_symbol_entry (br_elem->opr.name,
2738 sym_name_len);
2739 if (symb_table[2 * elem] != 0)
2740 {
2741 /* We found the entry. */
2742 idx = symb_table[2 * elem + 1];
2743 /* Skip the name of collating element name. */
2744 idx += 1 + extra[idx];
2745 /* Skip the byte sequence of the collating element. */
2746 idx += 1 + extra[idx];
2747 /* Adjust for the alignment. */
2748 idx = (idx + 3) & ~3;
2749 /* Skip the multibyte collation sequence value. */
2750 idx += sizeof (unsigned int);
2751 /* Skip the wide char sequence of the collating element. */
2752 idx += sizeof (unsigned int) *
2753 (1 + *(unsigned int *) (extra + idx));
2754 /* Return the collation sequence value. */
2755 return *(unsigned int *) (extra + idx);
2756 }
2757 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2758 {
2759 /* No valid character. Match it as a single byte
2760 character. */
2761 return collseqmb[br_elem->opr.name[0]];
2762 }
2763 }
2764 else if (sym_name_len == 1)
2765 return collseqmb[br_elem->opr.name[0]];
2766 }
3b0bdc72
UD
2767 return UINT_MAX;
2768 }
2769
434d3784 2770 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2771 Build the range expression which starts from START_ELEM, and ends
2772 at END_ELEM. The result are written to MBCSET and SBCSET.
2773 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2774 mbcset->range_ends, is a pointer argument sinse we may
2775 update it. */
2776
2777 static inline reg_errcode_t
25337753 2778 __attribute ((always_inline))
c0a0f9a3
UD
2779# ifdef RE_ENABLE_I18N
2780 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
15a7d175
UD
2781 re_charset_t *mbcset;
2782 int *range_alloc;
c0a0f9a3
UD
2783# else /* not RE_ENABLE_I18N */
2784 build_range_exp (sbcset, start_elem, end_elem)
2785# endif /* not RE_ENABLE_I18N */
15a7d175
UD
2786 re_bitset_ptr_t sbcset;
2787 bracket_elem_t *start_elem, *end_elem;
3b0bdc72
UD
2788 {
2789 unsigned int ch;
2790 uint32_t start_collseq;
2791 uint32_t end_collseq;
2792
c0a0f9a3 2793# ifdef RE_ENABLE_I18N
3b0bdc72 2794 /* Check the space of the arrays. */
951d6408 2795 if (BE (*range_alloc == mbcset->nranges, 0))
15a7d175
UD
2796 {
2797 /* There are not enough space, need realloc. */
2798 uint32_t *new_array_start;
2799 uint32_t *new_array_end;
3b0bdc72
UD
2800 int new_nranges;
2801
15a7d175
UD
2802 /* +1 in case of mbcset->nranges is 0. */
2803 new_nranges = 2 * mbcset->nranges + 1;
a9388965 2804 /* Use realloc since mbcset->range_starts and mbcset->range_ends
15a7d175
UD
2805 are NULL if *range_alloc == 0. */
2806 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2807 new_nranges);
2808 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2809 new_nranges);
a9388965 2810
15a7d175
UD
2811 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2812 return REG_ESPACE;
3b0bdc72 2813
15a7d175
UD
2814 mbcset->range_starts = new_array_start;
2815 mbcset->range_ends = new_array_end;
3b0bdc72 2816 *range_alloc = new_nranges;
15a7d175 2817 }
c0a0f9a3 2818# endif /* RE_ENABLE_I18N */
3b0bdc72 2819
434d3784 2820 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2821 start/end. */
bc15410e 2822 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2823 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2824 0))
2825 return REG_ERANGE;
3b0bdc72
UD
2826
2827 start_collseq = lookup_collation_sequence_value (start_elem);
2828 end_collseq = lookup_collation_sequence_value (end_elem);
2829 /* Check start/end collation sequence values. */
bc15410e 2830 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
15a7d175 2831 return REG_ECOLLATE;
bc15410e 2832 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
15a7d175 2833 return REG_ERANGE;
3b0bdc72 2834
c0a0f9a3 2835# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2836 /* Got valid collation sequence values, add them as a new entry. */
2837 mbcset->range_starts[mbcset->nranges] = start_collseq;
2838 mbcset->range_ends[mbcset->nranges++] = end_collseq;
c0a0f9a3 2839# endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2840
2841 /* Build the table for single byte characters. */
2842 for (ch = 0; ch <= SBC_MAX; ch++)
15a7d175
UD
2843 {
2844 uint32_t ch_collseq;
2845 /*
2846 if (MB_CUR_MAX == 1)
2847 */
2848 if (nrules == 0)
2849 ch_collseq = collseqmb[ch];
2850 else
25337753 2851 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
2852 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2853 bitset_set (sbcset, ch);
2854 }
3b0bdc72
UD
2855 return REG_NOERROR;
2856 }
3b0bdc72 2857
434d3784 2858 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2859 Build the collating element which is represented by NAME.
2860 The result are written to MBCSET and SBCSET.
2861 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2862 pointer argument sinse we may update it. */
2863
2864 static inline reg_errcode_t
25337753 2865 __attribute ((always_inline))
c0a0f9a3
UD
2866# ifdef RE_ENABLE_I18N
2867 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
15a7d175
UD
2868 re_charset_t *mbcset;
2869 int *coll_sym_alloc;
c0a0f9a3
UD
2870# else /* not RE_ENABLE_I18N */
2871 build_collating_symbol (sbcset, name)
2872# endif /* not RE_ENABLE_I18N */
15a7d175
UD
2873 re_bitset_ptr_t sbcset;
2874 const unsigned char *name;
3b0bdc72 2875 {
3b0bdc72 2876 int32_t elem, idx;
62439eac 2877 size_t name_len = strlen ((const char *) name);
3b0bdc72 2878 if (nrules != 0)
15a7d175
UD
2879 {
2880 elem = seek_collating_symbol_entry (name, name_len);
2881 if (symb_table[2 * elem] != 0)
2882 {
2883 /* We found the entry. */
2884 idx = symb_table[2 * elem + 1];
2885 /* Skip the name of collating element name. */
2886 idx += 1 + extra[idx];
2887 }
2888 else if (symb_table[2 * elem] == 0 && name_len == 1)
2889 {
2890 /* No valid character, treat it as a normal
2891 character. */
2892 bitset_set (sbcset, name[0]);
2893 return REG_NOERROR;
2894 }
2895 else
2896 return REG_ECOLLATE;
3b0bdc72 2897
c0a0f9a3 2898# ifdef RE_ENABLE_I18N
15a7d175
UD
2899 /* Got valid collation sequence, add it as a new entry. */
2900 /* Check the space of the arrays. */
951d6408 2901 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
15a7d175
UD
2902 {
2903 /* Not enough, realloc it. */
2904 /* +1 in case of mbcset->ncoll_syms is 0. */
951d6408 2905 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
15a7d175
UD
2906 /* Use realloc since mbcset->coll_syms is NULL
2907 if *alloc == 0. */
951d6408
UD
2908 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2909 new_coll_sym_alloc);
2910 if (BE (new_coll_syms == NULL, 0))
15a7d175 2911 return REG_ESPACE;
951d6408
UD
2912 mbcset->coll_syms = new_coll_syms;
2913 *coll_sym_alloc = new_coll_sym_alloc;
15a7d175
UD
2914 }
2915 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
c0a0f9a3 2916# endif /* RE_ENABLE_I18N */
15a7d175
UD
2917 return REG_NOERROR;
2918 }
3b0bdc72 2919 else
15a7d175
UD
2920 {
2921 if (BE (name_len != 1, 0))
2922 return REG_ECOLLATE;
2923 else
2924 {
2925 bitset_set (sbcset, name[0]);
2926 return REG_NOERROR;
2927 }
2928 }
3b0bdc72 2929 }
434d3784
UD
2930#endif
2931
3b0bdc72
UD
2932 re_token_t br_token;
2933 re_bitset_ptr_t sbcset;
c0a0f9a3 2934#ifdef RE_ENABLE_I18N
3b0bdc72 2935 re_charset_t *mbcset;
3b0bdc72
UD
2936 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2937 int equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3
UD
2938#else /* not RE_ENABLE_I18N */
2939 int non_match = 0;
2940#endif /* not RE_ENABLE_I18N */
2941 bin_tree_t *work_tree;
ee70274a 2942 int token_len;
8cae99db 2943 int first_round = 1;
3b0bdc72 2944#ifdef _LIBC
62439eac
UD
2945 collseqmb = (const unsigned char *)
2946 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
2947 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2948 if (nrules)
2949 {
2950 /*
2951 if (MB_CUR_MAX > 1)
2952 */
15a7d175 2953 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
2954 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2955 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 2956 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 2957 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 2958 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
2959 }
2960#endif
2961 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 2962#ifdef RE_ENABLE_I18N
3b0bdc72 2963 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
2964#endif /* RE_ENABLE_I18N */
2965#ifdef RE_ENABLE_I18N
bc15410e 2966 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
2967#else
2968 if (BE (sbcset == NULL, 0))
2969#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2970 {
2971 *err = REG_ESPACE;
2972 return NULL;
2973 }
2974
2975 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2976 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2977 {
3b0bdc72 2978 *err = REG_BADPAT;
434d3784 2979 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2980 }
2981 if (token->type == OP_NON_MATCH_LIST)
2982 {
c0a0f9a3 2983#ifdef RE_ENABLE_I18N
3b0bdc72 2984 mbcset->non_match = 1;
c0a0f9a3
UD
2985#else /* not RE_ENABLE_I18N */
2986 non_match = 1;
2987#endif /* not RE_ENABLE_I18N */
3b0bdc72 2988 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 2989 bitset_set (sbcset, '\0');
3b0bdc72
UD
2990 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2991 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2992 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
2993 {
2994 *err = REG_BADPAT;
2995 goto parse_bracket_exp_free_return;
2996 }
3b0bdc72
UD
2997 }
2998
2999 /* We treat the first ']' as a normal character. */
3000 if (token->type == OP_CLOSE_BRACKET)
3001 token->type = CHARACTER;
3002
3003 while (1)
3004 {
3005 bracket_elem_t start_elem, end_elem;
62439eac
UD
3006 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3007 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72
UD
3008 reg_errcode_t ret;
3009 int token_len2 = 0, is_range_exp = 0;
3010 re_token_t token2;
3011
3012 start_elem.opr.name = start_name_buf;
3013 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
78d8b07a 3014 syntax, first_round);
bc15410e 3015 if (BE (ret != REG_NOERROR, 0))
15a7d175
UD
3016 {
3017 *err = ret;
3018 goto parse_bracket_exp_free_return;
3019 }
78d8b07a 3020 first_round = 0;
3b0bdc72 3021
78d8b07a 3022 /* Get information about the next token. We need it in any case. */
3b0bdc72 3023 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a
UD
3024
3025 /* Do not check for ranges if we know they are not allowed. */
3026 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
15a7d175 3027 {
15a7d175
UD
3028 if (BE (token->type == END_OF_RE, 0))
3029 {
78d8b07a 3030 *err = REG_EBRACK;
15a7d175
UD
3031 goto parse_bracket_exp_free_return;
3032 }
78d8b07a 3033 if (token->type == OP_CHARSET_RANGE)
15a7d175 3034 {
78d8b07a
UD
3035 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3036 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3037 if (BE (token2.type == END_OF_RE, 0))
3038 {
3039 *err = REG_EBRACK;
3040 goto parse_bracket_exp_free_return;
3041 }
3042 if (token2.type == OP_CLOSE_BRACKET)
3043 {
3044 /* We treat the last '-' as a normal character. */
3045 re_string_skip_bytes (regexp, -token_len);
3046 token->type = CHARACTER;
3047 }
3048 else
3049 is_range_exp = 1;
15a7d175 3050 }
15a7d175 3051 }
3b0bdc72
UD
3052
3053 if (is_range_exp == 1)
15a7d175
UD
3054 {
3055 end_elem.opr.name = end_name_buf;
3056 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
78d8b07a 3057 dfa, syntax, 1);
15a7d175
UD
3058 if (BE (ret != REG_NOERROR, 0))
3059 {
3060 *err = ret;
3061 goto parse_bracket_exp_free_return;
3062 }
3063
3064 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a 3065
15a7d175 3066 *err = build_range_exp (sbcset,
c0a0f9a3 3067#ifdef RE_ENABLE_I18N
15a7d175 3068 mbcset, &range_alloc,
c0a0f9a3 3069#endif /* RE_ENABLE_I18N */
15a7d175
UD
3070 &start_elem, &end_elem);
3071 if (BE (*err != REG_NOERROR, 0))
3072 goto parse_bracket_exp_free_return;
3073 }
3b0bdc72 3074 else
15a7d175
UD
3075 {
3076 switch (start_elem.type)
3077 {
3078 case SB_CHAR:
3079 bitset_set (sbcset, start_elem.opr.ch);
3080 break;
c0a0f9a3 3081#ifdef RE_ENABLE_I18N
15a7d175
UD
3082 case MB_CHAR:
3083 /* Check whether the array has enough space. */
951d6408 3084 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
15a7d175 3085 {
951d6408 3086 wchar_t *new_mbchars;
15a7d175
UD
3087 /* Not enough, realloc it. */
3088 /* +1 in case of mbcset->nmbchars is 0. */
3089 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3090 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3091 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3092 mbchar_alloc);
3093 if (BE (new_mbchars == NULL, 0))
15a7d175 3094 goto parse_bracket_exp_espace;
951d6408 3095 mbcset->mbchars = new_mbchars;
15a7d175
UD
3096 }
3097 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3098 break;
c0a0f9a3 3099#endif /* RE_ENABLE_I18N */
15a7d175
UD
3100 case EQUIV_CLASS:
3101 *err = build_equiv_class (sbcset,
c0a0f9a3 3102#ifdef RE_ENABLE_I18N
15a7d175 3103 mbcset, &equiv_class_alloc,
c0a0f9a3 3104#endif /* RE_ENABLE_I18N */
3b0bdc72 3105 start_elem.opr.name);
15a7d175
UD
3106 if (BE (*err != REG_NOERROR, 0))
3107 goto parse_bracket_exp_free_return;
3108 break;
3109 case COLL_SYM:
3110 *err = build_collating_symbol (sbcset,
c0a0f9a3 3111#ifdef RE_ENABLE_I18N
15a7d175 3112 mbcset, &coll_sym_alloc,
c0a0f9a3 3113#endif /* RE_ENABLE_I18N */
3b0bdc72 3114 start_elem.opr.name);
15a7d175
UD
3115 if (BE (*err != REG_NOERROR, 0))
3116 goto parse_bracket_exp_free_return;
3117 break;
3118 case CHAR_CLASS:
66b110e8 3119 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3120#ifdef RE_ENABLE_I18N
7b7b9e70 3121 mbcset, &char_class_alloc,
c0a0f9a3 3122#endif /* RE_ENABLE_I18N */
7b7b9e70
UD
3123 start_elem.opr.name, syntax);
3124 if (BE (*err != REG_NOERROR, 0))
3125 goto parse_bracket_exp_free_return;
15a7d175
UD
3126 break;
3127 default:
3128 assert (0);
3129 break;
3130 }
3131 }
78d8b07a
UD
3132 if (BE (token->type == END_OF_RE, 0))
3133 {
3134 *err = REG_EBRACK;
3135 goto parse_bracket_exp_free_return;
3136 }
3b0bdc72 3137 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3138 break;
3b0bdc72
UD
3139 }
3140
3141 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3142
3143 /* If it is non-matching list. */
c0a0f9a3 3144#ifdef RE_ENABLE_I18N
3b0bdc72 3145 if (mbcset->non_match)
c0a0f9a3
UD
3146#else /* not RE_ENABLE_I18N */
3147 if (non_match)
3148#endif /* not RE_ENABLE_I18N */
3b0bdc72 3149 bitset_not (sbcset);
65e6becf
UD
3150#ifdef RE_ENABLE_I18N
3151 /* Ensure only single byte characters are set. */
3152 if (dfa->mb_cur_max > 1)
3153 bitset_mask (sbcset, dfa->sb_char);
3154#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3155
3156 /* Build a tree for simple bracket. */
3157 br_token.type = SIMPLE_BRACKET;
3158 br_token.opr.sbcset = sbcset;
c5c9c929 3159 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3160 if (BE (work_tree == NULL, 0))
3b0bdc72
UD
3161 goto parse_bracket_exp_espace;
3162
c0a0f9a3 3163#ifdef RE_ENABLE_I18N
3b0bdc72 3164 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3165 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3166 || mbcset->non_match)))
3b0bdc72
UD
3167 {
3168 re_token_t alt_token;
3169 bin_tree_t *mbc_tree;
ad7f28c2 3170 int sbc_idx;
3b0bdc72 3171 /* Build a tree for complex bracket. */
ad7f28c2 3172 dfa->has_mb_node = 1;
ad7f28c2
UD
3173 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3174 if (sbcset[sbc_idx])
3175 break;
3176 /* If there are no bits set in sbcset, there is no point
3177 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3178 if (sbc_idx == BITSET_UINTS)
3179 {
3180 re_free (sbcset);
ee70274a
UD
3181 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3182 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
ad7f28c2
UD
3183 return work_tree;
3184 }
3b0bdc72
UD
3185 br_token.type = COMPLEX_BRACKET;
3186 br_token.opr.mbcset = mbcset;
c5c9c929 3187 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3188 if (BE (mbc_tree == NULL, 0))
15a7d175 3189 goto parse_bracket_exp_espace;
3b0bdc72
UD
3190 /* Then join them by ALT node. */
3191 alt_token.type = OP_ALT;
97fd3a30 3192 dfa->has_plural_match = 1;
c5c9c929 3193 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
ee70274a 3194 if (BE (mbc_tree != NULL, 1))
15a7d175 3195 return work_tree;
3b0bdc72
UD
3196 }
3197 else
3198 {
3199 free_charset (mbcset);
3200 return work_tree;
3201 }
c0a0f9a3
UD
3202#else /* not RE_ENABLE_I18N */
3203 return work_tree;
3204#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3205
3206 parse_bracket_exp_espace:
3b0bdc72 3207 *err = REG_ESPACE;
434d3784
UD
3208 parse_bracket_exp_free_return:
3209 re_free (sbcset);
c0a0f9a3 3210#ifdef RE_ENABLE_I18N
434d3784 3211 free_charset (mbcset);
c0a0f9a3 3212#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3213 return NULL;
3214}
3215
434d3784
UD
3216/* Parse an element in the bracket expression. */
3217
3b0bdc72 3218static reg_errcode_t
78d8b07a
UD
3219parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3220 accept_hyphen)
3b0bdc72
UD
3221 bracket_elem_t *elem;
3222 re_string_t *regexp;
3223 re_token_t *token;
3224 int token_len;
3225 re_dfa_t *dfa;
3226 reg_syntax_t syntax;
78d8b07a 3227 int accept_hyphen;
3b0bdc72
UD
3228{
3229#ifdef RE_ENABLE_I18N
3230 int cur_char_size;
3231 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3232 if (cur_char_size > 1)
3233 {
3234 elem->type = MB_CHAR;
3235 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3236 re_string_skip_bytes (regexp, cur_char_size);
3237 return REG_NOERROR;
3238 }
3239#endif /* RE_ENABLE_I18N */
3240 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3241 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3242 || token->type == OP_OPEN_EQUIV_CLASS)
3243 return parse_bracket_symbol (elem, regexp, token);
78d8b07a
UD
3244 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3245 {
3246 /* A '-' must only appear as anything but a range indicator before
3247 the closing bracket. Everything else is an error. */
3248 re_token_t token2;
3249 (void) peek_token_bracket (&token2, regexp, syntax);
3250 if (token2.type != OP_CLOSE_BRACKET)
3251 /* The actual error value is not standardized since this whole
3252 case is undefined. But ERANGE makes good sense. */
3253 return REG_ERANGE;
3254 }
3b0bdc72
UD
3255 elem->type = SB_CHAR;
3256 elem->opr.ch = token->opr.c;
3257 return REG_NOERROR;
3258}
3259
434d3784
UD
3260/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3261 such as [:<character_class>:], [.<collating_element>.], and
3262 [=<equivalent_class>=]. */
3263
3b0bdc72
UD
3264static reg_errcode_t
3265parse_bracket_symbol (elem, regexp, token)
3266 bracket_elem_t *elem;
3267 re_string_t *regexp;
3268 re_token_t *token;
3269{
3270 unsigned char ch, delim = token->opr.c;
3271 int i = 0;
602c2f9d 3272 for (;; ++i)
3b0bdc72 3273 {
602c2f9d 3274 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3275 return REG_EBRACK;
3b0bdc72 3276 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3277 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3278 else
15a7d175 3279 ch = re_string_fetch_byte (regexp);
3b0bdc72 3280 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3281 break;
3b0bdc72
UD
3282 elem->opr.name[i] = ch;
3283 }
3284 re_string_skip_bytes (regexp, 1);
3285 elem->opr.name[i] = '\0';
3286 switch (token->type)
3287 {
3288 case OP_OPEN_COLL_ELEM:
3289 elem->type = COLL_SYM;
3290 break;
3291 case OP_OPEN_EQUIV_CLASS:
3292 elem->type = EQUIV_CLASS;
3293 break;
3294 case OP_OPEN_CHAR_CLASS:
3295 elem->type = CHAR_CLASS;
3296 break;
3297 default:
3298 break;
3299 }
3300 return REG_NOERROR;
3301}
3302
3303 /* Helper function for parse_bracket_exp.
3304 Build the equivalence class which is represented by NAME.
3305 The result are written to MBCSET and SBCSET.
3306 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3307 is a pointer argument sinse we may update it. */
3308
3309static reg_errcode_t
c0a0f9a3
UD
3310#ifdef RE_ENABLE_I18N
3311build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3b0bdc72 3312 re_charset_t *mbcset;
3b0bdc72 3313 int *equiv_class_alloc;
c0a0f9a3
UD
3314#else /* not RE_ENABLE_I18N */
3315build_equiv_class (sbcset, name)
3316#endif /* not RE_ENABLE_I18N */
3317 re_bitset_ptr_t sbcset;
62439eac 3318 const unsigned char *name;
3b0bdc72 3319{
c0a0f9a3 3320#if defined _LIBC && defined RE_ENABLE_I18N
3b0bdc72
UD
3321 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3322 if (nrules != 0)
3323 {
3324 const int32_t *table, *indirect;
3325 const unsigned char *weights, *extra, *cp;
62439eac 3326 unsigned char char_buf[2];
3b0bdc72
UD
3327 int32_t idx1, idx2;
3328 unsigned int ch;
3329 size_t len;
3330 /* This #include defines a local function! */
3331# include <locale/weight.h>
3332 /* Calculate the index for equivalence class. */
62439eac 3333 cp = name;
3b0bdc72
UD
3334 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3335 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3336 _NL_COLLATE_WEIGHTMB);
3337 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3338 _NL_COLLATE_EXTRAMB);
3b0bdc72 3339 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3340 _NL_COLLATE_INDIRECTMB);
3b0bdc72 3341 idx1 = findidx (&cp);
62439eac 3342 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
15a7d175
UD
3343 /* This isn't a valid character. */
3344 return REG_ECOLLATE;
3b0bdc72
UD
3345
3346 /* Build single byte matcing table for this equivalence class. */
62439eac 3347 char_buf[1] = (unsigned char) '\0';
3b0bdc72
UD
3348 len = weights[idx1];
3349 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3350 {
3351 char_buf[0] = ch;
3352 cp = char_buf;
3353 idx2 = findidx (&cp);
3b0bdc72 3354/*
15a7d175 3355 idx2 = table[ch];
3b0bdc72 3356*/
15a7d175
UD
3357 if (idx2 == 0)
3358 /* This isn't a valid character. */
3359 continue;
3360 if (len == weights[idx2])
3361 {
3362 int cnt = 0;
3363 while (cnt <= len &&
3364 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3365 ++cnt;
3366
3367 if (cnt > len)
3368 bitset_set (sbcset, ch);
3369 }
3370 }
a9388965 3371 /* Check whether the array has enough space. */
951d6408 3372 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
15a7d175
UD
3373 {
3374 /* Not enough, realloc it. */
3375 /* +1 in case of mbcset->nequiv_classes is 0. */
951d6408 3376 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
15a7d175 3377 /* Use realloc since the array is NULL if *alloc == 0. */
951d6408
UD
3378 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3379 int32_t,
3380 new_equiv_class_alloc);
3381 if (BE (new_equiv_classes == NULL, 0))
15a7d175 3382 return REG_ESPACE;
951d6408
UD
3383 mbcset->equiv_classes = new_equiv_classes;
3384 *equiv_class_alloc = new_equiv_class_alloc;
15a7d175 3385 }
3b0bdc72
UD
3386 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3387 }
3388 else
c0a0f9a3 3389#endif /* _LIBC && RE_ENABLE_I18N */
3b0bdc72 3390 {
62439eac 3391 if (BE (strlen ((const char *) name) != 1, 0))
15a7d175 3392 return REG_ECOLLATE;
62439eac 3393 bitset_set (sbcset, *name);
3b0bdc72
UD
3394 }
3395 return REG_NOERROR;
3396}
3397
3398 /* Helper function for parse_bracket_exp.
3399 Build the character class which is represented by NAME.
3400 The result are written to MBCSET and SBCSET.
3401 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3402 is a pointer argument sinse we may update it. */
3403
3404static reg_errcode_t
c0a0f9a3 3405#ifdef RE_ENABLE_I18N
66b110e8 3406build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3b0bdc72 3407 re_charset_t *mbcset;
3b0bdc72 3408 int *char_class_alloc;
c0a0f9a3 3409#else /* not RE_ENABLE_I18N */
66b110e8 3410build_charclass (trans, sbcset, class_name, syntax)
c0a0f9a3 3411#endif /* not RE_ENABLE_I18N */
66b110e8 3412 RE_TRANSLATE_TYPE trans;
c0a0f9a3 3413 re_bitset_ptr_t sbcset;
62439eac 3414 const unsigned char *class_name;
602c2f9d 3415 reg_syntax_t syntax;
3b0bdc72
UD
3416{
3417 int i;
62439eac 3418 const char *name = (const char *) class_name;
c0a0f9a3
UD
3419
3420 /* In case of REG_ICASE "upper" and "lower" match the both of
3421 upper and lower cases. */
3422 if ((syntax & RE_ICASE)
3423 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3424 name = "alpha";
3425
3426#ifdef RE_ENABLE_I18N
3b0bdc72 3427 /* Check the space of the arrays. */
951d6408 3428 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
a9388965
UD
3429 {
3430 /* Not enough, realloc it. */
3431 /* +1 in case of mbcset->nchar_classes is 0. */
951d6408 3432 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
a9388965 3433 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3434 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3435 new_char_class_alloc);
3436 if (BE (new_char_classes == NULL, 0))
15a7d175 3437 return REG_ESPACE;
951d6408
UD
3438 mbcset->char_classes = new_char_classes;
3439 *char_class_alloc = new_char_class_alloc;
a9388965 3440 }
3b0bdc72 3441 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3442#endif /* RE_ENABLE_I18N */
3b0bdc72 3443
66b110e8
UD
3444#define BUILD_CHARCLASS_LOOP(ctype_func) \
3445 for (i = 0; i < SBC_MAX; ++i) \
3446 { \
3447 if (ctype_func (i)) \
3448 { \
3449 int ch = trans ? trans[i] : i; \
3450 bitset_set (sbcset, ch); \
3451 } \
3b0bdc72
UD
3452 }
3453
3454 if (strcmp (name, "alnum") == 0)
3455 BUILD_CHARCLASS_LOOP (isalnum)
3456 else if (strcmp (name, "cntrl") == 0)
3457 BUILD_CHARCLASS_LOOP (iscntrl)
3458 else if (strcmp (name, "lower") == 0)
3459 BUILD_CHARCLASS_LOOP (islower)
3460 else if (strcmp (name, "space") == 0)
3461 BUILD_CHARCLASS_LOOP (isspace)
3462 else if (strcmp (name, "alpha") == 0)
3463 BUILD_CHARCLASS_LOOP (isalpha)
3464 else if (strcmp (name, "digit") == 0)
3465 BUILD_CHARCLASS_LOOP (isdigit)
3466 else if (strcmp (name, "print") == 0)
3467 BUILD_CHARCLASS_LOOP (isprint)
3468 else if (strcmp (name, "upper") == 0)
3469 BUILD_CHARCLASS_LOOP (isupper)
3470 else if (strcmp (name, "blank") == 0)
3471 BUILD_CHARCLASS_LOOP (isblank)
3472 else if (strcmp (name, "graph") == 0)
3473 BUILD_CHARCLASS_LOOP (isgraph)
3474 else if (strcmp (name, "punct") == 0)
3475 BUILD_CHARCLASS_LOOP (ispunct)
3476 else if (strcmp (name, "xdigit") == 0)
3477 BUILD_CHARCLASS_LOOP (isxdigit)
3478 else
3479 return REG_ECTYPE;
3480
3481 return REG_NOERROR;
3482}
3483
3484static bin_tree_t *
e2b6bfa3 3485build_charclass_op (dfa, trans, class_name, extra, not, err)
3b0bdc72 3486 re_dfa_t *dfa;
66b110e8 3487 RE_TRANSLATE_TYPE trans;
e2b6bfa3
UD
3488 const unsigned char *class_name;
3489 const unsigned char *extra;
3b0bdc72
UD
3490 int not;
3491 reg_errcode_t *err;
3492{
3493 re_bitset_ptr_t sbcset;
c0a0f9a3 3494#ifdef RE_ENABLE_I18N
3b0bdc72 3495 re_charset_t *mbcset;
c0a0f9a3
UD
3496 int alloc = 0;
3497#else /* not RE_ENABLE_I18N */
3498 int non_match = 0;
3499#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3500 reg_errcode_t ret;
3501 re_token_t br_token;
3502 bin_tree_t *tree;
3b0bdc72
UD
3503
3504 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 3505#ifdef RE_ENABLE_I18N
3b0bdc72 3506 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3507#endif /* RE_ENABLE_I18N */
3508
3509#ifdef RE_ENABLE_I18N
bc15410e 3510 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3511#else /* not RE_ENABLE_I18N */
3512 if (BE (sbcset == NULL, 0))
3513#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3514 {
3515 *err = REG_ESPACE;
3516 return NULL;
3517 }
3518
3519 if (not)
3520 {
c0a0f9a3 3521#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3522 /*
3523 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 3524 bitset_set(cset->sbcset, '\0');
3b0bdc72 3525 */
c0a0f9a3 3526 mbcset->non_match = 1;
c0a0f9a3
UD
3527#else /* not RE_ENABLE_I18N */
3528 non_match = 1;
3529#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3530 }
3531
602c2f9d 3532 /* We don't care the syntax in this case. */
66b110e8 3533 ret = build_charclass (trans, sbcset,
c0a0f9a3 3534#ifdef RE_ENABLE_I18N
15a7d175 3535 mbcset, &alloc,
c0a0f9a3 3536#endif /* RE_ENABLE_I18N */
e2b6bfa3 3537 class_name, 0);
c0a0f9a3 3538
bc15410e 3539 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3540 {
3541 re_free (sbcset);
c0a0f9a3 3542#ifdef RE_ENABLE_I18N
3b0bdc72 3543 free_charset (mbcset);
c0a0f9a3 3544#endif /* RE_ENABLE_I18N */
7b7b9e70 3545 *err = ret;
3b0bdc72
UD
3546 return NULL;
3547 }
434d3784 3548 /* \w match '_' also. */
e2b6bfa3
UD
3549 for (; *extra; extra++)
3550 bitset_set (sbcset, *extra);
3b0bdc72
UD
3551
3552 /* If it is non-matching list. */
c0a0f9a3 3553#ifdef RE_ENABLE_I18N
3b0bdc72 3554 if (mbcset->non_match)
c0a0f9a3
UD
3555#else /* not RE_ENABLE_I18N */
3556 if (non_match)
3557#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3558 bitset_not (sbcset);
3559
65e6becf
UD
3560#ifdef RE_ENABLE_I18N
3561 /* Ensure only single byte characters are set. */
3562 if (dfa->mb_cur_max > 1)
3563 bitset_mask (sbcset, dfa->sb_char);
3564#endif
3565
3b0bdc72
UD
3566 /* Build a tree for simple bracket. */
3567 br_token.type = SIMPLE_BRACKET;
3568 br_token.opr.sbcset = sbcset;
c5c9c929 3569 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3570 if (BE (tree == NULL, 0))
3b0bdc72
UD
3571 goto build_word_op_espace;
3572
c0a0f9a3 3573#ifdef RE_ENABLE_I18N
3c0fb574 3574 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
3575 {
3576 re_token_t alt_token;
3577 bin_tree_t *mbc_tree;
3578 /* Build a tree for complex bracket. */
3579 br_token.type = COMPLEX_BRACKET;
3580 br_token.opr.mbcset = mbcset;
3581 dfa->has_mb_node = 1;
c5c9c929 3582 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3583 if (BE (mbc_tree == NULL, 0))
15a7d175 3584 goto build_word_op_espace;
3b0bdc72
UD
3585 /* Then join them by ALT node. */
3586 alt_token.type = OP_ALT;
97fd3a30 3587 dfa->has_plural_match = 1;
c5c9c929 3588 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
ee70274a 3589 if (BE (mbc_tree != NULL, 1))
15a7d175 3590 return tree;
3b0bdc72
UD
3591 }
3592 else
3593 {
3594 free_charset (mbcset);
3595 return tree;
3596 }
c0a0f9a3
UD
3597#else /* not RE_ENABLE_I18N */
3598 return tree;
3599#endif /* not RE_ENABLE_I18N */
3600
3b0bdc72
UD
3601 build_word_op_espace:
3602 re_free (sbcset);
c0a0f9a3 3603#ifdef RE_ENABLE_I18N
3b0bdc72 3604 free_charset (mbcset);
c0a0f9a3 3605#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3606 *err = REG_ESPACE;
3607 return NULL;
3608}
3609
3610/* This is intended for the expressions like "a{1,3}".
3611 Fetch a number from `input', and return the number.
3612 Return -1, if the number field is empty like "{,1}".
3613 Return -2, If an error is occured. */
3614
3615static int
3616fetch_number (input, token, syntax)
3617 re_string_t *input;
3618 re_token_t *token;
3619 reg_syntax_t syntax;
3620{
3621 int num = -1;
3622 unsigned char c;
3623 while (1)
3624 {
f0d77aa8 3625 fetch_token (token, input, syntax);
3b0bdc72 3626 c = token->opr.c;
602c2f9d 3627 if (BE (token->type == END_OF_RE, 0))
15a7d175 3628 return -2;
3b0bdc72 3629 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3630 break;
602c2f9d 3631 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
15a7d175 3632 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
602c2f9d 3633 num = (num > RE_DUP_MAX) ? -2 : num;
3b0bdc72 3634 }
3b0bdc72
UD
3635 return num;
3636}
3637\f
c0a0f9a3 3638#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3639static void
3640free_charset (re_charset_t *cset)
3641{
3642 re_free (cset->mbchars);
c0a0f9a3 3643# ifdef _LIBC
3b0bdc72
UD
3644 re_free (cset->coll_syms);
3645 re_free (cset->equiv_classes);
3646 re_free (cset->range_starts);
3647 re_free (cset->range_ends);
c0a0f9a3 3648# endif
3b0bdc72
UD
3649 re_free (cset->char_classes);
3650 re_free (cset);
3651}
c0a0f9a3 3652#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3653\f
3654/* Functions for binary tree operation. */
3655
ee70274a 3656/* Create a tree node. */
3b0bdc72
UD
3657
3658static bin_tree_t *
ee70274a
UD
3659create_tree (dfa, left, right, type, index)
3660 re_dfa_t *dfa;
3b0bdc72
UD
3661 bin_tree_t *left;
3662 bin_tree_t *right;
3663 re_token_type_t type;
3664 int index;
3665{
3666 bin_tree_t *tree;
ee70274a 3667 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3b0bdc72 3668 {
ee70274a
UD
3669 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3670
3671 if (storage == NULL)
3672 return NULL;
3673 storage->next = dfa->str_tree_storage;
3674 dfa->str_tree_storage = storage;
3675 dfa->str_tree_storage_idx = 0;
3b0bdc72 3676 }
ee70274a
UD
3677 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3678
3b0bdc72
UD
3679 tree->parent = NULL;
3680 tree->left = left;
3681 tree->right = right;
3682 tree->type = type;
3683 tree->node_idx = index;
3684 tree->first = -1;
3685 tree->next = -1;
3686 re_node_set_init_empty (&tree->eclosure);
3687
3688 if (left != NULL)
3689 left->parent = tree;
3690 if (right != NULL)
3691 right->parent = tree;
3692 return tree;
3693}
3694
ee70274a 3695/* Create both a DFA node and a tree for it. */
3b0bdc72 3696
ee70274a
UD
3697static bin_tree_t *
3698re_dfa_add_tree_node (dfa, left, right, token)
3699 re_dfa_t *dfa;
3700 bin_tree_t *left;
3701 bin_tree_t *right;
c5c9c929 3702 const re_token_t *token;
f0d77aa8 3703{
c5c9c929 3704 int new_idx = re_dfa_add_node (dfa, *token, 0);
ee70274a
UD
3705
3706 if (new_idx == -1)
3707 return NULL;
3708
3709 return create_tree (dfa, left, right, 0, new_idx);
3b0bdc72
UD
3710}
3711
6b6557e8
UD
3712/* Mark the tree SRC as an optional subexpression. */
3713
3714static void
3715mark_opt_subexp (src, dfa)
3716 const bin_tree_t *src;
3717 re_dfa_t *dfa;
3718{
3719 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3720 a subexpression. */
3721 if (src->type == CONCAT
3722 && src->left->type == NON_TYPE
3723 && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3724 mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3725}
3726
3727
3728/* Recursive tree walker for mark_opt_subexp. */
3729
3730static void
3731mark_opt_subexp_iter (src, dfa, idx)
3732 const bin_tree_t *src;
3733 re_dfa_t *dfa;
3734{
3735 int node_idx;
3736
3737 if (src->type == NON_TYPE)
3738 {
3739 node_idx = src->node_idx;
3740 if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3741 || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3742 && dfa->nodes[node_idx].opr.idx == idx)
3743 dfa->nodes[node_idx].opt_subexp = 1;
3744 }
3745
3746 if (src->left != NULL)
3747 mark_opt_subexp_iter (src->left, dfa, idx);
3748
3749 if (src->right != NULL)
3750 mark_opt_subexp_iter (src->right, dfa, idx);
3751}
3752
ee70274a 3753
3b0bdc72
UD
3754/* Duplicate the node SRC, and return new node. */
3755
3756static bin_tree_t *
3757duplicate_tree (src, dfa)
3758 const bin_tree_t *src;
3759 re_dfa_t *dfa;
3760{
3761 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3762 int new_node_idx;
3763 /* Since node indies must be according to Post-order of the tree,
3764 we must duplicate the left at first. */
3765 if (src->left != NULL)
3766 {
3767 left = duplicate_tree (src->left, dfa);
3768 if (left == NULL)
15a7d175 3769 return NULL;
3b0bdc72
UD
3770 }
3771
3772 /* Secondaly, duplicate the right. */
3773 if (src->right != NULL)
3774 {
3775 right = duplicate_tree (src->right, dfa);
3776 if (right == NULL)
ee70274a 3777 return NULL;
3b0bdc72
UD
3778 }
3779
3780 /* At last, duplicate itself. */
3781 if (src->type == NON_TYPE)
3782 {
3783 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3784 dfa->nodes[new_node_idx].duplicated = 1;
bc15410e 3785 if (BE (new_node_idx == -1, 0))
ee70274a 3786 return NULL;
3b0bdc72
UD
3787 }
3788 else
3789 new_node_idx = src->type;
3790
ee70274a 3791 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3b0bdc72
UD
3792 return new_tree;
3793}