]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
value correctly. Patch by Mitsuru Kanda <mk@karaba.org>.
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
56b168be 2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
3b0bdc72 21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
15a7d175 22 int length, reg_syntax_t syntax);
3b0bdc72 23static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
24 const re_dfastate_t *init_state,
25 char *fastmap);
3b0bdc72 26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
56b168be 27static void init_word_char (re_dfa_t *dfa);
c0a0f9a3 28#ifdef RE_ENABLE_I18N
3b0bdc72 29static void free_charset (re_charset_t *cset);
c0a0f9a3 30#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
3b0bdc72
UD
36static reg_errcode_t analyze (re_dfa_t *dfa);
37static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
485d775d 41static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
15a7d175
UD
42 int top_clone_node, int root_node,
43 unsigned int constraint);
a9388965 44static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
15a7d175 45 unsigned int constraint);
a7d5c291
UD
46static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
3b0bdc72 48static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 49static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
15a7d175 50 int node, int root);
3b0bdc72
UD
51static void calc_inveclosure (re_dfa_t *dfa);
52static int fetch_number (re_string_t *input, re_token_t *token,
15a7d175 53 reg_syntax_t syntax);
f0d77aa8
UD
54static void fetch_token (re_token_t *result, re_string_t *input,
55 reg_syntax_t syntax);
3b0bdc72 56static int peek_token (re_token_t *token, re_string_t *input,
15a7d175 57 reg_syntax_t syntax);
3b0bdc72 58static int peek_token_bracket (re_token_t *token, re_string_t *input,
15a7d175 59 reg_syntax_t syntax);
3b0bdc72 60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 61 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
3b0bdc72 65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175
UD
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
3b0bdc72 68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175
UD
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
3b0bdc72 71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
3b0bdc72 74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
3b0bdc72 80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
78d8b07a
UD
84 reg_syntax_t syntax,
85 int accept_hyphen);
3b0bdc72 86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
87 re_string_t *regexp,
88 re_token_t *token);
434d3784 89#ifndef _LIBC
c0a0f9a3
UD
90# ifdef RE_ENABLE_I18N
91static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
92 re_charset_t *mbcset, int *range_alloc,
93 bracket_elem_t *start_elem,
94 bracket_elem_t *end_elem);
c0a0f9a3 95static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175
UD
96 re_charset_t *mbcset,
97 int *coll_sym_alloc,
98 const unsigned char *name);
c0a0f9a3
UD
99# else /* not RE_ENABLE_I18N */
100static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
c0a0f9a3 103static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175 104 const unsigned char *name);
c0a0f9a3 105# endif /* not RE_ENABLE_I18N */
434d3784 106#endif /* not _LIBC */
c0a0f9a3
UD
107#ifdef RE_ENABLE_I18N
108static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175
UD
109 re_charset_t *mbcset,
110 int *equiv_class_alloc,
111 const unsigned char *name);
86576b62 112static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
66b110e8 113 re_bitset_ptr_t sbcset,
15a7d175
UD
114 re_charset_t *mbcset,
115 int *char_class_alloc,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
c0a0f9a3
UD
118#else /* not RE_ENABLE_I18N */
119static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175 120 const unsigned char *name);
86576b62 121static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
66b110e8 122 re_bitset_ptr_t sbcset,
15a7d175
UD
123 const unsigned char *class_name,
124 reg_syntax_t syntax);
c0a0f9a3 125#endif /* not RE_ENABLE_I18N */
86576b62
UD
126static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
127 unsigned RE_TRANSLATE_TYPE trans,
06e8303a 128 const unsigned char *class_name,
10677727
UD
129 const unsigned char *extra,
130 int non_match, reg_errcode_t *err);
ee70274a
UD
131static bin_tree_t *create_tree (re_dfa_t *dfa,
132 bin_tree_t *left, bin_tree_t *right,
15a7d175 133 re_token_type_t type, int index);
ee70274a
UD
134static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
135 bin_tree_t *left, bin_tree_t *right,
c5c9c929 136 const re_token_t *token)
ee70274a 137 __attribute ((noinline));
3b0bdc72 138static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
6b6557e8
UD
139static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
140static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
3b0bdc72
UD
141\f
142/* This table gives an error message for each of the error codes listed
143 in regex.h. Obviously the order here has to be same as there.
144 POSIX doesn't require that we do anything for REG_NOERROR,
145 but why not be nice? */
146
6455d255 147const char __re_error_msgid[] attribute_hidden =
3b0bdc72
UD
148 {
149#define REG_NOERROR_IDX 0
150 gettext_noop ("Success") /* REG_NOERROR */
151 "\0"
152#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
153 gettext_noop ("No match") /* REG_NOMATCH */
154 "\0"
155#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
156 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
157 "\0"
158#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
159 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
160 "\0"
161#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
162 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
163 "\0"
164#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
165 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
166 "\0"
167#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
168 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
169 "\0"
170#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
171 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
172 "\0"
173#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
174 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
175 "\0"
176#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
177 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
178 "\0"
179#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
180 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
181 "\0"
182#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
183 gettext_noop ("Invalid range end") /* REG_ERANGE */
184 "\0"
185#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
186 gettext_noop ("Memory exhausted") /* REG_ESPACE */
187 "\0"
188#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
189 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
190 "\0"
191#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
192 gettext_noop ("Premature end of regular expression") /* REG_EEND */
193 "\0"
194#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
195 gettext_noop ("Regular expression too big") /* REG_ESIZE */
196 "\0"
197#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
198 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
199 };
200
6455d255 201const size_t __re_error_msgid_idx[] attribute_hidden =
3b0bdc72
UD
202 {
203 REG_NOERROR_IDX,
204 REG_NOMATCH_IDX,
205 REG_BADPAT_IDX,
206 REG_ECOLLATE_IDX,
207 REG_ECTYPE_IDX,
208 REG_EESCAPE_IDX,
209 REG_ESUBREG_IDX,
210 REG_EBRACK_IDX,
211 REG_EPAREN_IDX,
212 REG_EBRACE_IDX,
213 REG_BADBR_IDX,
214 REG_ERANGE_IDX,
215 REG_ESPACE_IDX,
216 REG_BADRPT_IDX,
217 REG_EEND_IDX,
218 REG_ESIZE_IDX,
219 REG_ERPAREN_IDX
220 };
221\f
222/* Entry points for GNU code. */
223
224/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 225 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
226 Returns 0 if the pattern was valid, otherwise an error string.
227
228 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
229 are set in BUFP on entry. */
230
231const char *
232re_compile_pattern (pattern, length, bufp)
233 const char *pattern;
234 size_t length;
235 struct re_pattern_buffer *bufp;
236{
237 reg_errcode_t ret;
238
3b0bdc72
UD
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
241 setting no_sub. */
242 bufp->no_sub = 0;
243
244 /* Match anchors at newline. */
245 bufp->newline_anchor = 1;
246
75e4a282 247 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
248
249 if (!ret)
250 return NULL;
6455d255 251 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
252}
253#ifdef _LIBC
254weak_alias (__re_compile_pattern, re_compile_pattern)
255#endif
256
257/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260/* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262reg_syntax_t re_syntax_options;
263
264
265/* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
268
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
271
272reg_syntax_t
273re_set_syntax (syntax)
274 reg_syntax_t syntax;
275{
276 reg_syntax_t ret = re_syntax_options;
277
278 re_syntax_options = syntax;
279 return ret;
280}
281#ifdef _LIBC
282weak_alias (__re_set_syntax, re_set_syntax)
283#endif
284
285int
286re_compile_fastmap (bufp)
287 struct re_pattern_buffer *bufp;
288{
289 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
290 char *fastmap = bufp->fastmap;
291
292 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
293 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
294 if (dfa->init_state != dfa->init_state_word)
295 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
296 if (dfa->init_state != dfa->init_state_nl)
297 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
298 if (dfa->init_state != dfa->init_state_begbuf)
299 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
300 bufp->fastmap_accurate = 1;
301 return 0;
302}
303#ifdef _LIBC
304weak_alias (__re_compile_fastmap, re_compile_fastmap)
305#endif
306
1b2c2628 307static inline void
25337753 308__attribute ((always_inline))
1b2c2628
UD
309re_set_fastmap (char *fastmap, int icase, int ch)
310{
311 fastmap[ch] = 1;
312 if (icase)
313 fastmap[tolower (ch)] = 1;
314}
315
3b0bdc72
UD
316/* Helper function for re_compile_fastmap.
317 Compile fastmap for the initial_state INIT_STATE. */
318
319static void
320re_compile_fastmap_iter (bufp, init_state, fastmap)
321 regex_t *bufp;
322 const re_dfastate_t *init_state;
323 char *fastmap;
324{
325 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
326 int node_cnt;
3c0fb574 327 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
328 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
329 {
330 int node = init_state->nodes.elems[node_cnt];
331 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
332
333 if (type == CHARACTER)
74e12fbc
UD
334 {
335 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
336#ifdef RE_ENABLE_I18N
14744156 337 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 338 {
3c0fb574 339 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
74e12fbc
UD
340 wchar_t wc;
341 mbstate_t state;
342
343 p = buf;
344 *p++ = dfa->nodes[node].opr.c;
345 while (++node < dfa->nodes_len
346 && dfa->nodes[node].type == CHARACTER
347 && dfa->nodes[node].mb_partial)
348 *p++ = dfa->nodes[node].opr.c;
349 memset (&state, 0, sizeof (state));
350 if (mbrtowc (&wc, (const char *) buf, p - buf,
351 &state) == p - buf
352 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
353 re_set_fastmap (fastmap, 0, buf[0]);
354 }
355#endif
356 }
3b0bdc72 357 else if (type == SIMPLE_BRACKET)
15a7d175
UD
358 {
359 int i, j, ch;
360 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
361 for (j = 0; j < UINT_BITS; ++j, ++ch)
362 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
363 re_set_fastmap (fastmap, icase, ch);
364 }
c0a0f9a3 365#ifdef RE_ENABLE_I18N
3b0bdc72 366 else if (type == COMPLEX_BRACKET)
15a7d175
UD
367 {
368 int i;
369 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
370 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
371 || cset->nranges || cset->nchar_classes)
372 {
c0a0f9a3 373# ifdef _LIBC
15a7d175
UD
374 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
375 {
376 /* In this case we want to catch the bytes which are
377 the first byte of any collation elements.
378 e.g. In da_DK, we want to catch 'a' since "aa"
379 is a valid collation element, and don't catch
380 'b' since 'b' is the only collation element
381 which starts from 'b'. */
382 int j, ch;
383 const int32_t *table = (const int32_t *)
384 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
385 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
386 for (j = 0; j < UINT_BITS; ++j, ++ch)
387 if (table[ch] < 0)
388 re_set_fastmap (fastmap, icase, ch);
389 }
c0a0f9a3 390# else
3c0fb574 391 if (dfa->mb_cur_max > 1)
15a7d175
UD
392 for (i = 0; i < SBC_MAX; ++i)
393 if (__btowc (i) == WEOF)
394 re_set_fastmap (fastmap, icase, i);
c0a0f9a3 395# endif /* not _LIBC */
15a7d175
UD
396 }
397 for (i = 0; i < cset->nmbchars; ++i)
398 {
399 char buf[256];
400 mbstate_t state;
401 memset (&state, '\0', sizeof (state));
402 __wcrtomb (buf, cset->mbchars[i], &state);
403 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
14744156 404 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc
UD
405 {
406 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
407 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
408 }
15a7d175
UD
409 }
410 }
c0a0f9a3 411#endif /* RE_ENABLE_I18N */
c0d5034e
UD
412 else if (type == OP_PERIOD
413#ifdef RE_ENABLE_I18N
414 || type == OP_UTF8_PERIOD
415#endif /* RE_ENABLE_I18N */
416 || type == END_OF_RE)
15a7d175
UD
417 {
418 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
419 if (type == END_OF_RE)
420 bufp->can_be_null = 1;
421 return;
422 }
3b0bdc72
UD
423 }
424}
425\f
426/* Entry point for POSIX code. */
427/* regcomp takes a regular expression as a string and compiles it.
428
429 PREG is a regex_t *. We do not expect any fields to be initialized,
430 since POSIX says we shouldn't. Thus, we set
431
432 `buffer' to the compiled pattern;
433 `used' to the length of the compiled pattern;
434 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
435 REG_EXTENDED bit in CFLAGS is set; otherwise, to
436 RE_SYNTAX_POSIX_BASIC;
437 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
438 `fastmap' to an allocated space for the fastmap;
439 `fastmap_accurate' to zero;
440 `re_nsub' to the number of subexpressions in PATTERN.
441
442 PATTERN is the address of the pattern string.
443
444 CFLAGS is a series of bits which affect compilation.
445
446 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
447 use POSIX basic syntax.
448
449 If REG_NEWLINE is set, then . and [^...] don't match newline.
450 Also, regexec will try a match beginning after every newline.
451
452 If REG_ICASE is set, then we considers upper- and lowercase
453 versions of letters to be equivalent when matching.
454
455 If REG_NOSUB is set, then when PREG is passed to regexec, that
456 routine will report only success or failure, and nothing about the
457 registers.
458
459 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
460 the return codes and their meanings.) */
461
462int
463regcomp (preg, pattern, cflags)
75e4a282
UD
464 regex_t *__restrict preg;
465 const char *__restrict pattern;
3b0bdc72
UD
466 int cflags;
467{
468 reg_errcode_t ret;
469 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 470 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
471
472 preg->buffer = NULL;
473 preg->allocated = 0;
474 preg->used = 0;
475
476 /* Try to allocate space for the fastmap. */
477 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 478 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
479 return REG_ESPACE;
480
481 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
482
483 /* If REG_NEWLINE is set, newlines are treated differently. */
484 if (cflags & REG_NEWLINE)
485 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
486 syntax &= ~RE_DOT_NEWLINE;
487 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
488 /* It also changes the matching behavior. */
489 preg->newline_anchor = 1;
490 }
491 else
492 preg->newline_anchor = 0;
493 preg->no_sub = !!(cflags & REG_NOSUB);
494 preg->translate = NULL;
495
496 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
497
498 /* POSIX doesn't distinguish between an unmatched open-group and an
499 unmatched close-group: both are REG_EPAREN. */
500 if (ret == REG_ERPAREN)
501 ret = REG_EPAREN;
502
a9388965 503 /* We have already checked preg->fastmap != NULL. */
bc15410e 504 if (BE (ret == REG_NOERROR, 1))
71ccd330 505 /* Compute the fastmap now, since regexec cannot modify the pattern
86576b62 506 buffer. This function never fails in this implementation. */
83b038f2 507 (void) re_compile_fastmap (preg);
71ccd330 508 else
3b0bdc72 509 {
71ccd330
UD
510 /* Some error occurred while compiling the expression. */
511 re_free (preg->fastmap);
512 preg->fastmap = NULL;
3b0bdc72
UD
513 }
514
515 return (int) ret;
516}
517#ifdef _LIBC
518weak_alias (__regcomp, regcomp)
519#endif
520
521/* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
523
524size_t
525regerror (errcode, preg, errbuf, errbuf_size)
526 int errcode;
527 const regex_t *preg;
528 char *errbuf;
529 size_t errbuf_size;
530{
531 const char *msg;
532 size_t msg_size;
533
bc15410e 534 if (BE (errcode < 0
15a7d175
UD
535 || errcode >= (int) (sizeof (__re_error_msgid_idx)
536 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
541 abort ();
542
6455d255 543 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
544
545 msg_size = strlen (msg) + 1; /* Includes the null. */
546
bc15410e 547 if (BE (errbuf_size != 0, 1))
3b0bdc72 548 {
bc15410e 549 if (BE (msg_size > errbuf_size, 0))
15a7d175 550 {
3b0bdc72
UD
551#if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
553#else
15a7d175
UD
554 memcpy (errbuf, msg, errbuf_size - 1);
555 errbuf[errbuf_size - 1] = 0;
3b0bdc72 556#endif
15a7d175 557 }
3b0bdc72 558 else
15a7d175 559 memcpy (errbuf, msg, msg_size);
3b0bdc72
UD
560 }
561
562 return msg_size;
563}
564#ifdef _LIBC
565weak_alias (__regerror, regerror)
566#endif
567
3b0bdc72 568
e40a38b3
UD
569#ifdef RE_ENABLE_I18N
570/* This static array is used for the map to single-byte characters when
571 UTF-8 is used. Otherwise we would allocate memory just to initialize
572 it the same all the time. UTF-8 is the preferred encoding so this is
573 a worthwhile optimization. */
574static const bitset utf8_sb_map =
575{
576 /* Set the first 128 bits. */
577# if UINT_MAX == 0xffffffff
578 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
579# else
580# error "Add case for new unsigned int size"
581# endif
582};
583#endif
584
585
71ccd330
UD
586static void
587free_dfa_content (re_dfa_t *dfa)
3b0bdc72
UD
588{
589 int i, j;
3b0bdc72 590
71ccd330
UD
591 re_free (dfa->subexps);
592
ee70274a
UD
593 if (dfa->nodes)
594 for (i = 0; i < dfa->nodes_len; ++i)
595 {
596 re_token_t *node = dfa->nodes + i;
c0a0f9a3 597#ifdef RE_ENABLE_I18N
ee70274a
UD
598 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
599 free_charset (node->opr.mbcset);
600 else
c0a0f9a3 601#endif /* RE_ENABLE_I18N */
ee70274a
UD
602 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
603 re_free (node->opr.sbcset);
604 }
71ccd330
UD
605 re_free (dfa->nexts);
606 for (i = 0; i < dfa->nodes_len; ++i)
607 {
608 if (dfa->eclosures != NULL)
609 re_node_set_free (dfa->eclosures + i);
610 if (dfa->inveclosures != NULL)
611 re_node_set_free (dfa->inveclosures + i);
612 if (dfa->edests != NULL)
613 re_node_set_free (dfa->edests + i);
614 }
615 re_free (dfa->edests);
616 re_free (dfa->eclosures);
617 re_free (dfa->inveclosures);
618 re_free (dfa->nodes);
3b0bdc72 619
ee70274a
UD
620 if (dfa->state_table)
621 for (i = 0; i <= dfa->state_hash_mask; ++i)
622 {
623 struct re_state_table_entry *entry = dfa->state_table + i;
624 for (j = 0; j < entry->num; ++j)
625 {
626 re_dfastate_t *state = entry->array[j];
627 free_state (state);
628 }
629 re_free (entry->array);
630 }
71ccd330 631 re_free (dfa->state_table);
65e6becf 632#ifdef RE_ENABLE_I18N
e40a38b3
UD
633 if (dfa->sb_char != utf8_sb_map)
634 re_free (dfa->sb_char);
a96c63ed 635#endif
0742e48e 636#ifdef DEBUG
71ccd330 637 re_free (dfa->re_str);
0742e48e 638#endif
71ccd330
UD
639
640 re_free (dfa);
641}
642
643
644/* Free dynamically allocated space used by PREG. */
645
646void
647regfree (preg)
648 regex_t *preg;
649{
650 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
651 if (BE (dfa != NULL, 1))
652 free_dfa_content (dfa);
86576b62
UD
653 preg->buffer = NULL;
654 preg->allocated = 0;
71ccd330 655
3b0bdc72 656 re_free (preg->fastmap);
86576b62
UD
657 preg->fastmap = NULL;
658
659 re_free (preg->translate);
660 preg->translate = NULL;
3b0bdc72
UD
661}
662#ifdef _LIBC
663weak_alias (__regfree, regfree)
664#endif
665\f
666/* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
668
669#if defined _REGEX_RE_COMP || defined _LIBC
670
671/* BSD has one and only one pattern buffer. */
672static struct re_pattern_buffer re_comp_buf;
673
674char *
675# ifdef _LIBC
676/* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679weak_function
680# endif
681re_comp (s)
682 const char *s;
683{
684 reg_errcode_t ret;
240e87c2 685 char *fastmap;
3b0bdc72
UD
686
687 if (!s)
688 {
689 if (!re_comp_buf.buffer)
690 return gettext ("No previous regular expression");
691 return 0;
692 }
693
240e87c2
RM
694 if (re_comp_buf.buffer)
695 {
696 fastmap = re_comp_buf.fastmap;
697 re_comp_buf.fastmap = NULL;
698 __regfree (&re_comp_buf);
1b2c2628 699 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
700 re_comp_buf.fastmap = fastmap;
701 }
702
703 if (re_comp_buf.fastmap == NULL)
3b0bdc72
UD
704 {
705 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
706 if (re_comp_buf.fastmap == NULL)
6455d255
UD
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
709 }
710
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
713
714 /* Match anchors at newlines. */
715 re_comp_buf.newline_anchor = 1;
716
717 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
718
719 if (!ret)
720 return NULL;
721
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455d255 723 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 724}
240e87c2
RM
725
726#ifdef _LIBC
c877418f 727libc_freeres_fn (free_mem)
240e87c2
RM
728{
729 __regfree (&re_comp_buf);
730}
240e87c2
RM
731#endif
732
3b0bdc72
UD
733#endif /* _REGEX_RE_COMP */
734\f
735/* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
738
739static reg_errcode_t
740re_compile_internal (preg, pattern, length, syntax)
741 regex_t *preg;
742 const char * pattern;
743 int length;
744 reg_syntax_t syntax;
745{
746 reg_errcode_t err = REG_NOERROR;
747 re_dfa_t *dfa;
748 re_string_t regexp;
749
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
754 preg->used = 0;
755 preg->re_nsub = 0;
1b2c2628
UD
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
758
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
951d6408 761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3b0bdc72
UD
762 {
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 if (dfa == NULL)
769 return REG_ESPACE;
3b0bdc72 770 preg->allocated = sizeof (re_dfa_t);
951d6408 771 preg->buffer = (unsigned char *) dfa;
3b0bdc72 772 }
3b0bdc72
UD
773 preg->used = sizeof (re_dfa_t);
774
775 err = init_dfa (dfa, length);
bc15410e 776 if (BE (err != REG_NOERROR, 0))
3b0bdc72 777 {
ee70274a 778 free_dfa_content (dfa);
3b0bdc72 779 preg->buffer = NULL;
ca3c505e 780 preg->allocated = 0;
3b0bdc72
UD
781 return err;
782 }
0742e48e
UD
783#ifdef DEBUG
784 dfa->re_str = re_malloc (char, length + 1);
785 strncpy (dfa->re_str, pattern, length + 1);
786#endif
3b0bdc72 787
612546c6 788 err = re_string_construct (&regexp, pattern, length, preg->translate,
f0c7c524 789 syntax & RE_ICASE, dfa);
bc15410e 790 if (BE (err != REG_NOERROR, 0))
3b0bdc72 791 {
ee70274a
UD
792 re_compile_internal_free_return:
793 free_workarea_compile (preg);
794 re_string_destruct (&regexp);
795 free_dfa_content (dfa);
3b0bdc72 796 preg->buffer = NULL;
ca3c505e 797 preg->allocated = 0;
3b0bdc72
UD
798 return err;
799 }
800
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 804 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
805 goto re_compile_internal_free_return;
806
14744156
UD
807#ifdef RE_ENABLE_I18N
808 /* If possible, do searching in single byte encoding to speed things up. */
ad7f28c2 809 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
14744156
UD
810 optimize_utf8 (dfa);
811#endif
812
3b0bdc72
UD
813 /* Analyze the tree and collect information which is necessary to
814 create the dfa. */
815 err = analyze (dfa);
bc15410e 816 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
817 goto re_compile_internal_free_return;
818
819 /* Then create the initial state of the dfa. */
820 err = create_initial_state (dfa);
3b0bdc72 821
3b0bdc72
UD
822 /* Release work areas. */
823 free_workarea_compile (preg);
824 re_string_destruct (&regexp);
825
71ccd330
UD
826 if (BE (err != REG_NOERROR, 0))
827 {
71ccd330
UD
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
ca3c505e 830 preg->allocated = 0;
71ccd330
UD
831 }
832
3b0bdc72
UD
833 return err;
834}
835
836/* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
838
839static reg_errcode_t
840init_dfa (dfa, pat_len)
841 re_dfa_t *dfa;
842 int pat_len;
843{
844 int table_size;
e40a38b3
UD
845#ifndef _LIBC
846 char *codeset_name;
847#endif
81c64d40
UD
848
849 memset (dfa, '\0', sizeof (re_dfa_t));
850
ee70274a
UD
851 /* Force allocation of str_tree_storage the first time. */
852 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
853
3b0bdc72
UD
854 dfa->nodes_alloc = pat_len + 1;
855 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
856
857 dfa->states_alloc = pat_len + 1;
858
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size = 1; table_size > 0; table_size <<= 1)
861 if (table_size > pat_len)
862 break;
863
864 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865 dfa->state_hash_mask = table_size - 1;
866
867 dfa->subexps_alloc = 1;
868 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
3b0bdc72 869
3c0fb574
UD
870 dfa->mb_cur_max = MB_CUR_MAX;
871#ifdef _LIBC
bb3f4825 872 if (dfa->mb_cur_max == 6
3c0fb574
UD
873 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
874 dfa->is_utf8 = 1;
f0c7c524
UD
875 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
876 != 0);
e40a38b3
UD
877#else
878# ifdef HAVE_LANGINFO_CODESET
879 codeset_name = nl_langinfo (CODESET);
880# else
881 codeset_name = getenv ("LC_ALL");
882 if (codeset_name == NULL || codeset[0] == '\0')
883 codeset_name = getenv ("LC_CTYPE");
884 if (codeset_name == NULL || codeset[0] == '\0')
885 codeset_name = getenv ("LANG");
886 if (codeset_name == NULL)
887 codeset_name = "";
888 else if (strchr (codeset_name, '.') != NULL)
889 codeset_name = strchr (codeset_name, '.') + 1;
890# endif
891
892 if (strcasecmp (codeset_name, "UTF-8") == 0
893 || strcasecmp (codeset_name, "UTF8") == 0)
894 dfa->is_utf8 = 1;
895
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
3c0fb574 899#endif
e40a38b3 900
65e6becf
UD
901#ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
903 {
65e6becf 904 if (dfa->is_utf8)
e40a38b3 905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
65e6becf 906 else
e40a38b3
UD
907 {
908 int i, j, ch;
909
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
911 if (BE (dfa->sb_char == NULL, 0))
912 return REG_ESPACE;
913
914 /* Clear all bits by, then set those corresponding to single
915 byte chars. */
916 bitset_empty (dfa->sb_char);
917
918 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
919 for (j = 0; j < UINT_BITS; ++j, ++ch)
920 {
921 wchar_t wch = __btowc (ch);
922 if (wch != WEOF)
923 dfa->sb_char[i] |= 1 << j;
924# ifndef _LIBC
925 if (isascii (ch) && wch != (wchar_t) ch)
926 dfa->map_notascii = 1;
927# endif
928 }
929 }
65e6becf
UD
930 }
931#endif
3c0fb574 932
bc15410e 933 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
15a7d175 934 || dfa->subexps == NULL, 0))
ee70274a 935 return REG_ESPACE;
3b0bdc72
UD
936 return REG_NOERROR;
937}
938
939/* Initialize WORD_CHAR table, which indicate which character is
940 "word". In this case "word" means that it is the word construction
941 character used by some operators like "\<", "\>", etc. */
942
56b168be 943static void
3b0bdc72
UD
944init_word_char (dfa)
945 re_dfa_t *dfa;
946{
947 int i, j, ch;
56b168be 948 dfa->word_ops_used = 1;
3b0bdc72
UD
949 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
950 for (j = 0; j < UINT_BITS; ++j, ++ch)
951 if (isalnum (ch) || ch == '_')
15a7d175 952 dfa->word_char[i] |= 1 << j;
3b0bdc72
UD
953}
954
955/* Free the work area which are only used while compiling. */
956
957static void
958free_workarea_compile (preg)
959 regex_t *preg;
960{
961 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
ee70274a
UD
962 bin_tree_storage_t *storage, *next;
963 for (storage = dfa->str_tree_storage; storage; storage = next)
964 {
965 next = storage->next;
966 re_free (storage);
967 }
968 dfa->str_tree_storage = NULL;
969 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3b0bdc72 970 dfa->str_tree = NULL;
a7d5c291
UD
971 re_free (dfa->org_indices);
972 dfa->org_indices = NULL;
3b0bdc72
UD
973}
974
975/* Create initial states for all contexts. */
976
977static reg_errcode_t
978create_initial_state (dfa)
979 re_dfa_t *dfa;
980{
981 int first, i;
982 reg_errcode_t err;
983 re_node_set init_nodes;
984
985 /* Initial states have the epsilon closure of the node which is
986 the first node of the regular expression. */
987 first = dfa->str_tree->first;
988 dfa->init_node = first;
989 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 990 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
991 return err;
992
993 /* The back-references which are in initial states can epsilon transit,
994 since in this case all of the subexpressions can be null.
995 Then we add epsilon closures of the nodes which are the next nodes of
996 the back-references. */
997 if (dfa->nbackref > 0)
998 for (i = 0; i < init_nodes.nelem; ++i)
999 {
15a7d175
UD
1000 int node_idx = init_nodes.elems[i];
1001 re_token_type_t type = dfa->nodes[node_idx].type;
1002
1003 int clexp_idx;
1004 if (type != OP_BACK_REF)
1005 continue;
1006 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1007 {
1008 re_token_t *clexp_node;
1009 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1010 if (clexp_node->type == OP_CLOSE_SUBEXP
1011 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
1012 break;
1013 }
1014 if (clexp_idx == init_nodes.nelem)
1015 continue;
1016
1017 if (type == OP_BACK_REF)
1018 {
1019 int dest_idx = dfa->edests[node_idx].elems[0];
1020 if (!re_node_set_contains (&init_nodes, dest_idx))
1021 {
1022 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1023 i = 0;
1024 }
1025 }
3b0bdc72
UD
1026 }
1027
1028 /* It must be the first time to invoke acquire_state. */
a9388965
UD
1029 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1030 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 1031 if (BE (dfa->init_state == NULL, 0))
a9388965 1032 return err;
3b0bdc72
UD
1033 if (dfa->init_state->has_constraint)
1034 {
a9388965 1035 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1036 CONTEXT_WORD);
a9388965 1037 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 1038 CONTEXT_NEWLINE);
a9388965 1039 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
1040 &init_nodes,
1041 CONTEXT_NEWLINE
1042 | CONTEXT_BEGBUF);
bc15410e 1043 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
15a7d175
UD
1044 || dfa->init_state_begbuf == NULL, 0))
1045 return err;
3b0bdc72
UD
1046 }
1047 else
1048 dfa->init_state_word = dfa->init_state_nl
1049 = dfa->init_state_begbuf = dfa->init_state;
1050
3b0bdc72
UD
1051 re_node_set_free (&init_nodes);
1052 return REG_NOERROR;
1053}
1054\f
14744156
UD
1055#ifdef RE_ENABLE_I18N
1056/* If it is possible to do searching in single byte encoding instead of UTF-8
1057 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058 DFA nodes where needed. */
1059
1060static void
1061optimize_utf8 (dfa)
1062 re_dfa_t *dfa;
1063{
ad7f28c2 1064 int node, i, mb_chars = 0, has_period = 0;
14744156
UD
1065
1066 for (node = 0; node < dfa->nodes_len; ++node)
1067 switch (dfa->nodes[node].type)
1068 {
1069 case CHARACTER:
14744156 1070 if (dfa->nodes[node].opr.c >= 0x80)
5f93cd52 1071 mb_chars = 1;
14744156
UD
1072 break;
1073 case ANCHOR:
1074 switch (dfa->nodes[node].opr.idx)
1075 {
1076 case LINE_FIRST:
1077 case LINE_LAST:
1078 case BUF_FIRST:
1079 case BUF_LAST:
1080 break;
1081 default:
1082 /* Word anchors etc. cannot be handled. */
1083 return;
1084 }
1085 break;
ad7f28c2
UD
1086 case OP_PERIOD:
1087 has_period = 1;
1088 break;
14744156
UD
1089 case OP_BACK_REF:
1090 case OP_ALT:
1091 case END_OF_RE:
14744156
UD
1092 case OP_DUP_ASTERISK:
1093 case OP_DUP_QUESTION:
14744156
UD
1094 case OP_OPEN_SUBEXP:
1095 case OP_CLOSE_SUBEXP:
1096 break;
a8067e8f
UD
1097 case SIMPLE_BRACKET:
1098 /* Just double check. */
1099 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1100 if (dfa->nodes[node].opr.sbcset[i])
1101 return;
1102 break;
14744156
UD
1103 default:
1104 return;
1105 }
1106
ad7f28c2 1107 if (mb_chars || has_period)
5f93cd52 1108 for (node = 0; node < dfa->nodes_len; ++node)
ad7f28c2
UD
1109 {
1110 if (dfa->nodes[node].type == CHARACTER
1111 && dfa->nodes[node].opr.c >= 0x80)
1112 dfa->nodes[node].mb_partial = 0;
1113 else if (dfa->nodes[node].type == OP_PERIOD)
1114 dfa->nodes[node].type = OP_UTF8_PERIOD;
1115 }
5f93cd52 1116
14744156
UD
1117 /* The search can be in single byte locale. */
1118 dfa->mb_cur_max = 1;
1119 dfa->is_utf8 = 0;
ad7f28c2 1120 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
14744156
UD
1121}
1122#endif
1123\f
3b0bdc72
UD
1124/* Analyze the structure tree, and calculate "first", "next", "edest",
1125 "eclosure", and "inveclosure". */
1126
1127static reg_errcode_t
1128analyze (dfa)
1129 re_dfa_t *dfa;
1130{
1131 int i;
1132 reg_errcode_t ret;
1133
1134 /* Allocate arrays. */
3b0bdc72 1135 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
a7d5c291 1136 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3b0bdc72
UD
1137 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1138 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1139 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
a7d5c291 1140 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
15a7d175 1141 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
3b0bdc72
UD
1142 return REG_ESPACE;
1143 /* Initialize them. */
1144 for (i = 0; i < dfa->nodes_len; ++i)
1145 {
3b0bdc72
UD
1146 dfa->nexts[i] = -1;
1147 re_node_set_init_empty (dfa->edests + i);
1148 re_node_set_init_empty (dfa->eclosures + i);
1149 re_node_set_init_empty (dfa->inveclosures + i);
1150 }
1151
1152 ret = analyze_tree (dfa, dfa->str_tree);
bc15410e 1153 if (BE (ret == REG_NOERROR, 1))
3b0bdc72
UD
1154 {
1155 ret = calc_eclosure (dfa);
1156 if (ret == REG_NOERROR)
1157 calc_inveclosure (dfa);
1158 }
1159 return ret;
1160}
1161
1162/* Helper functions for analyze.
1163 This function calculate "first", "next", and "edest" for the subtree
1164 whose root is NODE. */
1165
1166static reg_errcode_t
1167analyze_tree (dfa, node)
1168 re_dfa_t *dfa;
1169 bin_tree_t *node;
1170{
1171 reg_errcode_t ret;
1172 if (node->first == -1)
1173 calc_first (dfa, node);
1174 if (node->next == -1)
1175 calc_next (dfa, node);
1176 if (node->eclosure.nelem == 0)
1177 calc_epsdest (dfa, node);
1178 /* Calculate "first" etc. for the left child. */
1179 if (node->left != NULL)
1180 {
1181 ret = analyze_tree (dfa, node->left);
bc15410e 1182 if (BE (ret != REG_NOERROR, 0))
15a7d175 1183 return ret;
3b0bdc72
UD
1184 }
1185 /* Calculate "first" etc. for the right child. */
1186 if (node->right != NULL)
1187 {
1188 ret = analyze_tree (dfa, node->right);
bc15410e 1189 if (BE (ret != REG_NOERROR, 0))
15a7d175 1190 return ret;
3b0bdc72
UD
1191 }
1192 return REG_NOERROR;
1193}
1194
1195/* Calculate "first" for the node NODE. */
1196static void
1197calc_first (dfa, node)
1198 re_dfa_t *dfa;
1199 bin_tree_t *node;
1200{
1201 int idx, type;
1202 idx = node->node_idx;
1203 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1204
1205 switch (type)
1206 {
1207#ifdef DEBUG
3b0bdc72
UD
1208 case OP_OPEN_BRACKET:
1209 case OP_CLOSE_BRACKET:
1210 case OP_OPEN_DUP_NUM:
1211 case OP_CLOSE_DUP_NUM:
6b6557e8 1212 case OP_DUP_PLUS:
3b0bdc72
UD
1213 case OP_NON_MATCH_LIST:
1214 case OP_OPEN_COLL_ELEM:
1215 case OP_CLOSE_COLL_ELEM:
1216 case OP_OPEN_EQUIV_CLASS:
1217 case OP_CLOSE_EQUIV_CLASS:
1218 case OP_OPEN_CHAR_CLASS:
1219 case OP_CLOSE_CHAR_CLASS:
7de66108 1220 /* These must not appear here. */
3b0bdc72
UD
1221 assert (0);
1222#endif
1223 case END_OF_RE:
1224 case CHARACTER:
1225 case OP_PERIOD:
1226 case OP_DUP_ASTERISK:
1227 case OP_DUP_QUESTION:
c0a0f9a3 1228#ifdef RE_ENABLE_I18N
ad7f28c2 1229 case OP_UTF8_PERIOD:
3b0bdc72 1230 case COMPLEX_BRACKET:
c0a0f9a3 1231#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1232 case SIMPLE_BRACKET:
1233 case OP_BACK_REF:
1234 case ANCHOR:
81c64d40
UD
1235 case OP_OPEN_SUBEXP:
1236 case OP_CLOSE_SUBEXP:
3b0bdc72
UD
1237 node->first = idx;
1238 break;
3b0bdc72
UD
1239 case OP_ALT:
1240 node->first = idx;
1241 break;
3b0bdc72
UD
1242 /* else fall through */
1243 default:
1244#ifdef DEBUG
1245 assert (node->left != NULL);
1246#endif
1247 if (node->left->first == -1)
15a7d175 1248 calc_first (dfa, node->left);
3b0bdc72
UD
1249 node->first = node->left->first;
1250 break;
1251 }
3b0bdc72
UD
1252}
1253
1254/* Calculate "next" for the node NODE. */
1255
1256static void
1257calc_next (dfa, node)
1258 re_dfa_t *dfa;
1259 bin_tree_t *node;
1260{
1261 int idx, type;
1262 bin_tree_t *parent = node->parent;
1263 if (parent == NULL)
1264 {
1265 node->next = -1;
1266 idx = node->node_idx;
1267 if (node->type == 0)
15a7d175 1268 dfa->nexts[idx] = node->next;
3b0bdc72
UD
1269 return;
1270 }
1271
1272 idx = parent->node_idx;
1273 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1274
1275 switch (type)
1276 {
1277 case OP_DUP_ASTERISK:
3b0bdc72
UD
1278 node->next = idx;
1279 break;
1280 case CONCAT:
1281 if (parent->left == node)
15a7d175
UD
1282 {
1283 if (parent->right->first == -1)
1284 calc_first (dfa, parent->right);
1285 node->next = parent->right->first;
1286 break;
1287 }
3b0bdc72
UD
1288 /* else fall through */
1289 default:
1290 if (parent->next == -1)
15a7d175 1291 calc_next (dfa, parent);
3b0bdc72
UD
1292 node->next = parent->next;
1293 break;
1294 }
1295 idx = node->node_idx;
1296 if (node->type == 0)
1297 dfa->nexts[idx] = node->next;
1298}
1299
1300/* Calculate "edest" for the node NODE. */
1301
1302static void
1303calc_epsdest (dfa, node)
1304 re_dfa_t *dfa;
1305 bin_tree_t *node;
1306{
1307 int idx;
1308 idx = node->node_idx;
1309 if (node->type == 0)
1310 {
1311 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
15a7d175
UD
1312 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1313 {
1314 if (node->left->first == -1)
1315 calc_first (dfa, node->left);
1316 if (node->next == -1)
1317 calc_next (dfa, node);
1318 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1319 node->next);
1320 }
3b0bdc72 1321 else if (dfa->nodes[idx].type == OP_ALT)
15a7d175
UD
1322 {
1323 int left, right;
1324 if (node->left != NULL)
1325 {
1326 if (node->left->first == -1)
1327 calc_first (dfa, node->left);
1328 left = node->left->first;
1329 }
1330 else
1331 {
1332 if (node->next == -1)
1333 calc_next (dfa, node);
1334 left = node->next;
1335 }
1336 if (node->right != NULL)
1337 {
1338 if (node->right->first == -1)
1339 calc_first (dfa, node->right);
1340 right = node->right->first;
1341 }
1342 else
1343 {
1344 if (node->next == -1)
1345 calc_next (dfa, node);
1346 right = node->next;
1347 }
1348 re_node_set_init_2 (dfa->edests + idx, left, right);
1349 }
81c64d40 1350 else if (dfa->nodes[idx].type == ANCHOR
15a7d175
UD
1351 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1352 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1353 || dfa->nodes[idx].type == OP_BACK_REF)
1354 re_node_set_init_1 (dfa->edests + idx, node->next);
1769a73f
UD
1355 else
1356 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
3b0bdc72
UD
1357 }
1358}
1359
485d775d
UD
1360/* Duplicate the epsilon closure of the node ROOT_NODE.
1361 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1362 to their own constraint. */
1363
1364static reg_errcode_t
1365duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
15a7d175 1366 init_constraint)
485d775d
UD
1367 re_dfa_t *dfa;
1368 int top_org_node, top_clone_node, root_node;
1369 unsigned int init_constraint;
1370{
1371 reg_errcode_t err;
1372 int org_node, clone_node, ret;
1373 unsigned int constraint = init_constraint;
1374 for (org_node = top_org_node, clone_node = top_clone_node;;)
1375 {
1376 int org_dest, clone_dest;
1377 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1378 {
485d775d
UD
1379 /* If the back reference epsilon-transit, its destination must
1380 also have the constraint. Then duplicate the epsilon closure
1381 of the destination of the back reference, and store it in
1382 edests of the back reference. */
15a7d175
UD
1383 org_dest = dfa->nexts[org_node];
1384 re_node_set_empty (dfa->edests + clone_node);
1385 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1386 if (BE (err != REG_NOERROR, 0))
1387 return err;
1388 dfa->nexts[clone_node] = dfa->nexts[org_node];
1389 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1390 if (BE (ret < 0, 0))
1391 return REG_ESPACE;
1392 }
485d775d 1393 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1394 {
485d775d
UD
1395 /* In case of the node can't epsilon-transit, don't duplicate the
1396 destination and store the original destination as the
1397 destination of the node. */
15a7d175
UD
1398 dfa->nexts[clone_node] = dfa->nexts[org_node];
1399 break;
1400 }
485d775d 1401 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1402 {
485d775d
UD
1403 /* In case of the node can epsilon-transit, and it has only one
1404 destination. */
15a7d175
UD
1405 org_dest = dfa->edests[org_node].elems[0];
1406 re_node_set_empty (dfa->edests + clone_node);
1407 if (dfa->nodes[org_node].type == ANCHOR)
1408 {
485d775d 1409 /* In case of the node has another constraint, append it. */
15a7d175
UD
1410 if (org_node == root_node && clone_node != org_node)
1411 {
485d775d
UD
1412 /* ...but if the node is root_node itself, it means the
1413 epsilon closure have a loop, then tie it to the
1414 destination of the root_node. */
15a7d175
UD
1415 ret = re_node_set_insert (dfa->edests + clone_node,
1416 org_dest);
1417 if (BE (ret < 0, 0))
1418 return REG_ESPACE;
1419 break;
1420 }
1421 constraint |= dfa->nodes[org_node].opr.ctx_type;
1422 }
1423 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1424 if (BE (err != REG_NOERROR, 0))
1425 return err;
1426 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1427 if (BE (ret < 0, 0))
1428 return REG_ESPACE;
1429 }
485d775d 1430 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1431 {
485d775d 1432 /* In case of the node can epsilon-transit, and it has two
a7d5c291 1433 destinations. E.g. '|', '*', '+', '?'. */
15a7d175
UD
1434 org_dest = dfa->edests[org_node].elems[0];
1435 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1436 /* Search for a duplicated node which satisfies the constraint. */
1437 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1438 if (clone_dest == -1)
1439 {
1440 /* There are no such a duplicated node, create a new one. */
1441 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1442 if (BE (err != REG_NOERROR, 0))
1443 return err;
7de66108
UD
1444 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1445 if (BE (ret < 0, 0))
1446 return REG_ESPACE;
1447 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1448 root_node, constraint);
1449 if (BE (err != REG_NOERROR, 0))
1450 return err;
a7d5c291
UD
1451 }
1452 else
1453 {
1454 /* There are a duplicated node which satisfy the constraint,
1455 use it to avoid infinite loop. */
1456 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457 if (BE (ret < 0, 0))
1458 return REG_ESPACE;
1459 }
15a7d175
UD
1460
1461 org_dest = dfa->edests[org_node].elems[1];
1462 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1463 if (BE (err != REG_NOERROR, 0))
1464 return err;
1465 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1466 if (BE (ret < 0, 0))
1467 return REG_ESPACE;
1468 }
485d775d
UD
1469 org_node = org_dest;
1470 clone_node = clone_dest;
1471 }
1472 return REG_NOERROR;
1473}
1474
a7d5c291
UD
1475/* Search for a node which is duplicated from the node ORG_NODE, and
1476 satisfies the constraint CONSTRAINT. */
1477
1478static int
1479search_duplicated_node (dfa, org_node, constraint)
1480 re_dfa_t *dfa;
1481 int org_node;
1482 unsigned int constraint;
1483{
1484 int idx;
1485 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1486 {
1487 if (org_node == dfa->org_indices[idx]
1488 && constraint == dfa->nodes[idx].constraint)
1489 return idx; /* Found. */
1490 }
1491 return -1; /* Not found. */
1492}
1493
a9388965
UD
1494/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1495 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1496 otherwise return the error code. */
1497
1498static reg_errcode_t
1499duplicate_node (new_idx, dfa, org_idx, constraint)
3b0bdc72 1500 re_dfa_t *dfa;
a9388965 1501 int *new_idx, org_idx;
3b0bdc72
UD
1502 unsigned int constraint;
1503{
7de66108 1504 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
485d775d 1505 if (BE (dup_idx == -1, 0))
a9388965 1506 return REG_ESPACE;
485d775d
UD
1507 dfa->nodes[dup_idx].constraint = constraint;
1508 if (dfa->nodes[org_idx].type == ANCHOR)
1509 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
a9388965 1510 dfa->nodes[dup_idx].duplicated = 1;
485d775d
UD
1511 re_node_set_init_empty (dfa->edests + dup_idx);
1512 re_node_set_init_empty (dfa->eclosures + dup_idx);
1513 re_node_set_init_empty (dfa->inveclosures + dup_idx);
3b0bdc72 1514
a7d5c291
UD
1515 /* Store the index of the original node. */
1516 dfa->org_indices[dup_idx] = org_idx;
a9388965
UD
1517 *new_idx = dup_idx;
1518 return REG_NOERROR;
3b0bdc72
UD
1519}
1520
1521static void
1522calc_inveclosure (dfa)
1523 re_dfa_t *dfa;
1524{
485d775d 1525 int src, idx, dest;
3b0bdc72
UD
1526 for (src = 0; src < dfa->nodes_len; ++src)
1527 {
1528 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175
UD
1529 {
1530 dest = dfa->eclosures[src].elems[idx];
1531 re_node_set_insert (dfa->inveclosures + dest, src);
1532 }
3b0bdc72
UD
1533 }
1534}
1535
1536/* Calculate "eclosure" for all the node in DFA. */
1537
1538static reg_errcode_t
1539calc_eclosure (dfa)
1540 re_dfa_t *dfa;
1541{
485d775d 1542 int node_idx, incomplete;
3b0bdc72
UD
1543#ifdef DEBUG
1544 assert (dfa->nodes_len > 0);
1545#endif
485d775d 1546 incomplete = 0;
3b0bdc72 1547 /* For each nodes, calculate epsilon closure. */
485d775d 1548 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1549 {
a9388965 1550 reg_errcode_t err;
3b0bdc72 1551 re_node_set eclosure_elem;
485d775d 1552 if (node_idx == dfa->nodes_len)
15a7d175
UD
1553 {
1554 if (!incomplete)
1555 break;
1556 incomplete = 0;
1557 node_idx = 0;
1558 }
3b0bdc72
UD
1559
1560#ifdef DEBUG
3b0bdc72
UD
1561 assert (dfa->eclosures[node_idx].nelem != -1);
1562#endif
1563 /* If we have already calculated, skip it. */
1564 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1565 continue;
3b0bdc72 1566 /* Calculate epsilon closure of `node_idx'. */
a9388965 1567 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
bc15410e 1568 if (BE (err != REG_NOERROR, 0))
15a7d175 1569 return err;
3b0bdc72
UD
1570
1571 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175
UD
1572 {
1573 incomplete = 1;
1574 re_node_set_free (&eclosure_elem);
1575 }
3b0bdc72 1576 }
3b0bdc72
UD
1577 return REG_NOERROR;
1578}
1579
1580/* Calculate epsilon closure of NODE. */
1581
a9388965
UD
1582static reg_errcode_t
1583calc_eclosure_iter (new_set, dfa, node, root)
1584 re_node_set *new_set;
3b0bdc72
UD
1585 re_dfa_t *dfa;
1586 int node, root;
1587{
a9388965 1588 reg_errcode_t err;
3b0bdc72 1589 unsigned int constraint;
485d775d 1590 int i, incomplete;
3b0bdc72 1591 re_node_set eclosure;
485d775d 1592 incomplete = 0;
a9388965 1593 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1594 if (BE (err != REG_NOERROR, 0))
a9388965 1595 return err;
3b0bdc72
UD
1596
1597 /* This indicates that we are calculating this node now.
1598 We reference this value to avoid infinite loop. */
1599 dfa->eclosures[node].nelem = -1;
1600
1601 constraint = ((dfa->nodes[node].type == ANCHOR)
15a7d175 1602 ? dfa->nodes[node].opr.ctx_type : 0);
485d775d
UD
1603 /* If the current node has constraints, duplicate all nodes.
1604 Since they must inherit the constraints. */
1605 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1606 {
1607 int org_node, cur_node;
1608 org_node = cur_node = node;
1609 err = duplicate_node_closure (dfa, node, node, node, constraint);
1610 if (BE (err != REG_NOERROR, 0))
15a7d175 1611 return err;
485d775d 1612 }
3b0bdc72
UD
1613
1614 /* Expand each epsilon destination nodes. */
485d775d 1615 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1616 for (i = 0; i < dfa->edests[node].nelem; ++i)
1617 {
15a7d175
UD
1618 re_node_set eclosure_elem;
1619 int edest = dfa->edests[node].elems[i];
1620 /* If calculating the epsilon closure of `edest' is in progress,
1621 return intermediate result. */
1622 if (dfa->eclosures[edest].nelem == -1)
1623 {
1624 incomplete = 1;
1625 continue;
1626 }
1627 /* If we haven't calculated the epsilon closure of `edest' yet,
1628 calculate now. Otherwise use calculated epsilon closure. */
1629 if (dfa->eclosures[edest].nelem == 0)
1630 {
1631 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1632 if (BE (err != REG_NOERROR, 0))
1633 return err;
1634 }
1635 else
1636 eclosure_elem = dfa->eclosures[edest];
1637 /* Merge the epsilon closure of `edest'. */
1638 re_node_set_merge (&eclosure, &eclosure_elem);
1639 /* If the epsilon closure of `edest' is incomplete,
1640 the epsilon closure of this node is also incomplete. */
1641 if (dfa->eclosures[edest].nelem == 0)
1642 {
1643 incomplete = 1;
1644 re_node_set_free (&eclosure_elem);
1645 }
3b0bdc72
UD
1646 }
1647
3b0bdc72
UD
1648 /* Epsilon closures include itself. */
1649 re_node_set_insert (&eclosure, node);
1650 if (incomplete && !root)
1651 dfa->eclosures[node].nelem = 0;
1652 else
1653 dfa->eclosures[node] = eclosure;
a9388965
UD
1654 *new_set = eclosure;
1655 return REG_NOERROR;
3b0bdc72
UD
1656}
1657\f
1658/* Functions for token which are used in the parser. */
1659
1660/* Fetch a token from INPUT.
1661 We must not use this function inside bracket expressions. */
1662
f0d77aa8
UD
1663static void
1664fetch_token (result, input, syntax)
1665 re_token_t *result;
3b0bdc72
UD
1666 re_string_t *input;
1667 reg_syntax_t syntax;
1668{
f0d77aa8 1669 re_string_skip_bytes (input, peek_token (result, input, syntax));
3b0bdc72
UD
1670}
1671
1672/* Peek a token from INPUT, and return the length of the token.
1673 We must not use this function inside bracket expressions. */
1674
1675static int
1676peek_token (token, input, syntax)
1677 re_token_t *token;
1678 re_string_t *input;
1679 reg_syntax_t syntax;
1680{
1681 unsigned char c;
1682
1683 if (re_string_eoi (input))
1684 {
1685 token->type = END_OF_RE;
1686 return 0;
1687 }
1688
1689 c = re_string_peek_byte (input, 0);
1690 token->opr.c = c;
1691
65e6becf 1692 token->word_char = 0;
3b0bdc72
UD
1693#ifdef RE_ENABLE_I18N
1694 token->mb_partial = 0;
3c0fb574 1695 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1696 !re_string_first_byte (input, re_string_cur_idx (input)))
1697 {
1698 token->type = CHARACTER;
1699 token->mb_partial = 1;
1700 return 1;
1701 }
1702#endif
1703 if (c == '\\')
1704 {
1705 unsigned char c2;
1706 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1707 {
1708 token->type = BACK_SLASH;
1709 return 1;
1710 }
3b0bdc72
UD
1711
1712 c2 = re_string_peek_byte_case (input, 1);
1713 token->opr.c = c2;
1714 token->type = CHARACTER;
65e6becf
UD
1715#ifdef RE_ENABLE_I18N
1716 if (input->mb_cur_max > 1)
1717 {
1718 wint_t wc = re_string_wchar_at (input,
1719 re_string_cur_idx (input) + 1);
1720 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1721 }
1722 else
1723#endif
1724 token->word_char = IS_WORD_CHAR (c2) != 0;
1725
3b0bdc72 1726 switch (c2)
15a7d175
UD
1727 {
1728 case '|':
1729 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1730 token->type = OP_ALT;
1731 break;
1732 case '1': case '2': case '3': case '4': case '5':
1733 case '6': case '7': case '8': case '9':
1734 if (!(syntax & RE_NO_BK_REFS))
1735 {
1736 token->type = OP_BACK_REF;
1737 token->opr.idx = c2 - '0';
1738 }
1739 break;
1740 case '<':
1741 if (!(syntax & RE_NO_GNU_OPS))
1742 {
1743 token->type = ANCHOR;
bb3f4825 1744 token->opr.ctx_type = WORD_FIRST;
15a7d175
UD
1745 }
1746 break;
1747 case '>':
1748 if (!(syntax & RE_NO_GNU_OPS))
1749 {
1750 token->type = ANCHOR;
bb3f4825 1751 token->opr.ctx_type = WORD_LAST;
15a7d175
UD
1752 }
1753 break;
1754 case 'b':
1755 if (!(syntax & RE_NO_GNU_OPS))
1756 {
1757 token->type = ANCHOR;
bb3f4825 1758 token->opr.ctx_type = WORD_DELIM;
15a7d175
UD
1759 }
1760 break;
1761 case 'B':
1762 if (!(syntax & RE_NO_GNU_OPS))
1763 {
1764 token->type = ANCHOR;
bb3f4825 1765 token->opr.ctx_type = INSIDE_WORD;
15a7d175
UD
1766 }
1767 break;
1768 case 'w':
1769 if (!(syntax & RE_NO_GNU_OPS))
1770 token->type = OP_WORD;
1771 break;
1772 case 'W':
1773 if (!(syntax & RE_NO_GNU_OPS))
1774 token->type = OP_NOTWORD;
1775 break;
e2b6bfa3
UD
1776 case 's':
1777 if (!(syntax & RE_NO_GNU_OPS))
1778 token->type = OP_SPACE;
1779 break;
1780 case 'S':
1781 if (!(syntax & RE_NO_GNU_OPS))
1782 token->type = OP_NOTSPACE;
1783 break;
15a7d175
UD
1784 case '`':
1785 if (!(syntax & RE_NO_GNU_OPS))
1786 {
1787 token->type = ANCHOR;
bb3f4825 1788 token->opr.ctx_type = BUF_FIRST;
15a7d175
UD
1789 }
1790 break;
1791 case '\'':
1792 if (!(syntax & RE_NO_GNU_OPS))
1793 {
1794 token->type = ANCHOR;
bb3f4825 1795 token->opr.ctx_type = BUF_LAST;
15a7d175
UD
1796 }
1797 break;
1798 case '(':
1799 if (!(syntax & RE_NO_BK_PARENS))
1800 token->type = OP_OPEN_SUBEXP;
1801 break;
1802 case ')':
1803 if (!(syntax & RE_NO_BK_PARENS))
1804 token->type = OP_CLOSE_SUBEXP;
1805 break;
1806 case '+':
1807 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1808 token->type = OP_DUP_PLUS;
1809 break;
1810 case '?':
1811 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1812 token->type = OP_DUP_QUESTION;
1813 break;
1814 case '{':
1815 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1816 token->type = OP_OPEN_DUP_NUM;
1817 break;
1818 case '}':
1819 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1820 token->type = OP_CLOSE_DUP_NUM;
1821 break;
1822 default:
1823 break;
1824 }
3b0bdc72
UD
1825 return 2;
1826 }
1827
1828 token->type = CHARACTER;
65e6becf
UD
1829#ifdef RE_ENABLE_I18N
1830 if (input->mb_cur_max > 1)
1831 {
1832 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1833 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1834 }
1835 else
1836#endif
1837 token->word_char = IS_WORD_CHAR (token->opr.c);
1838
3b0bdc72
UD
1839 switch (c)
1840 {
1841 case '\n':
1842 if (syntax & RE_NEWLINE_ALT)
15a7d175 1843 token->type = OP_ALT;
3b0bdc72
UD
1844 break;
1845 case '|':
1846 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1847 token->type = OP_ALT;
3b0bdc72
UD
1848 break;
1849 case '*':
1850 token->type = OP_DUP_ASTERISK;
1851 break;
1852 case '+':
1853 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1854 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1855 break;
1856 case '?':
1857 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1858 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1859 break;
1860 case '{':
1861 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1862 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1863 break;
1864 case '}':
1865 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1866 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1867 break;
1868 case '(':
1869 if (syntax & RE_NO_BK_PARENS)
15a7d175 1870 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1871 break;
1872 case ')':
1873 if (syntax & RE_NO_BK_PARENS)
15a7d175 1874 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1875 break;
1876 case '[':
1877 token->type = OP_OPEN_BRACKET;
1878 break;
1879 case '.':
1880 token->type = OP_PERIOD;
1881 break;
1882 case '^':
134abcb5 1883 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
15a7d175
UD
1884 re_string_cur_idx (input) != 0)
1885 {
1886 char prev = re_string_peek_byte (input, -1);
134abcb5 1887 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1888 break;
1889 }
3b0bdc72 1890 token->type = ANCHOR;
bb3f4825 1891 token->opr.ctx_type = LINE_FIRST;
3b0bdc72
UD
1892 break;
1893 case '$':
1894 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
15a7d175
UD
1895 re_string_cur_idx (input) + 1 != re_string_length (input))
1896 {
1897 re_token_t next;
1898 re_string_skip_bytes (input, 1);
1899 peek_token (&next, input, syntax);
1900 re_string_skip_bytes (input, -1);
1901 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1902 break;
1903 }
3b0bdc72 1904 token->type = ANCHOR;
bb3f4825 1905 token->opr.ctx_type = LINE_LAST;
3b0bdc72
UD
1906 break;
1907 default:
1908 break;
1909 }
1910 return 1;
1911}
1912
1913/* Peek a token from INPUT, and return the length of the token.
1914 We must not use this function out of bracket expressions. */
1915
1916static int
1917peek_token_bracket (token, input, syntax)
1918 re_token_t *token;
1919 re_string_t *input;
1920 reg_syntax_t syntax;
1921{
1922 unsigned char c;
1923 if (re_string_eoi (input))
1924 {
1925 token->type = END_OF_RE;
1926 return 0;
1927 }
1928 c = re_string_peek_byte (input, 0);
1929 token->opr.c = c;
1930
1931#ifdef RE_ENABLE_I18N
3c0fb574 1932 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1933 !re_string_first_byte (input, re_string_cur_idx (input)))
1934 {
1935 token->type = CHARACTER;
1936 return 1;
1937 }
1938#endif /* RE_ENABLE_I18N */
1939
294b6bcc
UD
1940 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1941 && re_string_cur_idx (input) + 1 < re_string_length (input))
3b0bdc72
UD
1942 {
1943 /* In this case, '\' escape a character. */
1944 unsigned char c2;
82bbb29e
UD
1945 re_string_skip_bytes (input, 1);
1946 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
1947 token->opr.c = c2;
1948 token->type = CHARACTER;
1949 return 1;
1950 }
1951 if (c == '[') /* '[' is a special char in a bracket exps. */
1952 {
1953 unsigned char c2;
1954 int token_len;
294b6bcc
UD
1955 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1956 c2 = re_string_peek_byte (input, 1);
1957 else
1958 c2 = 0;
3b0bdc72
UD
1959 token->opr.c = c2;
1960 token_len = 2;
1961 switch (c2)
15a7d175
UD
1962 {
1963 case '.':
1964 token->type = OP_OPEN_COLL_ELEM;
1965 break;
1966 case '=':
1967 token->type = OP_OPEN_EQUIV_CLASS;
1968 break;
1969 case ':':
1970 if (syntax & RE_CHAR_CLASSES)
1971 {
1972 token->type = OP_OPEN_CHAR_CLASS;
1973 break;
1974 }
1975 /* else fall through. */
1976 default:
1977 token->type = CHARACTER;
1978 token->opr.c = c;
1979 token_len = 1;
1980 break;
1981 }
3b0bdc72
UD
1982 return token_len;
1983 }
1984 switch (c)
1985 {
1986 case '-':
1987 token->type = OP_CHARSET_RANGE;
1988 break;
1989 case ']':
1990 token->type = OP_CLOSE_BRACKET;
1991 break;
1992 case '^':
1993 token->type = OP_NON_MATCH_LIST;
1994 break;
1995 default:
1996 token->type = CHARACTER;
1997 }
1998 return 1;
1999}
2000\f
2001/* Functions for parser. */
2002
2003/* Entry point of the parser.
2004 Parse the regular expression REGEXP and return the structure tree.
2005 If an error is occured, ERR is set by error code, and return NULL.
2006 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
2007 CAT
2008 / \
2009 / \
3b0bdc72
UD
2010 <reg_exp> EOR
2011
2012 CAT means concatenation.
2013 EOR means end of regular expression. */
2014
2015static bin_tree_t *
2016parse (regexp, preg, syntax, err)
2017 re_string_t *regexp;
2018 regex_t *preg;
2019 reg_syntax_t syntax;
2020 reg_errcode_t *err;
2021{
2022 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2023 bin_tree_t *tree, *eor, *root;
2024 re_token_t current_token;
56b168be 2025 dfa->syntax = syntax;
f0d77aa8 2026 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2027 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 2028 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72 2029 return NULL;
c5c9c929 2030 eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
3b0bdc72 2031 if (tree != NULL)
ee70274a 2032 root = create_tree (dfa, tree, eor, CONCAT, 0);
3b0bdc72
UD
2033 else
2034 root = eor;
ee70274a 2035 if (BE (eor == NULL || root == NULL, 0))
485d775d
UD
2036 {
2037 *err = REG_ESPACE;
2038 return NULL;
2039 }
3b0bdc72
UD
2040 return root;
2041}
2042
2043/* This function build the following tree, from regular expression
2044 <branch1>|<branch2>:
15a7d175
UD
2045 ALT
2046 / \
2047 / \
3b0bdc72
UD
2048 <branch1> <branch2>
2049
2050 ALT means alternative, which represents the operator `|'. */
2051
2052static bin_tree_t *
2053parse_reg_exp (regexp, preg, token, syntax, nest, err)
2054 re_string_t *regexp;
2055 regex_t *preg;
2056 re_token_t *token;
2057 reg_syntax_t syntax;
2058 int nest;
2059 reg_errcode_t *err;
2060{
2061 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2062 bin_tree_t *tree, *branch = NULL;
3b0bdc72 2063 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 2064 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2065 return NULL;
2066
2067 while (token->type == OP_ALT)
2068 {
2069 re_token_t alt_token = *token;
f0d77aa8 2070 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 2071 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
2072 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2073 {
2074 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2075 if (BE (*err != REG_NOERROR && branch == NULL, 0))
ee70274a 2076 return NULL;
15a7d175 2077 }
9b88fc16
UD
2078 else
2079 branch = NULL;
c5c9c929 2080 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
ee70274a 2081 if (BE (tree == NULL, 0))
15a7d175
UD
2082 {
2083 *err = REG_ESPACE;
2084 return NULL;
2085 }
0742e48e 2086 dfa->has_plural_match = 1;
3b0bdc72
UD
2087 }
2088 return tree;
2089}
2090
2091/* This function build the following tree, from regular expression
2092 <exp1><exp2>:
15a7d175
UD
2093 CAT
2094 / \
3b0bdc72
UD
2095 / \
2096 <exp1> <exp2>
2097
2098 CAT means concatenation. */
2099
2100static bin_tree_t *
2101parse_branch (regexp, preg, token, syntax, nest, err)
2102 re_string_t *regexp;
2103 regex_t *preg;
2104 re_token_t *token;
2105 reg_syntax_t syntax;
2106 int nest;
2107 reg_errcode_t *err;
2108{
2109 bin_tree_t *tree, *exp;
ee70274a 2110 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3b0bdc72 2111 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2112 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2113 return NULL;
2114
2115 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 2116 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72
UD
2117 {
2118 exp = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2119 if (BE (*err != REG_NOERROR && exp == NULL, 0))
15a7d175 2120 {
15a7d175
UD
2121 return NULL;
2122 }
3b0bdc72 2123 if (tree != NULL && exp != NULL)
15a7d175 2124 {
ee70274a 2125 tree = create_tree (dfa, tree, exp, CONCAT, 0);
15a7d175
UD
2126 if (tree == NULL)
2127 {
2128 *err = REG_ESPACE;
2129 return NULL;
2130 }
2131 }
3b0bdc72 2132 else if (tree == NULL)
15a7d175 2133 tree = exp;
3b0bdc72
UD
2134 /* Otherwise exp == NULL, we don't need to create new tree. */
2135 }
2136 return tree;
2137}
2138
2139/* This function build the following tree, from regular expression a*:
15a7d175
UD
2140 *
2141 |
2142 a
3b0bdc72
UD
2143*/
2144
2145static bin_tree_t *
2146parse_expression (regexp, preg, token, syntax, nest, err)
2147 re_string_t *regexp;
2148 regex_t *preg;
2149 re_token_t *token;
2150 reg_syntax_t syntax;
2151 int nest;
2152 reg_errcode_t *err;
2153{
2154 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2155 bin_tree_t *tree;
3b0bdc72
UD
2156 switch (token->type)
2157 {
2158 case CHARACTER:
c5c9c929 2159 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2160 if (BE (tree == NULL, 0))
15a7d175
UD
2161 {
2162 *err = REG_ESPACE;
2163 return NULL;
2164 }
3b0bdc72 2165#ifdef RE_ENABLE_I18N
3c0fb574 2166 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2167 {
2168 while (!re_string_eoi (regexp)
2169 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2170 {
15a7d175 2171 bin_tree_t *mbc_remain;
f0d77aa8 2172 fetch_token (token, regexp, syntax);
c5c9c929 2173 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a
UD
2174 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2175 if (BE (mbc_remain == NULL || tree == NULL, 0))
54e1cabc
UD
2176 {
2177 *err = REG_ESPACE;
2178 return NULL;
2179 }
15a7d175 2180 }
3b0bdc72
UD
2181 }
2182#endif
2183 break;
2184 case OP_OPEN_SUBEXP:
2185 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 2186 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2187 return NULL;
3b0bdc72
UD
2188 break;
2189 case OP_OPEN_BRACKET:
2190 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 2191 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2192 return NULL;
3b0bdc72
UD
2193 break;
2194 case OP_BACK_REF:
bc15410e 2195 if (BE (preg->re_nsub < token->opr.idx
15a7d175
UD
2196 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2197 {
2198 *err = REG_ESUBREG;
2199 return NULL;
2200 }
6291ee3c 2201 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
c5c9c929 2202 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2203 if (BE (tree == NULL, 0))
15a7d175
UD
2204 {
2205 *err = REG_ESPACE;
2206 return NULL;
2207 }
3b0bdc72
UD
2208 ++dfa->nbackref;
2209 dfa->has_mb_node = 1;
2210 break;
06e8303a
UD
2211 case OP_OPEN_DUP_NUM:
2212 if (syntax & RE_CONTEXT_INVALID_DUP)
2213 {
2214 *err = REG_BADRPT;
2215 return NULL;
2216 }
2217 /* FALLTHROUGH */
3b0bdc72
UD
2218 case OP_DUP_ASTERISK:
2219 case OP_DUP_PLUS:
2220 case OP_DUP_QUESTION:
3b0bdc72 2221 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2222 {
2223 *err = REG_BADRPT;
2224 return NULL;
2225 }
3b0bdc72 2226 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175 2227 {
f0d77aa8 2228 fetch_token (token, regexp, syntax);
15a7d175
UD
2229 return parse_expression (regexp, preg, token, syntax, nest, err);
2230 }
3b0bdc72
UD
2231 /* else fall through */
2232 case OP_CLOSE_SUBEXP:
2233 if ((token->type == OP_CLOSE_SUBEXP) &&
15a7d175
UD
2234 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2235 {
2236 *err = REG_ERPAREN;
2237 return NULL;
2238 }
3b0bdc72
UD
2239 /* else fall through */
2240 case OP_CLOSE_DUP_NUM:
2241 /* We treat it as a normal character. */
2242
2243 /* Then we can these characters as normal characters. */
2244 token->type = CHARACTER;
65e6becf
UD
2245 /* mb_partial and word_char bits should be initialized already
2246 by peek_token. */
c5c9c929 2247 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2248 if (BE (tree == NULL, 0))
15a7d175
UD
2249 {
2250 *err = REG_ESPACE;
2251 return NULL;
2252 }
3b0bdc72
UD
2253 break;
2254 case ANCHOR:
bb3f4825
UD
2255 if ((token->opr.ctx_type
2256 & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
56b168be
UD
2257 && dfa->word_ops_used == 0)
2258 init_word_char (dfa);
3b0bdc72 2259 if (token->opr.ctx_type == WORD_DELIM)
15a7d175
UD
2260 {
2261 bin_tree_t *tree_first, *tree_last;
15a7d175 2262 token->opr.ctx_type = WORD_FIRST;
c5c9c929 2263 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
15a7d175 2264 token->opr.ctx_type = WORD_LAST;
c5c9c929 2265 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
15a7d175 2266 token->type = OP_ALT;
c5c9c929 2267 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
ee70274a 2268 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
15a7d175
UD
2269 {
2270 *err = REG_ESPACE;
2271 return NULL;
2272 }
2273 }
3b0bdc72 2274 else
15a7d175 2275 {
c5c9c929 2276 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2277 if (BE (tree == NULL, 0))
54e1cabc
UD
2278 {
2279 *err = REG_ESPACE;
2280 return NULL;
2281 }
15a7d175 2282 }
3b0bdc72 2283 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2284 by repetition operators.
2285 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2286 it must not be "<ANCHOR(^)><REPEAT(*)>". */
f0d77aa8 2287 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2288 return tree;
2289 case OP_PERIOD:
c5c9c929 2290 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2291 if (BE (tree == NULL, 0))
15a7d175
UD
2292 {
2293 *err = REG_ESPACE;
2294 return NULL;
2295 }
3c0fb574 2296 if (dfa->mb_cur_max > 1)
15a7d175 2297 dfa->has_mb_node = 1;
3b0bdc72
UD
2298 break;
2299 case OP_WORD:
3b0bdc72 2300 case OP_NOTWORD:
266c1f50
UD
2301 tree = build_charclass_op (dfa, regexp->trans,
2302 (const unsigned char *) "alnum",
2303 (const unsigned char *) "_",
2304 token->type == OP_NOTWORD, err);
e2b6bfa3
UD
2305 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2306 return NULL;
2307 break;
2308 case OP_SPACE:
e2b6bfa3 2309 case OP_NOTSPACE:
266c1f50
UD
2310 tree = build_charclass_op (dfa, regexp->trans,
2311 (const unsigned char *) "space",
2312 (const unsigned char *) "",
2313 token->type == OP_NOTSPACE, err);
bc15410e 2314 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2315 return NULL;
3b0bdc72
UD
2316 break;
2317 case OP_ALT:
2318 case END_OF_RE:
2319 return NULL;
2320 case BACK_SLASH:
2321 *err = REG_EESCAPE;
2322 return NULL;
2323 default:
2324 /* Must not happen? */
2325#ifdef DEBUG
2326 assert (0);
2327#endif
2328 return NULL;
2329 }
f0d77aa8 2330 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2331
2332 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2333 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72
UD
2334 {
2335 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
bc15410e 2336 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2337 return NULL;
c34bfc8d
UD
2338 /* In BRE consecutive duplications are not allowed. */
2339 if ((syntax & RE_CONTEXT_INVALID_DUP)
2340 && (token->type == OP_DUP_ASTERISK
2341 || token->type == OP_OPEN_DUP_NUM))
2342 {
2343 *err = REG_BADRPT;
2344 return NULL;
2345 }
0742e48e 2346 dfa->has_plural_match = 1;
3b0bdc72
UD
2347 }
2348
2349 return tree;
2350}
2351
2352/* This function build the following tree, from regular expression
2353 (<reg_exp>):
15a7d175
UD
2354 SUBEXP
2355 |
2356 <reg_exp>
3b0bdc72
UD
2357*/
2358
2359static bin_tree_t *
2360parse_sub_exp (regexp, preg, token, syntax, nest, err)
2361 re_string_t *regexp;
2362 regex_t *preg;
2363 re_token_t *token;
2364 reg_syntax_t syntax;
2365 int nest;
2366 reg_errcode_t *err;
2367{
2368 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
81c64d40 2369 bin_tree_t *tree, *left_par, *right_par;
3b0bdc72
UD
2370 size_t cur_nsub;
2371 cur_nsub = preg->re_nsub++;
951d6408 2372 if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
3b0bdc72
UD
2373 {
2374 re_subexp_t *new_array;
2375 dfa->subexps_alloc *= 2;
2376 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
bc15410e 2377 if (BE (new_array == NULL, 0))
3b0bdc72
UD
2378 {
2379 dfa->subexps_alloc /= 2;
2380 *err = REG_ESPACE;
2381 return NULL;
2382 }
2383 dfa->subexps = new_array;
2384 }
2385 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2386 dfa->subexps[cur_nsub].end = -1;
81c64d40 2387
c5c9c929 2388 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2389 if (BE (left_par == NULL, 0))
485d775d
UD
2390 {
2391 *err = REG_ESPACE;
2392 return NULL;
2393 }
ee70274a 2394 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
f0d77aa8 2395 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2396
2397 /* The subexpression may be a null string. */
2398 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2399 tree = NULL;
3b0bdc72
UD
2400 else
2401 {
2402 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
bc15410e 2403 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2404 return NULL;
3b0bdc72 2405 }
81c64d40
UD
2406 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2407 {
06e8303a 2408 *err = REG_EPAREN;
81c64d40
UD
2409 return NULL;
2410 }
c5c9c929 2411 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
81c64d40 2412 dfa->subexps[cur_nsub].end = dfa->nodes_len;
81c64d40 2413 tree = ((tree == NULL) ? right_par
ee70274a
UD
2414 : create_tree (dfa, tree, right_par, CONCAT, 0));
2415 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2416 if (BE (right_par == NULL || tree == NULL, 0))
485d775d
UD
2417 {
2418 *err = REG_ESPACE;
2419 return NULL;
2420 }
ee70274a 2421 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
81c64d40 2422
3b0bdc72
UD
2423 return tree;
2424}
2425
2426/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2427
2428static bin_tree_t *
6b6557e8
UD
2429parse_dup_op (elem, regexp, dfa, token, syntax, err)
2430 bin_tree_t *elem;
3b0bdc72
UD
2431 re_string_t *regexp;
2432 re_dfa_t *dfa;
2433 re_token_t *token;
2434 reg_syntax_t syntax;
2435 reg_errcode_t *err;
2436{
2437 re_token_t dup_token;
6b6557e8
UD
2438 bin_tree_t *tree = NULL;
2439 int i, start, end, start_idx = re_string_cur_idx (regexp);
3b0bdc72 2440 re_token_t start_token = *token;
6b6557e8 2441
3b0bdc72
UD
2442 if (token->type == OP_OPEN_DUP_NUM)
2443 {
6b6557e8
UD
2444 end = 0;
2445 start = fetch_number (regexp, token, syntax);
3b0bdc72 2446 if (start == -1)
15a7d175
UD
2447 {
2448 if (token->type == CHARACTER && token->opr.c == ',')
2449 start = 0; /* We treat "{,m}" as "{0,m}". */
2450 else
2451 {
2452 *err = REG_BADBR; /* <re>{} is invalid. */
2453 return NULL;
2454 }
2455 }
602c2f9d 2456 if (BE (start != -2, 1))
15a7d175
UD
2457 {
2458 /* We treat "{n}" as "{n,n}". */
2459 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2460 : ((token->type == CHARACTER && token->opr.c == ',')
2461 ? fetch_number (regexp, token, syntax) : -2));
2462 }
602c2f9d 2463 if (BE (start == -2 || end == -2, 0))
15a7d175
UD
2464 {
2465 /* Invalid sequence. */
6b6557e8
UD
2466 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2467 {
2468 if (token->type == END_OF_RE)
2469 *err = REG_EBRACE;
2470 else
2471 *err = REG_BADBR;
2472
2473 return NULL;
2474 }
2475
2476 /* If the syntax bit is set, rollback. */
2477 re_string_set_index (regexp, start_idx);
2478 *token = start_token;
2479 token->type = CHARACTER;
2480 /* mb_partial and word_char bits should be already initialized by
2481 peek_token. */
2482 return elem;
15a7d175 2483 }
6b6557e8
UD
2484
2485 if (BE (end != -1 && start > end, 0))
15a7d175 2486 {
6b6557e8
UD
2487 /* First number greater than second. */
2488 *err = REG_BADBR;
15a7d175
UD
2489 return NULL;
2490 }
6b6557e8
UD
2491 }
2492 else
2493 {
2494 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2495 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2496 }
2497
2498 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2499 if (BE (elem == NULL, 0))
2500 start = end = 0;
602c2f9d 2501
6b6557e8
UD
2502 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2503 else if (BE (start > 0, 0))
2504 {
2505 tree = elem;
2506 for (i = 2; i <= start; ++i)
7de66108 2507 {
6b6557e8
UD
2508 elem = duplicate_tree (elem, dfa);
2509 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2510 if (BE (elem == NULL || tree == NULL, 0))
7de66108
UD
2511 goto parse_dup_op_espace;
2512 }
6b6557e8 2513 }
3b0bdc72 2514
6b6557e8
UD
2515 if (BE (end != start, 1))
2516 {
2517 dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2518 if (BE (start > 0, 0))
deeb4aae 2519 {
6b6557e8
UD
2520 elem = duplicate_tree (elem, dfa);
2521 if (BE (elem == NULL, 0))
2522 goto parse_dup_op_espace;
2523
2524 /* This subexpression will be marked as optional, so that
2525 empty matches do not touch the registers. */
2526 mark_opt_subexp (elem, dfa);
2527
2528 /* Prepare the tree with the modifier. */
2529 elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2530 tree = create_tree (dfa, tree, elem, CONCAT, 0);
deeb4aae 2531 }
6b6557e8 2532 else
15a7d175 2533 {
6b6557e8
UD
2534 /* We do not need to duplicate the tree because we have not
2535 created it yet. */
2536 mark_opt_subexp (elem, dfa);
2537 tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2538 }
a96c63ed 2539
6b6557e8
UD
2540 if (BE (elem == NULL || tree == NULL, 0))
2541 goto parse_dup_op_espace;
2542
2543 /* This loop is actually executed only when end != -1,
2544 to rewrite <re>{0,n} as <re>?<re>?<re>?... We have
2545 already created the start+1-th copy. */
2546 for (i = start + 2; i <= end; ++i)
2547 {
2548 elem = duplicate_tree (elem, dfa);
2549 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2550 if (BE (elem == NULL || tree == NULL, 0))
15a7d175 2551 {
6b6557e8
UD
2552 *err = REG_ESPACE;
2553 return NULL;
15a7d175 2554 }
6b6557e8 2555 }
3b0bdc72 2556 }
6b6557e8 2557
f0d77aa8 2558 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2559 return tree;
2560
2561 parse_dup_op_espace:
3b0bdc72
UD
2562 *err = REG_ESPACE;
2563 return NULL;
3b0bdc72
UD
2564}
2565
2566/* Size of the names for collating symbol/equivalence_class/character_class.
2567 I'm not sure, but maybe enough. */
2568#define BRACKET_NAME_BUF_SIZE 32
2569
434d3784
UD
2570#ifndef _LIBC
2571 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2572 Build the range expression which starts from START_ELEM, and ends
2573 at END_ELEM. The result are written to MBCSET and SBCSET.
2574 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2575 mbcset->range_ends, is a pointer argument sinse we may
2576 update it. */
2577
2578static reg_errcode_t
c0a0f9a3
UD
2579# ifdef RE_ENABLE_I18N
2580build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
434d3784 2581 re_charset_t *mbcset;
434d3784 2582 int *range_alloc;
c0a0f9a3
UD
2583# else /* not RE_ENABLE_I18N */
2584build_range_exp (sbcset, start_elem, end_elem)
2585# endif /* not RE_ENABLE_I18N */
2586 re_bitset_ptr_t sbcset;
434d3784
UD
2587 bracket_elem_t *start_elem, *end_elem;
2588{
2589 unsigned int start_ch, end_ch;
2590 /* Equivalence Classes and Character Classes can't be a range start/end. */
2591 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2592 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2593 0))
434d3784
UD
2594 return REG_ERANGE;
2595
2596 /* We can handle no multi character collating elements without libc
2597 support. */
62439eac 2598 if (BE ((start_elem->type == COLL_SYM
15a7d175
UD
2599 && strlen ((char *) start_elem->opr.name) > 1)
2600 || (end_elem->type == COLL_SYM
2601 && strlen ((char *) end_elem->opr.name) > 1), 0))
434d3784
UD
2602 return REG_ECOLLATE;
2603
2604# ifdef RE_ENABLE_I18N
2605 {
2606 wchar_t wc, start_wc, end_wc;
2607 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2608
2609 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2610 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2611 : 0));
434d3784 2612 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2613 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2614 : 0));
434d3784 2615 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
15a7d175 2616 ? __btowc (start_ch) : start_elem->opr.wch);
434d3784 2617 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
15a7d175 2618 ? __btowc (end_ch) : end_elem->opr.wch);
4bb333cd
UD
2619 if (start_wc == WEOF || end_wc == WEOF)
2620 return REG_ECOLLATE;
434d3784
UD
2621 cmp_buf[0] = start_wc;
2622 cmp_buf[4] = end_wc;
2623 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2624 return REG_ERANGE;
2625
10677727
UD
2626 /* Got valid collation sequence values, add them as a new entry.
2627 However, for !_LIBC we have no collation elements: if the
2628 character set is single byte, the single byte character set
2629 that we build below suffices. parse_bracket_exp passes
2630 no MBCSET if dfa->mb_cur_max == 1. */
2631 if (mbcset)
434d3784 2632 {
10677727
UD
2633 /* Check the space of the arrays. */
2634 if (BE (*range_alloc == mbcset->nranges, 0))
2635 {
2636 /* There is not enough space, need realloc. */
2637 wchar_t *new_array_start, *new_array_end;
2638 int new_nranges;
2639
2640 /* +1 in case of mbcset->nranges is 0. */
2641 new_nranges = 2 * mbcset->nranges + 1;
2642 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2643 are NULL if *range_alloc == 0. */
2644 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2645 new_nranges);
2646 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2647 new_nranges);
2648
2649 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2650 return REG_ESPACE;
2651
2652 mbcset->range_starts = new_array_start;
2653 mbcset->range_ends = new_array_end;
2654 *range_alloc = new_nranges;
2655 }
2656
2657 mbcset->range_starts[mbcset->nranges] = start_wc;
2658 mbcset->range_ends[mbcset->nranges++] = end_wc;
434d3784
UD
2659 }
2660
434d3784 2661 /* Build the table for single byte characters. */
1e7947dc 2662 for (wc = 0; wc < SBC_MAX; ++wc)
434d3784 2663 {
15a7d175
UD
2664 cmp_buf[2] = wc;
2665 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2666 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2667 bitset_set (sbcset, wc);
434d3784
UD
2668 }
2669 }
2670# else /* not RE_ENABLE_I18N */
2671 {
2672 unsigned int ch;
2673 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2674 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2675 : 0));
434d3784 2676 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2677 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2678 : 0));
434d3784
UD
2679 if (start_ch > end_ch)
2680 return REG_ERANGE;
2681 /* Build the table for single byte characters. */
1e7947dc 2682 for (ch = 0; ch < SBC_MAX; ++ch)
434d3784 2683 if (start_ch <= ch && ch <= end_ch)
15a7d175 2684 bitset_set (sbcset, ch);
434d3784
UD
2685 }
2686# endif /* not RE_ENABLE_I18N */
2687 return REG_NOERROR;
2688}
2689#endif /* not _LIBC */
2690
2691#ifndef _LIBC
2692/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2693 Build the collating element which is represented by NAME.
2694 The result are written to MBCSET and SBCSET.
2695 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2696 pointer argument since we may update it. */
2697
2698static reg_errcode_t
c0a0f9a3
UD
2699# ifdef RE_ENABLE_I18N
2700build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
434d3784 2701 re_charset_t *mbcset;
434d3784 2702 int *coll_sym_alloc;
c0a0f9a3
UD
2703# else /* not RE_ENABLE_I18N */
2704build_collating_symbol (sbcset, name)
2705# endif /* not RE_ENABLE_I18N */
2706 re_bitset_ptr_t sbcset;
62439eac 2707 const unsigned char *name;
434d3784 2708{
62439eac
UD
2709 size_t name_len = strlen ((const char *) name);
2710 if (BE (name_len != 1, 0))
434d3784
UD
2711 return REG_ECOLLATE;
2712 else
2713 {
62439eac 2714 bitset_set (sbcset, name[0]);
434d3784
UD
2715 return REG_NOERROR;
2716 }
2717}
2718#endif /* not _LIBC */
2719
3b0bdc72
UD
2720/* This function parse bracket expression like "[abc]", "[a-c]",
2721 "[[.a-a.]]" etc. */
2722
2723static bin_tree_t *
2724parse_bracket_exp (regexp, dfa, token, syntax, err)
2725 re_string_t *regexp;
2726 re_dfa_t *dfa;
2727 re_token_t *token;
2728 reg_syntax_t syntax;
2729 reg_errcode_t *err;
2730{
2731#ifdef _LIBC
62439eac
UD
2732 const unsigned char *collseqmb;
2733 const char *collseqwc;
3b0bdc72
UD
2734 uint32_t nrules;
2735 int32_t table_size;
2736 const int32_t *symb_table;
2737 const unsigned char *extra;
2738
434d3784 2739 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2740 Seek the collating symbol entry correspondings to NAME.
2741 Return the index of the symbol in the SYMB_TABLE. */
2742
7090d3ca 2743 auto inline int32_t
25337753 2744 __attribute ((always_inline))
3b0bdc72 2745 seek_collating_symbol_entry (name, name_len)
15a7d175
UD
2746 const unsigned char *name;
2747 size_t name_len;
3b0bdc72 2748 {
62439eac 2749 int32_t hash = elem_hash ((const char *) name, name_len);
3b0bdc72
UD
2750 int32_t elem = hash % table_size;
2751 int32_t second = hash % (table_size - 2);
2752 while (symb_table[2 * elem] != 0)
15a7d175
UD
2753 {
2754 /* First compare the hashing value. */
2755 if (symb_table[2 * elem] == hash
2756 /* Compare the length of the name. */
2757 && name_len == extra[symb_table[2 * elem + 1]]
2758 /* Compare the name. */
2759 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2760 name_len) == 0)
2761 {
2762 /* Yep, this is the entry. */
2763 break;
2764 }
2765
2766 /* Next entry. */
2767 elem += second;
2768 }
3b0bdc72
UD
2769 return elem;
2770 }
2771
434d3784 2772 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2773 Look up the collation sequence value of BR_ELEM.
2774 Return the value if succeeded, UINT_MAX otherwise. */
2775
7090d3ca 2776 auto inline unsigned int
25337753 2777 __attribute ((always_inline))
3b0bdc72 2778 lookup_collation_sequence_value (br_elem)
15a7d175 2779 bracket_elem_t *br_elem;
3b0bdc72
UD
2780 {
2781 if (br_elem->type == SB_CHAR)
15a7d175
UD
2782 {
2783 /*
2784 if (MB_CUR_MAX == 1)
2785 */
2786 if (nrules == 0)
2787 return collseqmb[br_elem->opr.ch];
2788 else
2789 {
2790 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2791 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2792 }
2793 }
3b0bdc72 2794 else if (br_elem->type == MB_CHAR)
15a7d175 2795 {
25337753 2796 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2797 }
3b0bdc72 2798 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2799 {
2800 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2801 if (nrules != 0)
2802 {
2803 int32_t elem, idx;
2804 elem = seek_collating_symbol_entry (br_elem->opr.name,
2805 sym_name_len);
2806 if (symb_table[2 * elem] != 0)
2807 {
2808 /* We found the entry. */
2809 idx = symb_table[2 * elem + 1];
2810 /* Skip the name of collating element name. */
2811 idx += 1 + extra[idx];
2812 /* Skip the byte sequence of the collating element. */
2813 idx += 1 + extra[idx];
2814 /* Adjust for the alignment. */
2815 idx = (idx + 3) & ~3;
2816 /* Skip the multibyte collation sequence value. */
2817 idx += sizeof (unsigned int);
2818 /* Skip the wide char sequence of the collating element. */
2819 idx += sizeof (unsigned int) *
2820 (1 + *(unsigned int *) (extra + idx));
2821 /* Return the collation sequence value. */
2822 return *(unsigned int *) (extra + idx);
2823 }
2824 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2825 {
2826 /* No valid character. Match it as a single byte
2827 character. */
2828 return collseqmb[br_elem->opr.name[0]];
2829 }
2830 }
2831 else if (sym_name_len == 1)
2832 return collseqmb[br_elem->opr.name[0]];
2833 }
3b0bdc72
UD
2834 return UINT_MAX;
2835 }
2836
434d3784 2837 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2838 Build the range expression which starts from START_ELEM, and ends
2839 at END_ELEM. The result are written to MBCSET and SBCSET.
2840 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2841 mbcset->range_ends, is a pointer argument sinse we may
2842 update it. */
2843
7090d3ca 2844 auto inline reg_errcode_t
25337753 2845 __attribute ((always_inline))
c0a0f9a3 2846 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
15a7d175
UD
2847 re_charset_t *mbcset;
2848 int *range_alloc;
15a7d175
UD
2849 re_bitset_ptr_t sbcset;
2850 bracket_elem_t *start_elem, *end_elem;
3b0bdc72
UD
2851 {
2852 unsigned int ch;
2853 uint32_t start_collseq;
2854 uint32_t end_collseq;
2855
434d3784 2856 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2857 start/end. */
bc15410e 2858 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2859 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2860 0))
2861 return REG_ERANGE;
3b0bdc72
UD
2862
2863 start_collseq = lookup_collation_sequence_value (start_elem);
2864 end_collseq = lookup_collation_sequence_value (end_elem);
2865 /* Check start/end collation sequence values. */
bc15410e 2866 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
15a7d175 2867 return REG_ECOLLATE;
bc15410e 2868 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
15a7d175 2869 return REG_ERANGE;
3b0bdc72 2870
10677727
UD
2871 /* Got valid collation sequence values, add them as a new entry.
2872 However, if we have no collation elements, and the character set
2873 is single byte, the single byte character set that we
2874 build below suffices. */
2875 if (nrules > 0 || dfa->mb_cur_max > 1)
2876 {
2877 /* Check the space of the arrays. */
2878 if (BE (*range_alloc == mbcset->nranges, 0))
2879 {
2880 /* There is not enough space, need realloc. */
2881 uint32_t *new_array_start;
2882 uint32_t *new_array_end;
2883 int new_nranges;
2884
2885 /* +1 in case of mbcset->nranges is 0. */
2886 new_nranges = 2 * mbcset->nranges + 1;
2887 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2888 new_nranges);
2889 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2890 new_nranges);
2891
2892 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2893 return REG_ESPACE;
2894
2895 mbcset->range_starts = new_array_start;
2896 mbcset->range_ends = new_array_end;
2897 *range_alloc = new_nranges;
2898 }
2899
2900 mbcset->range_starts[mbcset->nranges] = start_collseq;
2901 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2902 }
3b0bdc72
UD
2903
2904 /* Build the table for single byte characters. */
1e7947dc 2905 for (ch = 0; ch < SBC_MAX; ch++)
15a7d175
UD
2906 {
2907 uint32_t ch_collseq;
2908 /*
2909 if (MB_CUR_MAX == 1)
2910 */
2911 if (nrules == 0)
2912 ch_collseq = collseqmb[ch];
2913 else
25337753 2914 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
2915 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2916 bitset_set (sbcset, ch);
2917 }
3b0bdc72
UD
2918 return REG_NOERROR;
2919 }
3b0bdc72 2920
434d3784 2921 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2922 Build the collating element which is represented by NAME.
2923 The result are written to MBCSET and SBCSET.
2924 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2925 pointer argument sinse we may update it. */
2926
7090d3ca 2927 auto inline reg_errcode_t
25337753 2928 __attribute ((always_inline))
c0a0f9a3 2929 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
15a7d175
UD
2930 re_charset_t *mbcset;
2931 int *coll_sym_alloc;
15a7d175
UD
2932 re_bitset_ptr_t sbcset;
2933 const unsigned char *name;
3b0bdc72 2934 {
3b0bdc72 2935 int32_t elem, idx;
62439eac 2936 size_t name_len = strlen ((const char *) name);
3b0bdc72 2937 if (nrules != 0)
15a7d175
UD
2938 {
2939 elem = seek_collating_symbol_entry (name, name_len);
2940 if (symb_table[2 * elem] != 0)
2941 {
2942 /* We found the entry. */
2943 idx = symb_table[2 * elem + 1];
2944 /* Skip the name of collating element name. */
2945 idx += 1 + extra[idx];
2946 }
2947 else if (symb_table[2 * elem] == 0 && name_len == 1)
2948 {
2949 /* No valid character, treat it as a normal
2950 character. */
2951 bitset_set (sbcset, name[0]);
2952 return REG_NOERROR;
2953 }
2954 else
2955 return REG_ECOLLATE;
3b0bdc72 2956
15a7d175
UD
2957 /* Got valid collation sequence, add it as a new entry. */
2958 /* Check the space of the arrays. */
951d6408 2959 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
15a7d175
UD
2960 {
2961 /* Not enough, realloc it. */
2962 /* +1 in case of mbcset->ncoll_syms is 0. */
951d6408 2963 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
15a7d175
UD
2964 /* Use realloc since mbcset->coll_syms is NULL
2965 if *alloc == 0. */
951d6408
UD
2966 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2967 new_coll_sym_alloc);
2968 if (BE (new_coll_syms == NULL, 0))
15a7d175 2969 return REG_ESPACE;
951d6408
UD
2970 mbcset->coll_syms = new_coll_syms;
2971 *coll_sym_alloc = new_coll_sym_alloc;
15a7d175
UD
2972 }
2973 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
15a7d175
UD
2974 return REG_NOERROR;
2975 }
3b0bdc72 2976 else
15a7d175
UD
2977 {
2978 if (BE (name_len != 1, 0))
2979 return REG_ECOLLATE;
2980 else
2981 {
2982 bitset_set (sbcset, name[0]);
2983 return REG_NOERROR;
2984 }
2985 }
3b0bdc72 2986 }
434d3784
UD
2987#endif
2988
3b0bdc72
UD
2989 re_token_t br_token;
2990 re_bitset_ptr_t sbcset;
c0a0f9a3 2991#ifdef RE_ENABLE_I18N
3b0bdc72 2992 re_charset_t *mbcset;
3b0bdc72
UD
2993 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2994 int equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3 2995#endif /* not RE_ENABLE_I18N */
10677727 2996 int non_match = 0;
c0a0f9a3 2997 bin_tree_t *work_tree;
ee70274a 2998 int token_len;
8cae99db 2999 int first_round = 1;
3b0bdc72 3000#ifdef _LIBC
62439eac
UD
3001 collseqmb = (const unsigned char *)
3002 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
3003 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3004 if (nrules)
3005 {
3006 /*
3007 if (MB_CUR_MAX > 1)
3008 */
15a7d175 3009 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
3010 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3011 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3012 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 3013 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3014 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
3015 }
3016#endif
3017 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 3018#ifdef RE_ENABLE_I18N
3b0bdc72 3019 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3020#endif /* RE_ENABLE_I18N */
3021#ifdef RE_ENABLE_I18N
bc15410e 3022 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3023#else
3024 if (BE (sbcset == NULL, 0))
3025#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3026 {
3027 *err = REG_ESPACE;
3028 return NULL;
3029 }
3030
3031 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3032 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 3033 {
3b0bdc72 3034 *err = REG_BADPAT;
434d3784 3035 goto parse_bracket_exp_free_return;
3b0bdc72
UD
3036 }
3037 if (token->type == OP_NON_MATCH_LIST)
3038 {
c0a0f9a3 3039#ifdef RE_ENABLE_I18N
3b0bdc72 3040 mbcset->non_match = 1;
c0a0f9a3 3041#endif /* not RE_ENABLE_I18N */
10677727 3042 non_match = 1;
3b0bdc72 3043 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 3044 bitset_set (sbcset, '\0');
3b0bdc72
UD
3045 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3046 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3047 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
3048 {
3049 *err = REG_BADPAT;
3050 goto parse_bracket_exp_free_return;
3051 }
3b0bdc72
UD
3052 }
3053
3054 /* We treat the first ']' as a normal character. */
3055 if (token->type == OP_CLOSE_BRACKET)
3056 token->type = CHARACTER;
3057
3058 while (1)
3059 {
3060 bracket_elem_t start_elem, end_elem;
62439eac
UD
3061 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3062 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72
UD
3063 reg_errcode_t ret;
3064 int token_len2 = 0, is_range_exp = 0;
3065 re_token_t token2;
3066
3067 start_elem.opr.name = start_name_buf;
3068 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
78d8b07a 3069 syntax, first_round);
bc15410e 3070 if (BE (ret != REG_NOERROR, 0))
15a7d175
UD
3071 {
3072 *err = ret;
3073 goto parse_bracket_exp_free_return;
3074 }
78d8b07a 3075 first_round = 0;
3b0bdc72 3076
78d8b07a 3077 /* Get information about the next token. We need it in any case. */
3b0bdc72 3078 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a
UD
3079
3080 /* Do not check for ranges if we know they are not allowed. */
3081 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
15a7d175 3082 {
15a7d175
UD
3083 if (BE (token->type == END_OF_RE, 0))
3084 {
78d8b07a 3085 *err = REG_EBRACK;
15a7d175
UD
3086 goto parse_bracket_exp_free_return;
3087 }
78d8b07a 3088 if (token->type == OP_CHARSET_RANGE)
15a7d175 3089 {
78d8b07a
UD
3090 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3091 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3092 if (BE (token2.type == END_OF_RE, 0))
3093 {
3094 *err = REG_EBRACK;
3095 goto parse_bracket_exp_free_return;
3096 }
3097 if (token2.type == OP_CLOSE_BRACKET)
3098 {
3099 /* We treat the last '-' as a normal character. */
3100 re_string_skip_bytes (regexp, -token_len);
3101 token->type = CHARACTER;
3102 }
3103 else
3104 is_range_exp = 1;
15a7d175 3105 }
15a7d175 3106 }
3b0bdc72
UD
3107
3108 if (is_range_exp == 1)
15a7d175
UD
3109 {
3110 end_elem.opr.name = end_name_buf;
3111 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
78d8b07a 3112 dfa, syntax, 1);
15a7d175
UD
3113 if (BE (ret != REG_NOERROR, 0))
3114 {
3115 *err = ret;
3116 goto parse_bracket_exp_free_return;
3117 }
3118
3119 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a 3120
10677727
UD
3121#ifdef _LIBC
3122 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3123 &start_elem, &end_elem);
3124#else
3125# ifdef RE_ENABLE_I18N
15a7d175 3126 *err = build_range_exp (sbcset,
10677727
UD
3127 dfa->mb_cur_max > 1 ? mbcset : NULL,
3128 &range_alloc, &start_elem, &end_elem);
3129# else
3130 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3131# endif
c0a0f9a3 3132#endif /* RE_ENABLE_I18N */
15a7d175
UD
3133 if (BE (*err != REG_NOERROR, 0))
3134 goto parse_bracket_exp_free_return;
3135 }
3b0bdc72 3136 else
15a7d175
UD
3137 {
3138 switch (start_elem.type)
3139 {
3140 case SB_CHAR:
3141 bitset_set (sbcset, start_elem.opr.ch);
3142 break;
c0a0f9a3 3143#ifdef RE_ENABLE_I18N
15a7d175
UD
3144 case MB_CHAR:
3145 /* Check whether the array has enough space. */
951d6408 3146 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
15a7d175 3147 {
951d6408 3148 wchar_t *new_mbchars;
15a7d175
UD
3149 /* Not enough, realloc it. */
3150 /* +1 in case of mbcset->nmbchars is 0. */
3151 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3152 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3153 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3154 mbchar_alloc);
3155 if (BE (new_mbchars == NULL, 0))
15a7d175 3156 goto parse_bracket_exp_espace;
951d6408 3157 mbcset->mbchars = new_mbchars;
15a7d175
UD
3158 }
3159 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3160 break;
c0a0f9a3 3161#endif /* RE_ENABLE_I18N */
15a7d175
UD
3162 case EQUIV_CLASS:
3163 *err = build_equiv_class (sbcset,
c0a0f9a3 3164#ifdef RE_ENABLE_I18N
15a7d175 3165 mbcset, &equiv_class_alloc,
c0a0f9a3 3166#endif /* RE_ENABLE_I18N */
3b0bdc72 3167 start_elem.opr.name);
15a7d175
UD
3168 if (BE (*err != REG_NOERROR, 0))
3169 goto parse_bracket_exp_free_return;
3170 break;
3171 case COLL_SYM:
3172 *err = build_collating_symbol (sbcset,
c0a0f9a3 3173#ifdef RE_ENABLE_I18N
15a7d175 3174 mbcset, &coll_sym_alloc,
c0a0f9a3 3175#endif /* RE_ENABLE_I18N */
3b0bdc72 3176 start_elem.opr.name);
15a7d175
UD
3177 if (BE (*err != REG_NOERROR, 0))
3178 goto parse_bracket_exp_free_return;
3179 break;
3180 case CHAR_CLASS:
66b110e8 3181 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3182#ifdef RE_ENABLE_I18N
7b7b9e70 3183 mbcset, &char_class_alloc,
c0a0f9a3 3184#endif /* RE_ENABLE_I18N */
7b7b9e70
UD
3185 start_elem.opr.name, syntax);
3186 if (BE (*err != REG_NOERROR, 0))
3187 goto parse_bracket_exp_free_return;
15a7d175
UD
3188 break;
3189 default:
3190 assert (0);
3191 break;
3192 }
3193 }
78d8b07a
UD
3194 if (BE (token->type == END_OF_RE, 0))
3195 {
3196 *err = REG_EBRACK;
3197 goto parse_bracket_exp_free_return;
3198 }
3b0bdc72 3199 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3200 break;
3b0bdc72
UD
3201 }
3202
3203 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3204
3205 /* If it is non-matching list. */
c0a0f9a3 3206 if (non_match)
3b0bdc72 3207 bitset_not (sbcset);
10677727 3208
65e6becf
UD
3209#ifdef RE_ENABLE_I18N
3210 /* Ensure only single byte characters are set. */
3211 if (dfa->mb_cur_max > 1)
3212 bitset_mask (sbcset, dfa->sb_char);
3213#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3214
3215 /* Build a tree for simple bracket. */
3216 br_token.type = SIMPLE_BRACKET;
3217 br_token.opr.sbcset = sbcset;
c5c9c929 3218 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3219 if (BE (work_tree == NULL, 0))
3b0bdc72
UD
3220 goto parse_bracket_exp_espace;
3221
c0a0f9a3 3222#ifdef RE_ENABLE_I18N
3b0bdc72 3223 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3224 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3225 || mbcset->non_match)))
3b0bdc72
UD
3226 {
3227 re_token_t alt_token;
3228 bin_tree_t *mbc_tree;
ad7f28c2 3229 int sbc_idx;
3b0bdc72 3230 /* Build a tree for complex bracket. */
ad7f28c2 3231 dfa->has_mb_node = 1;
ad7f28c2
UD
3232 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3233 if (sbcset[sbc_idx])
3234 break;
3235 /* If there are no bits set in sbcset, there is no point
3236 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3237 if (sbc_idx == BITSET_UINTS)
3238 {
3239 re_free (sbcset);
ee70274a
UD
3240 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3241 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
ad7f28c2
UD
3242 return work_tree;
3243 }
3b0bdc72
UD
3244 br_token.type = COMPLEX_BRACKET;
3245 br_token.opr.mbcset = mbcset;
c5c9c929 3246 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3247 if (BE (mbc_tree == NULL, 0))
15a7d175 3248 goto parse_bracket_exp_espace;
3b0bdc72
UD
3249 /* Then join them by ALT node. */
3250 alt_token.type = OP_ALT;
97fd3a30 3251 dfa->has_plural_match = 1;
c5c9c929 3252 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
ee70274a 3253 if (BE (mbc_tree != NULL, 1))
15a7d175 3254 return work_tree;
3b0bdc72
UD
3255 }
3256 else
3257 {
3258 free_charset (mbcset);
3259 return work_tree;
3260 }
c0a0f9a3
UD
3261#else /* not RE_ENABLE_I18N */
3262 return work_tree;
3263#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3264
3265 parse_bracket_exp_espace:
3b0bdc72 3266 *err = REG_ESPACE;
434d3784
UD
3267 parse_bracket_exp_free_return:
3268 re_free (sbcset);
c0a0f9a3 3269#ifdef RE_ENABLE_I18N
434d3784 3270 free_charset (mbcset);
c0a0f9a3 3271#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3272 return NULL;
3273}
3274
434d3784
UD
3275/* Parse an element in the bracket expression. */
3276
3b0bdc72 3277static reg_errcode_t
78d8b07a
UD
3278parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3279 accept_hyphen)
3b0bdc72
UD
3280 bracket_elem_t *elem;
3281 re_string_t *regexp;
3282 re_token_t *token;
3283 int token_len;
3284 re_dfa_t *dfa;
3285 reg_syntax_t syntax;
78d8b07a 3286 int accept_hyphen;
3b0bdc72
UD
3287{
3288#ifdef RE_ENABLE_I18N
3289 int cur_char_size;
3290 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3291 if (cur_char_size > 1)
3292 {
3293 elem->type = MB_CHAR;
3294 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3295 re_string_skip_bytes (regexp, cur_char_size);
3296 return REG_NOERROR;
3297 }
3298#endif /* RE_ENABLE_I18N */
3299 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3300 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3301 || token->type == OP_OPEN_EQUIV_CLASS)
3302 return parse_bracket_symbol (elem, regexp, token);
78d8b07a
UD
3303 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3304 {
3305 /* A '-' must only appear as anything but a range indicator before
3306 the closing bracket. Everything else is an error. */
3307 re_token_t token2;
3308 (void) peek_token_bracket (&token2, regexp, syntax);
3309 if (token2.type != OP_CLOSE_BRACKET)
3310 /* The actual error value is not standardized since this whole
3311 case is undefined. But ERANGE makes good sense. */
3312 return REG_ERANGE;
3313 }
3b0bdc72
UD
3314 elem->type = SB_CHAR;
3315 elem->opr.ch = token->opr.c;
3316 return REG_NOERROR;
3317}
3318
434d3784
UD
3319/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3320 such as [:<character_class>:], [.<collating_element>.], and
3321 [=<equivalent_class>=]. */
3322
3b0bdc72
UD
3323static reg_errcode_t
3324parse_bracket_symbol (elem, regexp, token)
3325 bracket_elem_t *elem;
3326 re_string_t *regexp;
3327 re_token_t *token;
3328{
3329 unsigned char ch, delim = token->opr.c;
3330 int i = 0;
294b6bcc
UD
3331 if (re_string_eoi(regexp))
3332 return REG_EBRACK;
602c2f9d 3333 for (;; ++i)
3b0bdc72 3334 {
294b6bcc 3335 if (i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3336 return REG_EBRACK;
3b0bdc72 3337 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3338 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3339 else
15a7d175 3340 ch = re_string_fetch_byte (regexp);
294b6bcc
UD
3341 if (re_string_eoi(regexp))
3342 return REG_EBRACK;
3b0bdc72 3343 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3344 break;
3b0bdc72
UD
3345 elem->opr.name[i] = ch;
3346 }
3347 re_string_skip_bytes (regexp, 1);
3348 elem->opr.name[i] = '\0';
3349 switch (token->type)
3350 {
3351 case OP_OPEN_COLL_ELEM:
3352 elem->type = COLL_SYM;
3353 break;
3354 case OP_OPEN_EQUIV_CLASS:
3355 elem->type = EQUIV_CLASS;
3356 break;
3357 case OP_OPEN_CHAR_CLASS:
3358 elem->type = CHAR_CLASS;
3359 break;
3360 default:
3361 break;
3362 }
3363 return REG_NOERROR;
3364}
3365
3366 /* Helper function for parse_bracket_exp.
3367 Build the equivalence class which is represented by NAME.
3368 The result are written to MBCSET and SBCSET.
3369 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3370 is a pointer argument sinse we may update it. */
3371
3372static reg_errcode_t
c0a0f9a3
UD
3373#ifdef RE_ENABLE_I18N
3374build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3b0bdc72 3375 re_charset_t *mbcset;
3b0bdc72 3376 int *equiv_class_alloc;
c0a0f9a3
UD
3377#else /* not RE_ENABLE_I18N */
3378build_equiv_class (sbcset, name)
3379#endif /* not RE_ENABLE_I18N */
3380 re_bitset_ptr_t sbcset;
62439eac 3381 const unsigned char *name;
3b0bdc72 3382{
10677727 3383#if defined _LIBC
3b0bdc72
UD
3384 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3385 if (nrules != 0)
3386 {
3387 const int32_t *table, *indirect;
3388 const unsigned char *weights, *extra, *cp;
62439eac 3389 unsigned char char_buf[2];
3b0bdc72
UD
3390 int32_t idx1, idx2;
3391 unsigned int ch;
3392 size_t len;
3393 /* This #include defines a local function! */
3394# include <locale/weight.h>
3395 /* Calculate the index for equivalence class. */
62439eac 3396 cp = name;
3b0bdc72
UD
3397 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3398 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3399 _NL_COLLATE_WEIGHTMB);
3400 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3401 _NL_COLLATE_EXTRAMB);
3b0bdc72 3402 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3403 _NL_COLLATE_INDIRECTMB);
3b0bdc72 3404 idx1 = findidx (&cp);
62439eac 3405 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
15a7d175
UD
3406 /* This isn't a valid character. */
3407 return REG_ECOLLATE;
3b0bdc72
UD
3408
3409 /* Build single byte matcing table for this equivalence class. */
62439eac 3410 char_buf[1] = (unsigned char) '\0';
3b0bdc72
UD
3411 len = weights[idx1];
3412 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3413 {
3414 char_buf[0] = ch;
3415 cp = char_buf;
3416 idx2 = findidx (&cp);
3b0bdc72 3417/*
15a7d175 3418 idx2 = table[ch];
3b0bdc72 3419*/
15a7d175
UD
3420 if (idx2 == 0)
3421 /* This isn't a valid character. */
3422 continue;
3423 if (len == weights[idx2])
3424 {
3425 int cnt = 0;
3426 while (cnt <= len &&
3427 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3428 ++cnt;
3429
3430 if (cnt > len)
3431 bitset_set (sbcset, ch);
3432 }
3433 }
a9388965 3434 /* Check whether the array has enough space. */
951d6408 3435 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
15a7d175
UD
3436 {
3437 /* Not enough, realloc it. */
3438 /* +1 in case of mbcset->nequiv_classes is 0. */
951d6408 3439 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
15a7d175 3440 /* Use realloc since the array is NULL if *alloc == 0. */
951d6408
UD
3441 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3442 int32_t,
3443 new_equiv_class_alloc);
3444 if (BE (new_equiv_classes == NULL, 0))
15a7d175 3445 return REG_ESPACE;
951d6408
UD
3446 mbcset->equiv_classes = new_equiv_classes;
3447 *equiv_class_alloc = new_equiv_class_alloc;
15a7d175 3448 }
3b0bdc72
UD
3449 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3450 }
3451 else
10677727 3452#endif /* _LIBC */
3b0bdc72 3453 {
62439eac 3454 if (BE (strlen ((const char *) name) != 1, 0))
15a7d175 3455 return REG_ECOLLATE;
62439eac 3456 bitset_set (sbcset, *name);
3b0bdc72
UD
3457 }
3458 return REG_NOERROR;
3459}
3460
3461 /* Helper function for parse_bracket_exp.
3462 Build the character class which is represented by NAME.
3463 The result are written to MBCSET and SBCSET.
3464 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3465 is a pointer argument sinse we may update it. */
3466
3467static reg_errcode_t
c0a0f9a3 3468#ifdef RE_ENABLE_I18N
66b110e8 3469build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3b0bdc72 3470 re_charset_t *mbcset;
3b0bdc72 3471 int *char_class_alloc;
c0a0f9a3 3472#else /* not RE_ENABLE_I18N */
66b110e8 3473build_charclass (trans, sbcset, class_name, syntax)
c0a0f9a3 3474#endif /* not RE_ENABLE_I18N */
86576b62 3475 unsigned RE_TRANSLATE_TYPE trans;
c0a0f9a3 3476 re_bitset_ptr_t sbcset;
62439eac 3477 const unsigned char *class_name;
602c2f9d 3478 reg_syntax_t syntax;
3b0bdc72
UD
3479{
3480 int i;
62439eac 3481 const char *name = (const char *) class_name;
c0a0f9a3
UD
3482
3483 /* In case of REG_ICASE "upper" and "lower" match the both of
3484 upper and lower cases. */
3485 if ((syntax & RE_ICASE)
3486 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3487 name = "alpha";
3488
3489#ifdef RE_ENABLE_I18N
3b0bdc72 3490 /* Check the space of the arrays. */
951d6408 3491 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
a9388965
UD
3492 {
3493 /* Not enough, realloc it. */
3494 /* +1 in case of mbcset->nchar_classes is 0. */
951d6408 3495 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
a9388965 3496 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3497 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3498 new_char_class_alloc);
3499 if (BE (new_char_classes == NULL, 0))
15a7d175 3500 return REG_ESPACE;
951d6408
UD
3501 mbcset->char_classes = new_char_classes;
3502 *char_class_alloc = new_char_class_alloc;
a9388965 3503 }
3b0bdc72 3504 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3505#endif /* RE_ENABLE_I18N */
3b0bdc72 3506
66b110e8
UD
3507#define BUILD_CHARCLASS_LOOP(ctype_func) \
3508 for (i = 0; i < SBC_MAX; ++i) \
3509 { \
3510 if (ctype_func (i)) \
3511 { \
3512 int ch = trans ? trans[i] : i; \
3513 bitset_set (sbcset, ch); \
3514 } \
3b0bdc72
UD
3515 }
3516
3517 if (strcmp (name, "alnum") == 0)
3518 BUILD_CHARCLASS_LOOP (isalnum)
3519 else if (strcmp (name, "cntrl") == 0)
3520 BUILD_CHARCLASS_LOOP (iscntrl)
3521 else if (strcmp (name, "lower") == 0)
3522 BUILD_CHARCLASS_LOOP (islower)
3523 else if (strcmp (name, "space") == 0)
3524 BUILD_CHARCLASS_LOOP (isspace)
3525 else if (strcmp (name, "alpha") == 0)
3526 BUILD_CHARCLASS_LOOP (isalpha)
3527 else if (strcmp (name, "digit") == 0)
3528 BUILD_CHARCLASS_LOOP (isdigit)
3529 else if (strcmp (name, "print") == 0)
3530 BUILD_CHARCLASS_LOOP (isprint)
3531 else if (strcmp (name, "upper") == 0)
3532 BUILD_CHARCLASS_LOOP (isupper)
3533 else if (strcmp (name, "blank") == 0)
3534 BUILD_CHARCLASS_LOOP (isblank)
3535 else if (strcmp (name, "graph") == 0)
3536 BUILD_CHARCLASS_LOOP (isgraph)
3537 else if (strcmp (name, "punct") == 0)
3538 BUILD_CHARCLASS_LOOP (ispunct)
3539 else if (strcmp (name, "xdigit") == 0)
3540 BUILD_CHARCLASS_LOOP (isxdigit)
3541 else
3542 return REG_ECTYPE;
3543
3544 return REG_NOERROR;
3545}
3546
3547static bin_tree_t *
10677727 3548build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3b0bdc72 3549 re_dfa_t *dfa;
86576b62 3550 unsigned RE_TRANSLATE_TYPE trans;
e2b6bfa3
UD
3551 const unsigned char *class_name;
3552 const unsigned char *extra;
10677727 3553 int non_match;
3b0bdc72
UD
3554 reg_errcode_t *err;
3555{
3556 re_bitset_ptr_t sbcset;
c0a0f9a3 3557#ifdef RE_ENABLE_I18N
3b0bdc72 3558 re_charset_t *mbcset;
c0a0f9a3 3559 int alloc = 0;
c0a0f9a3 3560#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3561 reg_errcode_t ret;
3562 re_token_t br_token;
3563 bin_tree_t *tree;
3b0bdc72
UD
3564
3565 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 3566#ifdef RE_ENABLE_I18N
3b0bdc72 3567 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3568#endif /* RE_ENABLE_I18N */
3569
3570#ifdef RE_ENABLE_I18N
bc15410e 3571 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3572#else /* not RE_ENABLE_I18N */
3573 if (BE (sbcset == NULL, 0))
3574#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3575 {
3576 *err = REG_ESPACE;
3577 return NULL;
3578 }
3579
10677727 3580 if (non_match)
3b0bdc72 3581 {
c0a0f9a3 3582#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3583 /*
3584 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 3585 bitset_set(cset->sbcset, '\0');
3b0bdc72 3586 */
c0a0f9a3 3587 mbcset->non_match = 1;
c0a0f9a3 3588#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3589 }
3590
602c2f9d 3591 /* We don't care the syntax in this case. */
66b110e8 3592 ret = build_charclass (trans, sbcset,
c0a0f9a3 3593#ifdef RE_ENABLE_I18N
15a7d175 3594 mbcset, &alloc,
c0a0f9a3 3595#endif /* RE_ENABLE_I18N */
e2b6bfa3 3596 class_name, 0);
c0a0f9a3 3597
bc15410e 3598 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3599 {
3600 re_free (sbcset);
c0a0f9a3 3601#ifdef RE_ENABLE_I18N
3b0bdc72 3602 free_charset (mbcset);
c0a0f9a3 3603#endif /* RE_ENABLE_I18N */
7b7b9e70 3604 *err = ret;
3b0bdc72
UD
3605 return NULL;
3606 }
434d3784 3607 /* \w match '_' also. */
e2b6bfa3
UD
3608 for (; *extra; extra++)
3609 bitset_set (sbcset, *extra);
3b0bdc72
UD
3610
3611 /* If it is non-matching list. */
c0a0f9a3 3612 if (non_match)
3b0bdc72
UD
3613 bitset_not (sbcset);
3614
65e6becf
UD
3615#ifdef RE_ENABLE_I18N
3616 /* Ensure only single byte characters are set. */
3617 if (dfa->mb_cur_max > 1)
3618 bitset_mask (sbcset, dfa->sb_char);
3619#endif
3620
3b0bdc72
UD
3621 /* Build a tree for simple bracket. */
3622 br_token.type = SIMPLE_BRACKET;
3623 br_token.opr.sbcset = sbcset;
c5c9c929 3624 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3625 if (BE (tree == NULL, 0))
3b0bdc72
UD
3626 goto build_word_op_espace;
3627
c0a0f9a3 3628#ifdef RE_ENABLE_I18N
3c0fb574 3629 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
3630 {
3631 re_token_t alt_token;
3632 bin_tree_t *mbc_tree;
3633 /* Build a tree for complex bracket. */
3634 br_token.type = COMPLEX_BRACKET;
3635 br_token.opr.mbcset = mbcset;
3636 dfa->has_mb_node = 1;
c5c9c929 3637 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3638 if (BE (mbc_tree == NULL, 0))
15a7d175 3639 goto build_word_op_espace;
3b0bdc72
UD
3640 /* Then join them by ALT node. */
3641 alt_token.type = OP_ALT;
97fd3a30 3642 dfa->has_plural_match = 1;
c5c9c929 3643 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
ee70274a 3644 if (BE (mbc_tree != NULL, 1))
15a7d175 3645 return tree;
3b0bdc72
UD
3646 }
3647 else
3648 {
3649 free_charset (mbcset);
3650 return tree;
3651 }
c0a0f9a3
UD
3652#else /* not RE_ENABLE_I18N */
3653 return tree;
3654#endif /* not RE_ENABLE_I18N */
3655
3b0bdc72
UD
3656 build_word_op_espace:
3657 re_free (sbcset);
c0a0f9a3 3658#ifdef RE_ENABLE_I18N
3b0bdc72 3659 free_charset (mbcset);
c0a0f9a3 3660#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3661 *err = REG_ESPACE;
3662 return NULL;
3663}
3664
3665/* This is intended for the expressions like "a{1,3}".
3666 Fetch a number from `input', and return the number.
3667 Return -1, if the number field is empty like "{,1}".
3668 Return -2, If an error is occured. */
3669
3670static int
3671fetch_number (input, token, syntax)
3672 re_string_t *input;
3673 re_token_t *token;
3674 reg_syntax_t syntax;
3675{
3676 int num = -1;
3677 unsigned char c;
3678 while (1)
3679 {
f0d77aa8 3680 fetch_token (token, input, syntax);
3b0bdc72 3681 c = token->opr.c;
602c2f9d 3682 if (BE (token->type == END_OF_RE, 0))
15a7d175 3683 return -2;
3b0bdc72 3684 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3685 break;
602c2f9d 3686 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
15a7d175 3687 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
602c2f9d 3688 num = (num > RE_DUP_MAX) ? -2 : num;
3b0bdc72 3689 }
3b0bdc72
UD
3690 return num;
3691}
3692\f
c0a0f9a3 3693#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3694static void
3695free_charset (re_charset_t *cset)
3696{
3697 re_free (cset->mbchars);
c0a0f9a3 3698# ifdef _LIBC
3b0bdc72
UD
3699 re_free (cset->coll_syms);
3700 re_free (cset->equiv_classes);
3701 re_free (cset->range_starts);
3702 re_free (cset->range_ends);
c0a0f9a3 3703# endif
3b0bdc72
UD
3704 re_free (cset->char_classes);
3705 re_free (cset);
3706}
c0a0f9a3 3707#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3708\f
3709/* Functions for binary tree operation. */
3710
ee70274a 3711/* Create a tree node. */
3b0bdc72
UD
3712
3713static bin_tree_t *
ee70274a
UD
3714create_tree (dfa, left, right, type, index)
3715 re_dfa_t *dfa;
3b0bdc72
UD
3716 bin_tree_t *left;
3717 bin_tree_t *right;
3718 re_token_type_t type;
3719 int index;
3720{
3721 bin_tree_t *tree;
ee70274a 3722 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3b0bdc72 3723 {
ee70274a
UD
3724 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3725
3726 if (storage == NULL)
3727 return NULL;
3728 storage->next = dfa->str_tree_storage;
3729 dfa->str_tree_storage = storage;
3730 dfa->str_tree_storage_idx = 0;
3b0bdc72 3731 }
ee70274a
UD
3732 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3733
3b0bdc72
UD
3734 tree->parent = NULL;
3735 tree->left = left;
3736 tree->right = right;
3737 tree->type = type;
3738 tree->node_idx = index;
3739 tree->first = -1;
3740 tree->next = -1;
3741 re_node_set_init_empty (&tree->eclosure);
3742
3743 if (left != NULL)
3744 left->parent = tree;
3745 if (right != NULL)
3746 right->parent = tree;
3747 return tree;
3748}
3749
ee70274a 3750/* Create both a DFA node and a tree for it. */
3b0bdc72 3751
ee70274a
UD
3752static bin_tree_t *
3753re_dfa_add_tree_node (dfa, left, right, token)
3754 re_dfa_t *dfa;
3755 bin_tree_t *left;
3756 bin_tree_t *right;
c5c9c929 3757 const re_token_t *token;
f0d77aa8 3758{
c5c9c929 3759 int new_idx = re_dfa_add_node (dfa, *token, 0);
ee70274a
UD
3760
3761 if (new_idx == -1)
3762 return NULL;
3763
3764 return create_tree (dfa, left, right, 0, new_idx);
3b0bdc72
UD
3765}
3766
6b6557e8
UD
3767/* Mark the tree SRC as an optional subexpression. */
3768
3769static void
3770mark_opt_subexp (src, dfa)
3771 const bin_tree_t *src;
3772 re_dfa_t *dfa;
3773{
3774 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3775 a subexpression. */
3776 if (src->type == CONCAT
3777 && src->left->type == NON_TYPE
3778 && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3779 mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3780}
3781
3782
3783/* Recursive tree walker for mark_opt_subexp. */
3784
3785static void
3786mark_opt_subexp_iter (src, dfa, idx)
3787 const bin_tree_t *src;
3788 re_dfa_t *dfa;
a96c63ed 3789 int idx;
6b6557e8
UD
3790{
3791 int node_idx;
3792
3793 if (src->type == NON_TYPE)
3794 {
3795 node_idx = src->node_idx;
3796 if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3797 || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3798 && dfa->nodes[node_idx].opr.idx == idx)
3799 dfa->nodes[node_idx].opt_subexp = 1;
3800 }
3801
3802 if (src->left != NULL)
3803 mark_opt_subexp_iter (src->left, dfa, idx);
3804
3805 if (src->right != NULL)
3806 mark_opt_subexp_iter (src->right, dfa, idx);
3807}
3808
ee70274a 3809
3b0bdc72
UD
3810/* Duplicate the node SRC, and return new node. */
3811
3812static bin_tree_t *
3813duplicate_tree (src, dfa)
3814 const bin_tree_t *src;
3815 re_dfa_t *dfa;
3816{
3817 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3818 int new_node_idx;
3819 /* Since node indies must be according to Post-order of the tree,
3820 we must duplicate the left at first. */
3821 if (src->left != NULL)
3822 {
3823 left = duplicate_tree (src->left, dfa);
3824 if (left == NULL)
15a7d175 3825 return NULL;
3b0bdc72
UD
3826 }
3827
3828 /* Secondaly, duplicate the right. */
3829 if (src->right != NULL)
3830 {
3831 right = duplicate_tree (src->right, dfa);
3832 if (right == NULL)
ee70274a 3833 return NULL;
3b0bdc72
UD
3834 }
3835
3836 /* At last, duplicate itself. */
3837 if (src->type == NON_TYPE)
3838 {
3839 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3840 dfa->nodes[new_node_idx].duplicated = 1;
bc15410e 3841 if (BE (new_node_idx == -1, 0))
ee70274a 3842 return NULL;
3b0bdc72
UD
3843 }
3844 else
3845 new_node_idx = src->type;
3846
ee70274a 3847 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3b0bdc72
UD
3848 return new_tree;
3849}