]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
Update.
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
ca3c505e 2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
3b0bdc72 21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
15a7d175 22 int length, reg_syntax_t syntax);
3b0bdc72 23static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
24 const re_dfastate_t *init_state,
25 char *fastmap);
3b0bdc72 26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
a9388965 27static reg_errcode_t init_word_char (re_dfa_t *dfa);
c0a0f9a3 28#ifdef RE_ENABLE_I18N
3b0bdc72 29static void free_charset (re_charset_t *cset);
c0a0f9a3 30#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
3b0bdc72
UD
36static reg_errcode_t analyze (re_dfa_t *dfa);
37static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
485d775d 41static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
15a7d175
UD
42 int top_clone_node, int root_node,
43 unsigned int constraint);
a9388965 44static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
15a7d175 45 unsigned int constraint);
a7d5c291
UD
46static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
3b0bdc72 48static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 49static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
15a7d175 50 int node, int root);
3b0bdc72
UD
51static void calc_inveclosure (re_dfa_t *dfa);
52static int fetch_number (re_string_t *input, re_token_t *token,
15a7d175 53 reg_syntax_t syntax);
3b0bdc72
UD
54static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
55static int peek_token (re_token_t *token, re_string_t *input,
15a7d175 56 reg_syntax_t syntax);
3b0bdc72 57static int peek_token_bracket (re_token_t *token, re_string_t *input,
15a7d175 58 reg_syntax_t syntax);
3b0bdc72 59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 60 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
3b0bdc72 64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175
UD
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
3b0bdc72 67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175
UD
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
3b0bdc72 70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
3b0bdc72 73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
3b0bdc72 79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax);
3b0bdc72 84static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
85 re_string_t *regexp,
86 re_token_t *token);
434d3784 87#ifndef _LIBC
c0a0f9a3
UD
88# ifdef RE_ENABLE_I18N
89static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
90 re_charset_t *mbcset, int *range_alloc,
91 bracket_elem_t *start_elem,
92 bracket_elem_t *end_elem);
c0a0f9a3 93static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175
UD
94 re_charset_t *mbcset,
95 int *coll_sym_alloc,
96 const unsigned char *name);
c0a0f9a3
UD
97# else /* not RE_ENABLE_I18N */
98static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
99 bracket_elem_t *start_elem,
100 bracket_elem_t *end_elem);
c0a0f9a3 101static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175 102 const unsigned char *name);
c0a0f9a3 103# endif /* not RE_ENABLE_I18N */
434d3784 104#endif /* not _LIBC */
c0a0f9a3
UD
105#ifdef RE_ENABLE_I18N
106static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175
UD
107 re_charset_t *mbcset,
108 int *equiv_class_alloc,
109 const unsigned char *name);
66b110e8
UD
110static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
111 re_bitset_ptr_t sbcset,
15a7d175
UD
112 re_charset_t *mbcset,
113 int *char_class_alloc,
114 const unsigned char *class_name,
115 reg_syntax_t syntax);
c0a0f9a3
UD
116#else /* not RE_ENABLE_I18N */
117static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175 118 const unsigned char *name);
66b110e8
UD
119static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
120 re_bitset_ptr_t sbcset,
15a7d175
UD
121 const unsigned char *class_name,
122 reg_syntax_t syntax);
c0a0f9a3 123#endif /* not RE_ENABLE_I18N */
e2b6bfa3 124static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
06e8303a 125 const unsigned char *class_name,
e2b6bfa3
UD
126 const unsigned char *extra, int not,
127 reg_errcode_t *err);
3b0bdc72
UD
128static void free_bin_tree (bin_tree_t *tree);
129static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
15a7d175 130 re_token_type_t type, int index);
3b0bdc72
UD
131static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
132\f
133/* This table gives an error message for each of the error codes listed
134 in regex.h. Obviously the order here has to be same as there.
135 POSIX doesn't require that we do anything for REG_NOERROR,
136 but why not be nice? */
137
6455d255 138const char __re_error_msgid[] attribute_hidden =
3b0bdc72
UD
139 {
140#define REG_NOERROR_IDX 0
141 gettext_noop ("Success") /* REG_NOERROR */
142 "\0"
143#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
144 gettext_noop ("No match") /* REG_NOMATCH */
145 "\0"
146#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
147 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
148 "\0"
149#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
150 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
151 "\0"
152#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
153 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
154 "\0"
155#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
156 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
157 "\0"
158#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
159 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
160 "\0"
161#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
162 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
163 "\0"
164#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
165 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
166 "\0"
167#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
168 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
169 "\0"
170#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
171 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
172 "\0"
173#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
174 gettext_noop ("Invalid range end") /* REG_ERANGE */
175 "\0"
176#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
177 gettext_noop ("Memory exhausted") /* REG_ESPACE */
178 "\0"
179#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
180 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
181 "\0"
182#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
183 gettext_noop ("Premature end of regular expression") /* REG_EEND */
184 "\0"
185#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
186 gettext_noop ("Regular expression too big") /* REG_ESIZE */
187 "\0"
188#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
189 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
190 };
191
6455d255 192const size_t __re_error_msgid_idx[] attribute_hidden =
3b0bdc72
UD
193 {
194 REG_NOERROR_IDX,
195 REG_NOMATCH_IDX,
196 REG_BADPAT_IDX,
197 REG_ECOLLATE_IDX,
198 REG_ECTYPE_IDX,
199 REG_EESCAPE_IDX,
200 REG_ESUBREG_IDX,
201 REG_EBRACK_IDX,
202 REG_EPAREN_IDX,
203 REG_EBRACE_IDX,
204 REG_BADBR_IDX,
205 REG_ERANGE_IDX,
206 REG_ESPACE_IDX,
207 REG_BADRPT_IDX,
208 REG_EEND_IDX,
209 REG_ESIZE_IDX,
210 REG_ERPAREN_IDX
211 };
212\f
213/* Entry points for GNU code. */
214
215/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 216 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
217 Returns 0 if the pattern was valid, otherwise an error string.
218
219 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
220 are set in BUFP on entry. */
221
222const char *
223re_compile_pattern (pattern, length, bufp)
224 const char *pattern;
225 size_t length;
226 struct re_pattern_buffer *bufp;
227{
228 reg_errcode_t ret;
229
3b0bdc72
UD
230 /* And GNU code determines whether or not to get register information
231 by passing null for the REGS argument to re_match, etc., not by
232 setting no_sub. */
233 bufp->no_sub = 0;
234
235 /* Match anchors at newline. */
236 bufp->newline_anchor = 1;
237
75e4a282 238 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
239
240 if (!ret)
241 return NULL;
6455d255 242 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
243}
244#ifdef _LIBC
245weak_alias (__re_compile_pattern, re_compile_pattern)
246#endif
247
248/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
249 also be assigned to arbitrarily: each pattern buffer stores its own
250 syntax, so it can be changed between regex compilations. */
251/* This has no initializer because initialized variables in Emacs
252 become read-only after dumping. */
253reg_syntax_t re_syntax_options;
254
255
256/* Specify the precise syntax of regexps for compilation. This provides
257 for compatibility for various utilities which historically have
258 different, incompatible syntaxes.
259
260 The argument SYNTAX is a bit mask comprised of the various bits
261 defined in regex.h. We return the old syntax. */
262
263reg_syntax_t
264re_set_syntax (syntax)
265 reg_syntax_t syntax;
266{
267 reg_syntax_t ret = re_syntax_options;
268
269 re_syntax_options = syntax;
270 return ret;
271}
272#ifdef _LIBC
273weak_alias (__re_set_syntax, re_set_syntax)
274#endif
275
276int
277re_compile_fastmap (bufp)
278 struct re_pattern_buffer *bufp;
279{
280 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
281 char *fastmap = bufp->fastmap;
282
283 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
284 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
285 if (dfa->init_state != dfa->init_state_word)
286 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
287 if (dfa->init_state != dfa->init_state_nl)
288 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
289 if (dfa->init_state != dfa->init_state_begbuf)
290 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
291 bufp->fastmap_accurate = 1;
292 return 0;
293}
294#ifdef _LIBC
295weak_alias (__re_compile_fastmap, re_compile_fastmap)
296#endif
297
1b2c2628 298static inline void
25337753 299__attribute ((always_inline))
1b2c2628
UD
300re_set_fastmap (char *fastmap, int icase, int ch)
301{
302 fastmap[ch] = 1;
303 if (icase)
304 fastmap[tolower (ch)] = 1;
305}
306
3b0bdc72
UD
307/* Helper function for re_compile_fastmap.
308 Compile fastmap for the initial_state INIT_STATE. */
309
310static void
311re_compile_fastmap_iter (bufp, init_state, fastmap)
312 regex_t *bufp;
313 const re_dfastate_t *init_state;
314 char *fastmap;
315{
316 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
317 int node_cnt;
3c0fb574 318 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
319 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
320 {
321 int node = init_state->nodes.elems[node_cnt];
322 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
323
324 if (type == CHARACTER)
74e12fbc
UD
325 {
326 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
327#ifdef RE_ENABLE_I18N
14744156 328 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 329 {
3c0fb574 330 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
74e12fbc
UD
331 wchar_t wc;
332 mbstate_t state;
333
334 p = buf;
335 *p++ = dfa->nodes[node].opr.c;
336 while (++node < dfa->nodes_len
337 && dfa->nodes[node].type == CHARACTER
338 && dfa->nodes[node].mb_partial)
339 *p++ = dfa->nodes[node].opr.c;
340 memset (&state, 0, sizeof (state));
341 if (mbrtowc (&wc, (const char *) buf, p - buf,
342 &state) == p - buf
343 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
344 re_set_fastmap (fastmap, 0, buf[0]);
345 }
346#endif
347 }
3b0bdc72 348 else if (type == SIMPLE_BRACKET)
15a7d175
UD
349 {
350 int i, j, ch;
351 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
352 for (j = 0; j < UINT_BITS; ++j, ++ch)
353 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
354 re_set_fastmap (fastmap, icase, ch);
355 }
c0a0f9a3 356#ifdef RE_ENABLE_I18N
3b0bdc72 357 else if (type == COMPLEX_BRACKET)
15a7d175
UD
358 {
359 int i;
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362 || cset->nranges || cset->nchar_classes)
363 {
c0a0f9a3 364# ifdef _LIBC
15a7d175
UD
365 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
366 {
367 /* In this case we want to catch the bytes which are
368 the first byte of any collation elements.
369 e.g. In da_DK, we want to catch 'a' since "aa"
370 is a valid collation element, and don't catch
371 'b' since 'b' is the only collation element
372 which starts from 'b'. */
373 int j, ch;
374 const int32_t *table = (const int32_t *)
375 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
376 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
377 for (j = 0; j < UINT_BITS; ++j, ++ch)
378 if (table[ch] < 0)
379 re_set_fastmap (fastmap, icase, ch);
380 }
c0a0f9a3 381# else
3c0fb574 382 if (dfa->mb_cur_max > 1)
15a7d175
UD
383 for (i = 0; i < SBC_MAX; ++i)
384 if (__btowc (i) == WEOF)
385 re_set_fastmap (fastmap, icase, i);
c0a0f9a3 386# endif /* not _LIBC */
15a7d175
UD
387 }
388 for (i = 0; i < cset->nmbchars; ++i)
389 {
390 char buf[256];
391 mbstate_t state;
392 memset (&state, '\0', sizeof (state));
393 __wcrtomb (buf, cset->mbchars[i], &state);
394 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
14744156 395 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc
UD
396 {
397 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
398 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
399 }
15a7d175
UD
400 }
401 }
c0a0f9a3 402#endif /* RE_ENABLE_I18N */
1b2c2628 403 else if (type == END_OF_RE || type == OP_PERIOD)
15a7d175
UD
404 {
405 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
406 if (type == END_OF_RE)
407 bufp->can_be_null = 1;
408 return;
409 }
3b0bdc72
UD
410 }
411}
412\f
413/* Entry point for POSIX code. */
414/* regcomp takes a regular expression as a string and compiles it.
415
416 PREG is a regex_t *. We do not expect any fields to be initialized,
417 since POSIX says we shouldn't. Thus, we set
418
419 `buffer' to the compiled pattern;
420 `used' to the length of the compiled pattern;
421 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
422 REG_EXTENDED bit in CFLAGS is set; otherwise, to
423 RE_SYNTAX_POSIX_BASIC;
424 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
425 `fastmap' to an allocated space for the fastmap;
426 `fastmap_accurate' to zero;
427 `re_nsub' to the number of subexpressions in PATTERN.
428
429 PATTERN is the address of the pattern string.
430
431 CFLAGS is a series of bits which affect compilation.
432
433 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
434 use POSIX basic syntax.
435
436 If REG_NEWLINE is set, then . and [^...] don't match newline.
437 Also, regexec will try a match beginning after every newline.
438
439 If REG_ICASE is set, then we considers upper- and lowercase
440 versions of letters to be equivalent when matching.
441
442 If REG_NOSUB is set, then when PREG is passed to regexec, that
443 routine will report only success or failure, and nothing about the
444 registers.
445
446 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
447 the return codes and their meanings.) */
448
449int
450regcomp (preg, pattern, cflags)
75e4a282
UD
451 regex_t *__restrict preg;
452 const char *__restrict pattern;
3b0bdc72
UD
453 int cflags;
454{
455 reg_errcode_t ret;
456 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 457 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
458
459 preg->buffer = NULL;
460 preg->allocated = 0;
461 preg->used = 0;
462
463 /* Try to allocate space for the fastmap. */
464 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 465 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
466 return REG_ESPACE;
467
468 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
469
470 /* If REG_NEWLINE is set, newlines are treated differently. */
471 if (cflags & REG_NEWLINE)
472 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
473 syntax &= ~RE_DOT_NEWLINE;
474 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
475 /* It also changes the matching behavior. */
476 preg->newline_anchor = 1;
477 }
478 else
479 preg->newline_anchor = 0;
480 preg->no_sub = !!(cflags & REG_NOSUB);
481 preg->translate = NULL;
482
483 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
484
485 /* POSIX doesn't distinguish between an unmatched open-group and an
486 unmatched close-group: both are REG_EPAREN. */
487 if (ret == REG_ERPAREN)
488 ret = REG_EPAREN;
489
a9388965 490 /* We have already checked preg->fastmap != NULL. */
bc15410e 491 if (BE (ret == REG_NOERROR, 1))
71ccd330
UD
492 /* Compute the fastmap now, since regexec cannot modify the pattern
493 buffer. This function nevers fails in this implementation. */
83b038f2 494 (void) re_compile_fastmap (preg);
71ccd330 495 else
3b0bdc72 496 {
71ccd330
UD
497 /* Some error occurred while compiling the expression. */
498 re_free (preg->fastmap);
499 preg->fastmap = NULL;
3b0bdc72
UD
500 }
501
502 return (int) ret;
503}
504#ifdef _LIBC
505weak_alias (__regcomp, regcomp)
506#endif
507
508/* Returns a message corresponding to an error code, ERRCODE, returned
509 from either regcomp or regexec. We don't use PREG here. */
510
511size_t
512regerror (errcode, preg, errbuf, errbuf_size)
513 int errcode;
514 const regex_t *preg;
515 char *errbuf;
516 size_t errbuf_size;
517{
518 const char *msg;
519 size_t msg_size;
520
bc15410e 521 if (BE (errcode < 0
15a7d175
UD
522 || errcode >= (int) (sizeof (__re_error_msgid_idx)
523 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
524 /* Only error codes returned by the rest of the code should be passed
525 to this routine. If we are given anything else, or if other regex
526 code generates an invalid error code, then the program has a bug.
527 Dump core so we can fix it. */
528 abort ();
529
6455d255 530 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
531
532 msg_size = strlen (msg) + 1; /* Includes the null. */
533
bc15410e 534 if (BE (errbuf_size != 0, 1))
3b0bdc72 535 {
bc15410e 536 if (BE (msg_size > errbuf_size, 0))
15a7d175 537 {
3b0bdc72
UD
538#if defined HAVE_MEMPCPY || defined _LIBC
539 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
540#else
15a7d175
UD
541 memcpy (errbuf, msg, errbuf_size - 1);
542 errbuf[errbuf_size - 1] = 0;
3b0bdc72 543#endif
15a7d175 544 }
3b0bdc72 545 else
15a7d175 546 memcpy (errbuf, msg, msg_size);
3b0bdc72
UD
547 }
548
549 return msg_size;
550}
551#ifdef _LIBC
552weak_alias (__regerror, regerror)
553#endif
554
3b0bdc72 555
71ccd330
UD
556static void
557free_dfa_content (re_dfa_t *dfa)
3b0bdc72
UD
558{
559 int i, j;
3b0bdc72 560
71ccd330
UD
561 re_free (dfa->subexps);
562
563 for (i = 0; i < dfa->nodes_len; ++i)
564 {
565 re_token_t *node = dfa->nodes + i;
c0a0f9a3 566#ifdef RE_ENABLE_I18N
71ccd330
UD
567 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
568 free_charset (node->opr.mbcset);
569 else
c0a0f9a3 570#endif /* RE_ENABLE_I18N */
71ccd330
UD
571 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
572 re_free (node->opr.sbcset);
573 }
574 re_free (dfa->nexts);
575 for (i = 0; i < dfa->nodes_len; ++i)
576 {
577 if (dfa->eclosures != NULL)
578 re_node_set_free (dfa->eclosures + i);
579 if (dfa->inveclosures != NULL)
580 re_node_set_free (dfa->inveclosures + i);
581 if (dfa->edests != NULL)
582 re_node_set_free (dfa->edests + i);
583 }
584 re_free (dfa->edests);
585 re_free (dfa->eclosures);
586 re_free (dfa->inveclosures);
587 re_free (dfa->nodes);
3b0bdc72 588
71ccd330
UD
589 for (i = 0; i <= dfa->state_hash_mask; ++i)
590 {
591 struct re_state_table_entry *entry = dfa->state_table + i;
592 for (j = 0; j < entry->num; ++j)
593 {
594 re_dfastate_t *state = entry->array[j];
b14993a2 595 free_state (state);
71ccd330
UD
596 }
597 re_free (entry->array);
598 }
599 re_free (dfa->state_table);
3b0bdc72 600
71ccd330
UD
601 if (dfa->word_char != NULL)
602 re_free (dfa->word_char);
0742e48e 603#ifdef DEBUG
71ccd330 604 re_free (dfa->re_str);
0742e48e 605#endif
71ccd330
UD
606
607 re_free (dfa);
608}
609
610
611/* Free dynamically allocated space used by PREG. */
612
613void
614regfree (preg)
615 regex_t *preg;
616{
617 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
618 if (BE (dfa != NULL, 1))
619 free_dfa_content (dfa);
620
3b0bdc72
UD
621 re_free (preg->fastmap);
622}
623#ifdef _LIBC
624weak_alias (__regfree, regfree)
625#endif
626\f
627/* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
629
630#if defined _REGEX_RE_COMP || defined _LIBC
631
632/* BSD has one and only one pattern buffer. */
633static struct re_pattern_buffer re_comp_buf;
634
635char *
636# ifdef _LIBC
637/* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
640weak_function
641# endif
642re_comp (s)
643 const char *s;
644{
645 reg_errcode_t ret;
240e87c2 646 char *fastmap;
3b0bdc72
UD
647
648 if (!s)
649 {
650 if (!re_comp_buf.buffer)
651 return gettext ("No previous regular expression");
652 return 0;
653 }
654
240e87c2
RM
655 if (re_comp_buf.buffer)
656 {
657 fastmap = re_comp_buf.fastmap;
658 re_comp_buf.fastmap = NULL;
659 __regfree (&re_comp_buf);
1b2c2628 660 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
661 re_comp_buf.fastmap = fastmap;
662 }
663
664 if (re_comp_buf.fastmap == NULL)
3b0bdc72
UD
665 {
666 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
667 if (re_comp_buf.fastmap == NULL)
6455d255
UD
668 return (char *) gettext (__re_error_msgid
669 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
670 }
671
672 /* Since `re_exec' always passes NULL for the `regs' argument, we
673 don't need to initialize the pattern buffer fields which affect it. */
674
675 /* Match anchors at newlines. */
676 re_comp_buf.newline_anchor = 1;
677
678 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
679
680 if (!ret)
681 return NULL;
682
683 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455d255 684 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 685}
240e87c2
RM
686
687#ifdef _LIBC
c877418f 688libc_freeres_fn (free_mem)
240e87c2
RM
689{
690 __regfree (&re_comp_buf);
691}
240e87c2
RM
692#endif
693
3b0bdc72
UD
694#endif /* _REGEX_RE_COMP */
695\f
696/* Internal entry point.
697 Compile the regular expression PATTERN, whose length is LENGTH.
698 SYNTAX indicate regular expression's syntax. */
699
700static reg_errcode_t
701re_compile_internal (preg, pattern, length, syntax)
702 regex_t *preg;
703 const char * pattern;
704 int length;
705 reg_syntax_t syntax;
706{
707 reg_errcode_t err = REG_NOERROR;
708 re_dfa_t *dfa;
709 re_string_t regexp;
710
711 /* Initialize the pattern buffer. */
712 preg->fastmap_accurate = 0;
713 preg->syntax = syntax;
714 preg->not_bol = preg->not_eol = 0;
715 preg->used = 0;
716 preg->re_nsub = 0;
1b2c2628
UD
717 preg->can_be_null = 0;
718 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
719
720 /* Initialize the dfa. */
721 dfa = (re_dfa_t *) preg->buffer;
722 if (preg->allocated < sizeof (re_dfa_t))
723 {
724 /* If zero allocated, but buffer is non-null, try to realloc
725 enough space. This loses if buffer's address is bogus, but
726 that is the user's responsibility. If ->buffer is NULL this
727 is a simple allocation. */
728 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
729 if (dfa == NULL)
730 return REG_ESPACE;
3b0bdc72
UD
731 preg->allocated = sizeof (re_dfa_t);
732 }
733 preg->buffer = (unsigned char *) dfa;
734 preg->used = sizeof (re_dfa_t);
735
736 err = init_dfa (dfa, length);
bc15410e 737 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
738 {
739 re_free (dfa);
740 preg->buffer = NULL;
ca3c505e 741 preg->allocated = 0;
3b0bdc72
UD
742 return err;
743 }
0742e48e
UD
744#ifdef DEBUG
745 dfa->re_str = re_malloc (char, length + 1);
746 strncpy (dfa->re_str, pattern, length + 1);
747#endif
3b0bdc72 748
612546c6 749 err = re_string_construct (&regexp, pattern, length, preg->translate,
3c0fb574
UD
750 syntax & RE_ICASE, dfa->mb_cur_max,
751 dfa->is_utf8);
bc15410e 752 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
753 {
754 re_free (dfa);
755 preg->buffer = NULL;
ca3c505e 756 preg->allocated = 0;
3b0bdc72
UD
757 return err;
758 }
759
760 /* Parse the regular expression, and build a structure tree. */
761 preg->re_nsub = 0;
762 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 763 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
764 goto re_compile_internal_free_return;
765
14744156
UD
766#ifdef RE_ENABLE_I18N
767 /* If possible, do searching in single byte encoding to speed things up. */
768 if (dfa->is_utf8 && !(syntax & RE_ICASE))
769 optimize_utf8 (dfa);
770#endif
771
3b0bdc72
UD
772 /* Analyze the tree and collect information which is necessary to
773 create the dfa. */
774 err = analyze (dfa);
bc15410e 775 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
776 goto re_compile_internal_free_return;
777
778 /* Then create the initial state of the dfa. */
779 err = create_initial_state (dfa);
3b0bdc72 780
3b0bdc72
UD
781 /* Release work areas. */
782 free_workarea_compile (preg);
783 re_string_destruct (&regexp);
784
71ccd330
UD
785 if (BE (err != REG_NOERROR, 0))
786 {
787 re_compile_internal_free_return:
788 free_dfa_content (dfa);
789 preg->buffer = NULL;
ca3c505e 790 preg->allocated = 0;
71ccd330
UD
791 }
792
3b0bdc72
UD
793 return err;
794}
795
796/* Initialize DFA. We use the length of the regular expression PAT_LEN
797 as the initial length of some arrays. */
798
799static reg_errcode_t
800init_dfa (dfa, pat_len)
801 re_dfa_t *dfa;
802 int pat_len;
803{
804 int table_size;
81c64d40
UD
805
806 memset (dfa, '\0', sizeof (re_dfa_t));
807
3b0bdc72
UD
808 dfa->nodes_alloc = pat_len + 1;
809 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
810
811 dfa->states_alloc = pat_len + 1;
812
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size = 1; table_size > 0; table_size <<= 1)
815 if (table_size > pat_len)
816 break;
817
818 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
819 dfa->state_hash_mask = table_size - 1;
820
821 dfa->subexps_alloc = 1;
822 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
823 dfa->word_char = NULL;
824
3c0fb574
UD
825 dfa->mb_cur_max = MB_CUR_MAX;
826#ifdef _LIBC
827 if (dfa->mb_cur_max > 1
828 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
829 dfa->is_utf8 = 1;
830#endif
831
bc15410e 832 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
15a7d175 833 || dfa->subexps == NULL, 0))
3b0bdc72
UD
834 {
835 /* We don't bother to free anything which was allocated. Very
836 soon the process will go down anyway. */
837 dfa->subexps = NULL;
838 dfa->state_table = NULL;
839 dfa->nodes = NULL;
840 return REG_ESPACE;
841 }
842 return REG_NOERROR;
843}
844
845/* Initialize WORD_CHAR table, which indicate which character is
846 "word". In this case "word" means that it is the word construction
847 character used by some operators like "\<", "\>", etc. */
848
a9388965 849static reg_errcode_t
3b0bdc72
UD
850init_word_char (dfa)
851 re_dfa_t *dfa;
852{
853 int i, j, ch;
854 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
bc15410e 855 if (BE (dfa->word_char == NULL, 0))
a9388965 856 return REG_ESPACE;
3b0bdc72
UD
857 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
858 for (j = 0; j < UINT_BITS; ++j, ++ch)
859 if (isalnum (ch) || ch == '_')
15a7d175 860 dfa->word_char[i] |= 1 << j;
a9388965 861 return REG_NOERROR;
3b0bdc72
UD
862}
863
864/* Free the work area which are only used while compiling. */
865
866static void
867free_workarea_compile (preg)
868 regex_t *preg;
869{
870 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
871 free_bin_tree (dfa->str_tree);
872 dfa->str_tree = NULL;
a7d5c291
UD
873 re_free (dfa->org_indices);
874 dfa->org_indices = NULL;
3b0bdc72
UD
875}
876
877/* Create initial states for all contexts. */
878
879static reg_errcode_t
880create_initial_state (dfa)
881 re_dfa_t *dfa;
882{
883 int first, i;
884 reg_errcode_t err;
885 re_node_set init_nodes;
886
887 /* Initial states have the epsilon closure of the node which is
888 the first node of the regular expression. */
889 first = dfa->str_tree->first;
890 dfa->init_node = first;
891 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 892 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
893 return err;
894
895 /* The back-references which are in initial states can epsilon transit,
896 since in this case all of the subexpressions can be null.
897 Then we add epsilon closures of the nodes which are the next nodes of
898 the back-references. */
899 if (dfa->nbackref > 0)
900 for (i = 0; i < init_nodes.nelem; ++i)
901 {
15a7d175
UD
902 int node_idx = init_nodes.elems[i];
903 re_token_type_t type = dfa->nodes[node_idx].type;
904
905 int clexp_idx;
906 if (type != OP_BACK_REF)
907 continue;
908 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
909 {
910 re_token_t *clexp_node;
911 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
912 if (clexp_node->type == OP_CLOSE_SUBEXP
913 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
914 break;
915 }
916 if (clexp_idx == init_nodes.nelem)
917 continue;
918
919 if (type == OP_BACK_REF)
920 {
921 int dest_idx = dfa->edests[node_idx].elems[0];
922 if (!re_node_set_contains (&init_nodes, dest_idx))
923 {
924 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
925 i = 0;
926 }
927 }
3b0bdc72
UD
928 }
929
930 /* It must be the first time to invoke acquire_state. */
a9388965
UD
931 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
932 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 933 if (BE (dfa->init_state == NULL, 0))
a9388965 934 return err;
3b0bdc72
UD
935 if (dfa->init_state->has_constraint)
936 {
a9388965 937 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 938 CONTEXT_WORD);
a9388965 939 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 940 CONTEXT_NEWLINE);
a9388965 941 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
942 &init_nodes,
943 CONTEXT_NEWLINE
944 | CONTEXT_BEGBUF);
bc15410e 945 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
15a7d175
UD
946 || dfa->init_state_begbuf == NULL, 0))
947 return err;
3b0bdc72
UD
948 }
949 else
950 dfa->init_state_word = dfa->init_state_nl
951 = dfa->init_state_begbuf = dfa->init_state;
952
3b0bdc72
UD
953 re_node_set_free (&init_nodes);
954 return REG_NOERROR;
955}
956\f
14744156
UD
957#ifdef RE_ENABLE_I18N
958/* If it is possible to do searching in single byte encoding instead of UTF-8
959 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
960 DFA nodes where needed. */
961
962static void
963optimize_utf8 (dfa)
964 re_dfa_t *dfa;
965{
966 int node;
967
968 for (node = 0; node < dfa->nodes_len; ++node)
969 switch (dfa->nodes[node].type)
970 {
971 case CHARACTER:
972 /* Chars >= 0x80 are optimizable in some cases (e.g. when not
973 followed by DUP operator, not in bracket etc.).
974 For now punt on them all. */
975 if (dfa->nodes[node].opr.c >= 0x80)
976 return;
977 break;
978 case ANCHOR:
979 switch (dfa->nodes[node].opr.idx)
980 {
981 case LINE_FIRST:
982 case LINE_LAST:
983 case BUF_FIRST:
984 case BUF_LAST:
985 break;
986 default:
987 /* Word anchors etc. cannot be handled. */
988 return;
989 }
990 break;
991 case OP_BACK_REF:
992 case OP_ALT:
993 case END_OF_RE:
994 case BACK_SLASH:
995 case OP_DUP_ASTERISK:
996 case OP_DUP_QUESTION:
997 case OP_DUP_PLUS:
998 case OP_OPEN_SUBEXP:
999 case OP_CLOSE_SUBEXP:
1000 break;
1001 default:
1002 return;
1003 }
1004
1005 /* The search can be in single byte locale. */
1006 dfa->mb_cur_max = 1;
1007 dfa->is_utf8 = 0;
1008 dfa->has_mb_node = dfa->nbackref > 0;
1009}
1010#endif
1011\f
3b0bdc72
UD
1012/* Analyze the structure tree, and calculate "first", "next", "edest",
1013 "eclosure", and "inveclosure". */
1014
1015static reg_errcode_t
1016analyze (dfa)
1017 re_dfa_t *dfa;
1018{
1019 int i;
1020 reg_errcode_t ret;
1021
1022 /* Allocate arrays. */
3b0bdc72 1023 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
a7d5c291 1024 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3b0bdc72
UD
1025 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1026 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1027 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
a7d5c291 1028 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
15a7d175 1029 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
3b0bdc72
UD
1030 return REG_ESPACE;
1031 /* Initialize them. */
1032 for (i = 0; i < dfa->nodes_len; ++i)
1033 {
3b0bdc72
UD
1034 dfa->nexts[i] = -1;
1035 re_node_set_init_empty (dfa->edests + i);
1036 re_node_set_init_empty (dfa->eclosures + i);
1037 re_node_set_init_empty (dfa->inveclosures + i);
1038 }
1039
1040 ret = analyze_tree (dfa, dfa->str_tree);
bc15410e 1041 if (BE (ret == REG_NOERROR, 1))
3b0bdc72
UD
1042 {
1043 ret = calc_eclosure (dfa);
1044 if (ret == REG_NOERROR)
1045 calc_inveclosure (dfa);
1046 }
1047 return ret;
1048}
1049
1050/* Helper functions for analyze.
1051 This function calculate "first", "next", and "edest" for the subtree
1052 whose root is NODE. */
1053
1054static reg_errcode_t
1055analyze_tree (dfa, node)
1056 re_dfa_t *dfa;
1057 bin_tree_t *node;
1058{
1059 reg_errcode_t ret;
1060 if (node->first == -1)
1061 calc_first (dfa, node);
1062 if (node->next == -1)
1063 calc_next (dfa, node);
1064 if (node->eclosure.nelem == 0)
1065 calc_epsdest (dfa, node);
1066 /* Calculate "first" etc. for the left child. */
1067 if (node->left != NULL)
1068 {
1069 ret = analyze_tree (dfa, node->left);
bc15410e 1070 if (BE (ret != REG_NOERROR, 0))
15a7d175 1071 return ret;
3b0bdc72
UD
1072 }
1073 /* Calculate "first" etc. for the right child. */
1074 if (node->right != NULL)
1075 {
1076 ret = analyze_tree (dfa, node->right);
bc15410e 1077 if (BE (ret != REG_NOERROR, 0))
15a7d175 1078 return ret;
3b0bdc72
UD
1079 }
1080 return REG_NOERROR;
1081}
1082
1083/* Calculate "first" for the node NODE. */
1084static void
1085calc_first (dfa, node)
1086 re_dfa_t *dfa;
1087 bin_tree_t *node;
1088{
1089 int idx, type;
1090 idx = node->node_idx;
1091 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1092
1093 switch (type)
1094 {
1095#ifdef DEBUG
3b0bdc72
UD
1096 case OP_OPEN_BRACKET:
1097 case OP_CLOSE_BRACKET:
1098 case OP_OPEN_DUP_NUM:
1099 case OP_CLOSE_DUP_NUM:
1100 case OP_NON_MATCH_LIST:
1101 case OP_OPEN_COLL_ELEM:
1102 case OP_CLOSE_COLL_ELEM:
1103 case OP_OPEN_EQUIV_CLASS:
1104 case OP_CLOSE_EQUIV_CLASS:
1105 case OP_OPEN_CHAR_CLASS:
1106 case OP_CLOSE_CHAR_CLASS:
1107 /* These must not be appeared here. */
1108 assert (0);
1109#endif
1110 case END_OF_RE:
1111 case CHARACTER:
1112 case OP_PERIOD:
1113 case OP_DUP_ASTERISK:
1114 case OP_DUP_QUESTION:
c0a0f9a3 1115#ifdef RE_ENABLE_I18N
3b0bdc72 1116 case COMPLEX_BRACKET:
c0a0f9a3 1117#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1118 case SIMPLE_BRACKET:
1119 case OP_BACK_REF:
1120 case ANCHOR:
81c64d40
UD
1121 case OP_OPEN_SUBEXP:
1122 case OP_CLOSE_SUBEXP:
3b0bdc72
UD
1123 node->first = idx;
1124 break;
1125 case OP_DUP_PLUS:
1126#ifdef DEBUG
1127 assert (node->left != NULL);
1128#endif
1129 if (node->left->first == -1)
15a7d175 1130 calc_first (dfa, node->left);
3b0bdc72
UD
1131 node->first = node->left->first;
1132 break;
1133 case OP_ALT:
1134 node->first = idx;
1135 break;
3b0bdc72
UD
1136 /* else fall through */
1137 default:
1138#ifdef DEBUG
1139 assert (node->left != NULL);
1140#endif
1141 if (node->left->first == -1)
15a7d175 1142 calc_first (dfa, node->left);
3b0bdc72
UD
1143 node->first = node->left->first;
1144 break;
1145 }
3b0bdc72
UD
1146}
1147
1148/* Calculate "next" for the node NODE. */
1149
1150static void
1151calc_next (dfa, node)
1152 re_dfa_t *dfa;
1153 bin_tree_t *node;
1154{
1155 int idx, type;
1156 bin_tree_t *parent = node->parent;
1157 if (parent == NULL)
1158 {
1159 node->next = -1;
1160 idx = node->node_idx;
1161 if (node->type == 0)
15a7d175 1162 dfa->nexts[idx] = node->next;
3b0bdc72
UD
1163 return;
1164 }
1165
1166 idx = parent->node_idx;
1167 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1168
1169 switch (type)
1170 {
1171 case OP_DUP_ASTERISK:
1172 case OP_DUP_PLUS:
1173 node->next = idx;
1174 break;
1175 case CONCAT:
1176 if (parent->left == node)
15a7d175
UD
1177 {
1178 if (parent->right->first == -1)
1179 calc_first (dfa, parent->right);
1180 node->next = parent->right->first;
1181 break;
1182 }
3b0bdc72
UD
1183 /* else fall through */
1184 default:
1185 if (parent->next == -1)
15a7d175 1186 calc_next (dfa, parent);
3b0bdc72
UD
1187 node->next = parent->next;
1188 break;
1189 }
1190 idx = node->node_idx;
1191 if (node->type == 0)
1192 dfa->nexts[idx] = node->next;
1193}
1194
1195/* Calculate "edest" for the node NODE. */
1196
1197static void
1198calc_epsdest (dfa, node)
1199 re_dfa_t *dfa;
1200 bin_tree_t *node;
1201{
1202 int idx;
1203 idx = node->node_idx;
1204 if (node->type == 0)
1205 {
1206 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
15a7d175
UD
1207 || dfa->nodes[idx].type == OP_DUP_PLUS
1208 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1209 {
1210 if (node->left->first == -1)
1211 calc_first (dfa, node->left);
1212 if (node->next == -1)
1213 calc_next (dfa, node);
1214 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1215 node->next);
1216 }
3b0bdc72 1217 else if (dfa->nodes[idx].type == OP_ALT)
15a7d175
UD
1218 {
1219 int left, right;
1220 if (node->left != NULL)
1221 {
1222 if (node->left->first == -1)
1223 calc_first (dfa, node->left);
1224 left = node->left->first;
1225 }
1226 else
1227 {
1228 if (node->next == -1)
1229 calc_next (dfa, node);
1230 left = node->next;
1231 }
1232 if (node->right != NULL)
1233 {
1234 if (node->right->first == -1)
1235 calc_first (dfa, node->right);
1236 right = node->right->first;
1237 }
1238 else
1239 {
1240 if (node->next == -1)
1241 calc_next (dfa, node);
1242 right = node->next;
1243 }
1244 re_node_set_init_2 (dfa->edests + idx, left, right);
1245 }
81c64d40 1246 else if (dfa->nodes[idx].type == ANCHOR
15a7d175
UD
1247 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1248 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1249 || dfa->nodes[idx].type == OP_BACK_REF)
1250 re_node_set_init_1 (dfa->edests + idx, node->next);
1769a73f
UD
1251 else
1252 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
3b0bdc72
UD
1253 }
1254}
1255
485d775d
UD
1256/* Duplicate the epsilon closure of the node ROOT_NODE.
1257 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1258 to their own constraint. */
1259
1260static reg_errcode_t
1261duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
15a7d175 1262 init_constraint)
485d775d
UD
1263 re_dfa_t *dfa;
1264 int top_org_node, top_clone_node, root_node;
1265 unsigned int init_constraint;
1266{
1267 reg_errcode_t err;
1268 int org_node, clone_node, ret;
1269 unsigned int constraint = init_constraint;
1270 for (org_node = top_org_node, clone_node = top_clone_node;;)
1271 {
1272 int org_dest, clone_dest;
1273 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1274 {
485d775d
UD
1275 /* If the back reference epsilon-transit, its destination must
1276 also have the constraint. Then duplicate the epsilon closure
1277 of the destination of the back reference, and store it in
1278 edests of the back reference. */
15a7d175
UD
1279 org_dest = dfa->nexts[org_node];
1280 re_node_set_empty (dfa->edests + clone_node);
1281 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1282 if (BE (err != REG_NOERROR, 0))
1283 return err;
1284 dfa->nexts[clone_node] = dfa->nexts[org_node];
1285 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1286 if (BE (ret < 0, 0))
1287 return REG_ESPACE;
1288 }
485d775d 1289 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1290 {
485d775d
UD
1291 /* In case of the node can't epsilon-transit, don't duplicate the
1292 destination and store the original destination as the
1293 destination of the node. */
15a7d175
UD
1294 dfa->nexts[clone_node] = dfa->nexts[org_node];
1295 break;
1296 }
485d775d 1297 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1298 {
485d775d
UD
1299 /* In case of the node can epsilon-transit, and it has only one
1300 destination. */
15a7d175
UD
1301 org_dest = dfa->edests[org_node].elems[0];
1302 re_node_set_empty (dfa->edests + clone_node);
1303 if (dfa->nodes[org_node].type == ANCHOR)
1304 {
485d775d 1305 /* In case of the node has another constraint, append it. */
15a7d175
UD
1306 if (org_node == root_node && clone_node != org_node)
1307 {
485d775d
UD
1308 /* ...but if the node is root_node itself, it means the
1309 epsilon closure have a loop, then tie it to the
1310 destination of the root_node. */
15a7d175
UD
1311 ret = re_node_set_insert (dfa->edests + clone_node,
1312 org_dest);
1313 if (BE (ret < 0, 0))
1314 return REG_ESPACE;
1315 break;
1316 }
1317 constraint |= dfa->nodes[org_node].opr.ctx_type;
1318 }
1319 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1320 if (BE (err != REG_NOERROR, 0))
1321 return err;
1322 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1323 if (BE (ret < 0, 0))
1324 return REG_ESPACE;
1325 }
485d775d 1326 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1327 {
485d775d 1328 /* In case of the node can epsilon-transit, and it has two
a7d5c291 1329 destinations. E.g. '|', '*', '+', '?'. */
15a7d175
UD
1330 org_dest = dfa->edests[org_node].elems[0];
1331 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1332 /* Search for a duplicated node which satisfies the constraint. */
1333 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1334 if (clone_dest == -1)
1335 {
1336 /* There are no such a duplicated node, create a new one. */
1337 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1338 if (BE (err != REG_NOERROR, 0))
1339 return err;
1340 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1341 if (BE (ret < 0, 0))
1342 return REG_ESPACE;
1343 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1344 root_node, constraint);
1345 if (BE (err != REG_NOERROR, 0))
1346 return err;
1347 }
1348 else
1349 {
1350 /* There are a duplicated node which satisfy the constraint,
1351 use it to avoid infinite loop. */
1352 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1353 if (BE (ret < 0, 0))
1354 return REG_ESPACE;
1355 }
15a7d175
UD
1356
1357 org_dest = dfa->edests[org_node].elems[1];
1358 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1359 if (BE (err != REG_NOERROR, 0))
1360 return err;
1361 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1362 if (BE (ret < 0, 0))
1363 return REG_ESPACE;
1364 }
485d775d
UD
1365 org_node = org_dest;
1366 clone_node = clone_dest;
1367 }
1368 return REG_NOERROR;
1369}
1370
a7d5c291
UD
1371/* Search for a node which is duplicated from the node ORG_NODE, and
1372 satisfies the constraint CONSTRAINT. */
1373
1374static int
1375search_duplicated_node (dfa, org_node, constraint)
1376 re_dfa_t *dfa;
1377 int org_node;
1378 unsigned int constraint;
1379{
1380 int idx;
1381 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1382 {
1383 if (org_node == dfa->org_indices[idx]
1384 && constraint == dfa->nodes[idx].constraint)
1385 return idx; /* Found. */
1386 }
1387 return -1; /* Not found. */
1388}
1389
a9388965
UD
1390/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1391 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1392 otherwise return the error code. */
1393
1394static reg_errcode_t
1395duplicate_node (new_idx, dfa, org_idx, constraint)
3b0bdc72 1396 re_dfa_t *dfa;
a9388965 1397 int *new_idx, org_idx;
3b0bdc72
UD
1398 unsigned int constraint;
1399{
1400 re_token_t dup;
1401 int dup_idx;
1402
485d775d 1403 dup = dfa->nodes[org_idx];
a9388965 1404 dup_idx = re_dfa_add_node (dfa, dup, 1);
485d775d 1405 if (BE (dup_idx == -1, 0))
a9388965 1406 return REG_ESPACE;
485d775d
UD
1407 dfa->nodes[dup_idx].constraint = constraint;
1408 if (dfa->nodes[org_idx].type == ANCHOR)
1409 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
a9388965 1410 dfa->nodes[dup_idx].duplicated = 1;
485d775d
UD
1411 re_node_set_init_empty (dfa->edests + dup_idx);
1412 re_node_set_init_empty (dfa->eclosures + dup_idx);
1413 re_node_set_init_empty (dfa->inveclosures + dup_idx);
3b0bdc72 1414
a7d5c291
UD
1415 /* Store the index of the original node. */
1416 dfa->org_indices[dup_idx] = org_idx;
a9388965
UD
1417 *new_idx = dup_idx;
1418 return REG_NOERROR;
3b0bdc72
UD
1419}
1420
1421static void
1422calc_inveclosure (dfa)
1423 re_dfa_t *dfa;
1424{
485d775d 1425 int src, idx, dest;
3b0bdc72
UD
1426 for (src = 0; src < dfa->nodes_len; ++src)
1427 {
1428 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175
UD
1429 {
1430 dest = dfa->eclosures[src].elems[idx];
1431 re_node_set_insert (dfa->inveclosures + dest, src);
1432 }
3b0bdc72
UD
1433 }
1434}
1435
1436/* Calculate "eclosure" for all the node in DFA. */
1437
1438static reg_errcode_t
1439calc_eclosure (dfa)
1440 re_dfa_t *dfa;
1441{
485d775d 1442 int node_idx, incomplete;
3b0bdc72
UD
1443#ifdef DEBUG
1444 assert (dfa->nodes_len > 0);
1445#endif
485d775d 1446 incomplete = 0;
3b0bdc72 1447 /* For each nodes, calculate epsilon closure. */
485d775d 1448 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1449 {
a9388965 1450 reg_errcode_t err;
3b0bdc72 1451 re_node_set eclosure_elem;
485d775d 1452 if (node_idx == dfa->nodes_len)
15a7d175
UD
1453 {
1454 if (!incomplete)
1455 break;
1456 incomplete = 0;
1457 node_idx = 0;
1458 }
3b0bdc72
UD
1459
1460#ifdef DEBUG
3b0bdc72
UD
1461 assert (dfa->eclosures[node_idx].nelem != -1);
1462#endif
1463 /* If we have already calculated, skip it. */
1464 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1465 continue;
3b0bdc72 1466 /* Calculate epsilon closure of `node_idx'. */
a9388965 1467 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
bc15410e 1468 if (BE (err != REG_NOERROR, 0))
15a7d175 1469 return err;
3b0bdc72
UD
1470
1471 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175
UD
1472 {
1473 incomplete = 1;
1474 re_node_set_free (&eclosure_elem);
1475 }
3b0bdc72 1476 }
3b0bdc72
UD
1477 return REG_NOERROR;
1478}
1479
1480/* Calculate epsilon closure of NODE. */
1481
a9388965
UD
1482static reg_errcode_t
1483calc_eclosure_iter (new_set, dfa, node, root)
1484 re_node_set *new_set;
3b0bdc72
UD
1485 re_dfa_t *dfa;
1486 int node, root;
1487{
a9388965 1488 reg_errcode_t err;
3b0bdc72 1489 unsigned int constraint;
485d775d 1490 int i, incomplete;
3b0bdc72 1491 re_node_set eclosure;
485d775d 1492 incomplete = 0;
a9388965 1493 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1494 if (BE (err != REG_NOERROR, 0))
a9388965 1495 return err;
3b0bdc72
UD
1496
1497 /* This indicates that we are calculating this node now.
1498 We reference this value to avoid infinite loop. */
1499 dfa->eclosures[node].nelem = -1;
1500
1501 constraint = ((dfa->nodes[node].type == ANCHOR)
15a7d175 1502 ? dfa->nodes[node].opr.ctx_type : 0);
485d775d
UD
1503 /* If the current node has constraints, duplicate all nodes.
1504 Since they must inherit the constraints. */
1505 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1506 {
1507 int org_node, cur_node;
1508 org_node = cur_node = node;
1509 err = duplicate_node_closure (dfa, node, node, node, constraint);
1510 if (BE (err != REG_NOERROR, 0))
15a7d175 1511 return err;
485d775d 1512 }
3b0bdc72
UD
1513
1514 /* Expand each epsilon destination nodes. */
485d775d 1515 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1516 for (i = 0; i < dfa->edests[node].nelem; ++i)
1517 {
15a7d175
UD
1518 re_node_set eclosure_elem;
1519 int edest = dfa->edests[node].elems[i];
1520 /* If calculating the epsilon closure of `edest' is in progress,
1521 return intermediate result. */
1522 if (dfa->eclosures[edest].nelem == -1)
1523 {
1524 incomplete = 1;
1525 continue;
1526 }
1527 /* If we haven't calculated the epsilon closure of `edest' yet,
1528 calculate now. Otherwise use calculated epsilon closure. */
1529 if (dfa->eclosures[edest].nelem == 0)
1530 {
1531 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1532 if (BE (err != REG_NOERROR, 0))
1533 return err;
1534 }
1535 else
1536 eclosure_elem = dfa->eclosures[edest];
1537 /* Merge the epsilon closure of `edest'. */
1538 re_node_set_merge (&eclosure, &eclosure_elem);
1539 /* If the epsilon closure of `edest' is incomplete,
1540 the epsilon closure of this node is also incomplete. */
1541 if (dfa->eclosures[edest].nelem == 0)
1542 {
1543 incomplete = 1;
1544 re_node_set_free (&eclosure_elem);
1545 }
3b0bdc72
UD
1546 }
1547
3b0bdc72
UD
1548 /* Epsilon closures include itself. */
1549 re_node_set_insert (&eclosure, node);
1550 if (incomplete && !root)
1551 dfa->eclosures[node].nelem = 0;
1552 else
1553 dfa->eclosures[node] = eclosure;
a9388965
UD
1554 *new_set = eclosure;
1555 return REG_NOERROR;
3b0bdc72
UD
1556}
1557\f
1558/* Functions for token which are used in the parser. */
1559
1560/* Fetch a token from INPUT.
1561 We must not use this function inside bracket expressions. */
1562
1563static re_token_t
1564fetch_token (input, syntax)
1565 re_string_t *input;
1566 reg_syntax_t syntax;
1567{
1568 re_token_t token;
1569 int consumed_byte;
1570 consumed_byte = peek_token (&token, input, syntax);
1571 re_string_skip_bytes (input, consumed_byte);
1572 return token;
1573}
1574
1575/* Peek a token from INPUT, and return the length of the token.
1576 We must not use this function inside bracket expressions. */
1577
1578static int
1579peek_token (token, input, syntax)
1580 re_token_t *token;
1581 re_string_t *input;
1582 reg_syntax_t syntax;
1583{
1584 unsigned char c;
1585
1586 if (re_string_eoi (input))
1587 {
1588 token->type = END_OF_RE;
1589 return 0;
1590 }
1591
1592 c = re_string_peek_byte (input, 0);
1593 token->opr.c = c;
1594
1595#ifdef RE_ENABLE_I18N
1596 token->mb_partial = 0;
3c0fb574 1597 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1598 !re_string_first_byte (input, re_string_cur_idx (input)))
1599 {
1600 token->type = CHARACTER;
1601 token->mb_partial = 1;
1602 return 1;
1603 }
1604#endif
1605 if (c == '\\')
1606 {
1607 unsigned char c2;
1608 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1609 {
1610 token->type = BACK_SLASH;
1611 return 1;
1612 }
3b0bdc72
UD
1613
1614 c2 = re_string_peek_byte_case (input, 1);
1615 token->opr.c = c2;
1616 token->type = CHARACTER;
1617 switch (c2)
15a7d175
UD
1618 {
1619 case '|':
1620 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1621 token->type = OP_ALT;
1622 break;
1623 case '1': case '2': case '3': case '4': case '5':
1624 case '6': case '7': case '8': case '9':
1625 if (!(syntax & RE_NO_BK_REFS))
1626 {
1627 token->type = OP_BACK_REF;
1628 token->opr.idx = c2 - '0';
1629 }
1630 break;
1631 case '<':
1632 if (!(syntax & RE_NO_GNU_OPS))
1633 {
1634 token->type = ANCHOR;
1635 token->opr.idx = WORD_FIRST;
1636 }
1637 break;
1638 case '>':
1639 if (!(syntax & RE_NO_GNU_OPS))
1640 {
1641 token->type = ANCHOR;
1642 token->opr.idx = WORD_LAST;
1643 }
1644 break;
1645 case 'b':
1646 if (!(syntax & RE_NO_GNU_OPS))
1647 {
1648 token->type = ANCHOR;
1649 token->opr.idx = WORD_DELIM;
1650 }
1651 break;
1652 case 'B':
1653 if (!(syntax & RE_NO_GNU_OPS))
1654 {
1655 token->type = ANCHOR;
1656 token->opr.idx = INSIDE_WORD;
1657 }
1658 break;
1659 case 'w':
1660 if (!(syntax & RE_NO_GNU_OPS))
1661 token->type = OP_WORD;
1662 break;
1663 case 'W':
1664 if (!(syntax & RE_NO_GNU_OPS))
1665 token->type = OP_NOTWORD;
1666 break;
e2b6bfa3
UD
1667 case 's':
1668 if (!(syntax & RE_NO_GNU_OPS))
1669 token->type = OP_SPACE;
1670 break;
1671 case 'S':
1672 if (!(syntax & RE_NO_GNU_OPS))
1673 token->type = OP_NOTSPACE;
1674 break;
15a7d175
UD
1675 case '`':
1676 if (!(syntax & RE_NO_GNU_OPS))
1677 {
1678 token->type = ANCHOR;
1679 token->opr.idx = BUF_FIRST;
1680 }
1681 break;
1682 case '\'':
1683 if (!(syntax & RE_NO_GNU_OPS))
1684 {
1685 token->type = ANCHOR;
1686 token->opr.idx = BUF_LAST;
1687 }
1688 break;
1689 case '(':
1690 if (!(syntax & RE_NO_BK_PARENS))
1691 token->type = OP_OPEN_SUBEXP;
1692 break;
1693 case ')':
1694 if (!(syntax & RE_NO_BK_PARENS))
1695 token->type = OP_CLOSE_SUBEXP;
1696 break;
1697 case '+':
1698 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1699 token->type = OP_DUP_PLUS;
1700 break;
1701 case '?':
1702 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1703 token->type = OP_DUP_QUESTION;
1704 break;
1705 case '{':
1706 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1707 token->type = OP_OPEN_DUP_NUM;
1708 break;
1709 case '}':
1710 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1711 token->type = OP_CLOSE_DUP_NUM;
1712 break;
1713 default:
1714 break;
1715 }
3b0bdc72
UD
1716 return 2;
1717 }
1718
1719 token->type = CHARACTER;
1720 switch (c)
1721 {
1722 case '\n':
1723 if (syntax & RE_NEWLINE_ALT)
15a7d175 1724 token->type = OP_ALT;
3b0bdc72
UD
1725 break;
1726 case '|':
1727 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1728 token->type = OP_ALT;
3b0bdc72
UD
1729 break;
1730 case '*':
1731 token->type = OP_DUP_ASTERISK;
1732 break;
1733 case '+':
1734 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1735 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1736 break;
1737 case '?':
1738 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1739 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1740 break;
1741 case '{':
1742 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1743 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1744 break;
1745 case '}':
1746 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1747 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1748 break;
1749 case '(':
1750 if (syntax & RE_NO_BK_PARENS)
15a7d175 1751 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1752 break;
1753 case ')':
1754 if (syntax & RE_NO_BK_PARENS)
15a7d175 1755 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1756 break;
1757 case '[':
1758 token->type = OP_OPEN_BRACKET;
1759 break;
1760 case '.':
1761 token->type = OP_PERIOD;
1762 break;
1763 case '^':
134abcb5 1764 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
15a7d175
UD
1765 re_string_cur_idx (input) != 0)
1766 {
1767 char prev = re_string_peek_byte (input, -1);
134abcb5 1768 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1769 break;
1770 }
3b0bdc72
UD
1771 token->type = ANCHOR;
1772 token->opr.idx = LINE_FIRST;
1773 break;
1774 case '$':
1775 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
15a7d175
UD
1776 re_string_cur_idx (input) + 1 != re_string_length (input))
1777 {
1778 re_token_t next;
1779 re_string_skip_bytes (input, 1);
1780 peek_token (&next, input, syntax);
1781 re_string_skip_bytes (input, -1);
1782 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1783 break;
1784 }
3b0bdc72
UD
1785 token->type = ANCHOR;
1786 token->opr.idx = LINE_LAST;
1787 break;
1788 default:
1789 break;
1790 }
1791 return 1;
1792}
1793
1794/* Peek a token from INPUT, and return the length of the token.
1795 We must not use this function out of bracket expressions. */
1796
1797static int
1798peek_token_bracket (token, input, syntax)
1799 re_token_t *token;
1800 re_string_t *input;
1801 reg_syntax_t syntax;
1802{
1803 unsigned char c;
1804 if (re_string_eoi (input))
1805 {
1806 token->type = END_OF_RE;
1807 return 0;
1808 }
1809 c = re_string_peek_byte (input, 0);
1810 token->opr.c = c;
1811
1812#ifdef RE_ENABLE_I18N
3c0fb574 1813 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1814 !re_string_first_byte (input, re_string_cur_idx (input)))
1815 {
1816 token->type = CHARACTER;
1817 return 1;
1818 }
1819#endif /* RE_ENABLE_I18N */
1820
1821 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1822 {
1823 /* In this case, '\' escape a character. */
1824 unsigned char c2;
82bbb29e
UD
1825 re_string_skip_bytes (input, 1);
1826 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
1827 token->opr.c = c2;
1828 token->type = CHARACTER;
1829 return 1;
1830 }
1831 if (c == '[') /* '[' is a special char in a bracket exps. */
1832 {
1833 unsigned char c2;
1834 int token_len;
1835 c2 = re_string_peek_byte (input, 1);
1836 token->opr.c = c2;
1837 token_len = 2;
1838 switch (c2)
15a7d175
UD
1839 {
1840 case '.':
1841 token->type = OP_OPEN_COLL_ELEM;
1842 break;
1843 case '=':
1844 token->type = OP_OPEN_EQUIV_CLASS;
1845 break;
1846 case ':':
1847 if (syntax & RE_CHAR_CLASSES)
1848 {
1849 token->type = OP_OPEN_CHAR_CLASS;
1850 break;
1851 }
1852 /* else fall through. */
1853 default:
1854 token->type = CHARACTER;
1855 token->opr.c = c;
1856 token_len = 1;
1857 break;
1858 }
3b0bdc72
UD
1859 return token_len;
1860 }
1861 switch (c)
1862 {
1863 case '-':
1864 token->type = OP_CHARSET_RANGE;
1865 break;
1866 case ']':
1867 token->type = OP_CLOSE_BRACKET;
1868 break;
1869 case '^':
1870 token->type = OP_NON_MATCH_LIST;
1871 break;
1872 default:
1873 token->type = CHARACTER;
1874 }
1875 return 1;
1876}
1877\f
1878/* Functions for parser. */
1879
1880/* Entry point of the parser.
1881 Parse the regular expression REGEXP and return the structure tree.
1882 If an error is occured, ERR is set by error code, and return NULL.
1883 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
1884 CAT
1885 / \
1886 / \
3b0bdc72
UD
1887 <reg_exp> EOR
1888
1889 CAT means concatenation.
1890 EOR means end of regular expression. */
1891
1892static bin_tree_t *
1893parse (regexp, preg, syntax, err)
1894 re_string_t *regexp;
1895 regex_t *preg;
1896 reg_syntax_t syntax;
1897 reg_errcode_t *err;
1898{
1899 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1900 bin_tree_t *tree, *eor, *root;
1901 re_token_t current_token;
1902 int new_idx;
134abcb5 1903 current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 1904 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 1905 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1906 return NULL;
1907 new_idx = re_dfa_add_node (dfa, current_token, 0);
1908 eor = create_tree (NULL, NULL, 0, new_idx);
1909 if (tree != NULL)
1910 root = create_tree (tree, eor, CONCAT, 0);
1911 else
1912 root = eor;
bc15410e 1913 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
485d775d
UD
1914 {
1915 *err = REG_ESPACE;
1916 return NULL;
1917 }
3b0bdc72
UD
1918 return root;
1919}
1920
1921/* This function build the following tree, from regular expression
1922 <branch1>|<branch2>:
15a7d175
UD
1923 ALT
1924 / \
1925 / \
3b0bdc72
UD
1926 <branch1> <branch2>
1927
1928 ALT means alternative, which represents the operator `|'. */
1929
1930static bin_tree_t *
1931parse_reg_exp (regexp, preg, token, syntax, nest, err)
1932 re_string_t *regexp;
1933 regex_t *preg;
1934 re_token_t *token;
1935 reg_syntax_t syntax;
1936 int nest;
1937 reg_errcode_t *err;
1938{
1939 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1940 bin_tree_t *tree, *branch = NULL;
1941 int new_idx;
1942 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 1943 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1944 return NULL;
1945
1946 while (token->type == OP_ALT)
1947 {
1948 re_token_t alt_token = *token;
1949 new_idx = re_dfa_add_node (dfa, alt_token, 0);
134abcb5 1950 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 1951 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
1952 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1953 {
1954 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1955 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1956 {
1957 free_bin_tree (tree);
1958 return NULL;
1959 }
1960 }
9b88fc16
UD
1961 else
1962 branch = NULL;
3b0bdc72 1963 tree = create_tree (tree, branch, 0, new_idx);
bc15410e 1964 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
1965 {
1966 *err = REG_ESPACE;
1967 return NULL;
1968 }
0742e48e 1969 dfa->has_plural_match = 1;
3b0bdc72
UD
1970 }
1971 return tree;
1972}
1973
1974/* This function build the following tree, from regular expression
1975 <exp1><exp2>:
15a7d175
UD
1976 CAT
1977 / \
3b0bdc72
UD
1978 / \
1979 <exp1> <exp2>
1980
1981 CAT means concatenation. */
1982
1983static bin_tree_t *
1984parse_branch (regexp, preg, token, syntax, nest, err)
1985 re_string_t *regexp;
1986 regex_t *preg;
1987 re_token_t *token;
1988 reg_syntax_t syntax;
1989 int nest;
1990 reg_errcode_t *err;
1991{
1992 bin_tree_t *tree, *exp;
1993 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 1994 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1995 return NULL;
1996
1997 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 1998 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72
UD
1999 {
2000 exp = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2001 if (BE (*err != REG_NOERROR && exp == NULL, 0))
15a7d175
UD
2002 {
2003 free_bin_tree (tree);
2004 return NULL;
2005 }
3b0bdc72 2006 if (tree != NULL && exp != NULL)
15a7d175
UD
2007 {
2008 tree = create_tree (tree, exp, CONCAT, 0);
2009 if (tree == NULL)
2010 {
2011 *err = REG_ESPACE;
2012 return NULL;
2013 }
2014 }
3b0bdc72 2015 else if (tree == NULL)
15a7d175 2016 tree = exp;
3b0bdc72
UD
2017 /* Otherwise exp == NULL, we don't need to create new tree. */
2018 }
2019 return tree;
2020}
2021
2022/* This function build the following tree, from regular expression a*:
15a7d175
UD
2023 *
2024 |
2025 a
3b0bdc72
UD
2026*/
2027
2028static bin_tree_t *
2029parse_expression (regexp, preg, token, syntax, nest, err)
2030 re_string_t *regexp;
2031 regex_t *preg;
2032 re_token_t *token;
2033 reg_syntax_t syntax;
2034 int nest;
2035 reg_errcode_t *err;
2036{
2037 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2038 bin_tree_t *tree;
2039 int new_idx;
2040 switch (token->type)
2041 {
2042 case CHARACTER:
2043 new_idx = re_dfa_add_node (dfa, *token, 0);
2044 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2045 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2046 {
2047 *err = REG_ESPACE;
2048 return NULL;
2049 }
3b0bdc72 2050#ifdef RE_ENABLE_I18N
3c0fb574 2051 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2052 {
2053 while (!re_string_eoi (regexp)
2054 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2055 {
15a7d175
UD
2056 bin_tree_t *mbc_remain;
2057 *token = fetch_token (regexp, syntax);
2058 new_idx = re_dfa_add_node (dfa, *token, 0);
2059 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
2060 tree = create_tree (tree, mbc_remain, CONCAT, 0);
2061 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
54e1cabc
UD
2062 {
2063 *err = REG_ESPACE;
2064 return NULL;
2065 }
15a7d175 2066 }
3b0bdc72
UD
2067 }
2068#endif
2069 break;
2070 case OP_OPEN_SUBEXP:
2071 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 2072 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2073 return NULL;
3b0bdc72
UD
2074 break;
2075 case OP_OPEN_BRACKET:
2076 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 2077 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2078 return NULL;
3b0bdc72
UD
2079 break;
2080 case OP_BACK_REF:
bc15410e 2081 if (BE (preg->re_nsub < token->opr.idx
15a7d175
UD
2082 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2083 {
2084 *err = REG_ESUBREG;
2085 return NULL;
2086 }
6291ee3c 2087 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
3b0bdc72
UD
2088 new_idx = re_dfa_add_node (dfa, *token, 0);
2089 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2090 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2091 {
2092 *err = REG_ESPACE;
2093 return NULL;
2094 }
3b0bdc72
UD
2095 ++dfa->nbackref;
2096 dfa->has_mb_node = 1;
2097 break;
06e8303a
UD
2098 case OP_OPEN_DUP_NUM:
2099 if (syntax & RE_CONTEXT_INVALID_DUP)
2100 {
2101 *err = REG_BADRPT;
2102 return NULL;
2103 }
2104 /* FALLTHROUGH */
3b0bdc72
UD
2105 case OP_DUP_ASTERISK:
2106 case OP_DUP_PLUS:
2107 case OP_DUP_QUESTION:
3b0bdc72 2108 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2109 {
2110 *err = REG_BADRPT;
2111 return NULL;
2112 }
3b0bdc72 2113 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175
UD
2114 {
2115 *token = fetch_token (regexp, syntax);
2116 return parse_expression (regexp, preg, token, syntax, nest, err);
2117 }
3b0bdc72
UD
2118 /* else fall through */
2119 case OP_CLOSE_SUBEXP:
2120 if ((token->type == OP_CLOSE_SUBEXP) &&
15a7d175
UD
2121 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2122 {
2123 *err = REG_ERPAREN;
2124 return NULL;
2125 }
3b0bdc72
UD
2126 /* else fall through */
2127 case OP_CLOSE_DUP_NUM:
2128 /* We treat it as a normal character. */
2129
2130 /* Then we can these characters as normal characters. */
2131 token->type = CHARACTER;
2132 new_idx = re_dfa_add_node (dfa, *token, 0);
2133 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2134 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2135 {
2136 *err = REG_ESPACE;
2137 return NULL;
2138 }
3b0bdc72
UD
2139 break;
2140 case ANCHOR:
2141 if (dfa->word_char == NULL)
15a7d175
UD
2142 {
2143 *err = init_word_char (dfa);
2144 if (BE (*err != REG_NOERROR, 0))
2145 return NULL;
2146 }
3b0bdc72 2147 if (token->opr.ctx_type == WORD_DELIM)
15a7d175
UD
2148 {
2149 bin_tree_t *tree_first, *tree_last;
2150 int idx_first, idx_last;
2151 token->opr.ctx_type = WORD_FIRST;
2152 idx_first = re_dfa_add_node (dfa, *token, 0);
2153 tree_first = create_tree (NULL, NULL, 0, idx_first);
2154 token->opr.ctx_type = WORD_LAST;
2155 idx_last = re_dfa_add_node (dfa, *token, 0);
2156 tree_last = create_tree (NULL, NULL, 0, idx_last);
2157 token->type = OP_ALT;
2158 new_idx = re_dfa_add_node (dfa, *token, 0);
2159 tree = create_tree (tree_first, tree_last, 0, new_idx);
2160 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2161 || tree_first == NULL || tree_last == NULL
2162 || tree == NULL, 0))
2163 {
2164 *err = REG_ESPACE;
2165 return NULL;
2166 }
2167 }
3b0bdc72 2168 else
15a7d175
UD
2169 {
2170 new_idx = re_dfa_add_node (dfa, *token, 0);
2171 tree = create_tree (NULL, NULL, 0, new_idx);
2172 if (BE (new_idx == -1 || tree == NULL, 0))
54e1cabc
UD
2173 {
2174 *err = REG_ESPACE;
2175 return NULL;
2176 }
15a7d175 2177 }
3b0bdc72 2178 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2179 by repetition operators.
2180 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2181 it must not be "<ANCHOR(^)><REPEAT(*)>". */
3b0bdc72
UD
2182 *token = fetch_token (regexp, syntax);
2183 return tree;
2184 case OP_PERIOD:
2185 new_idx = re_dfa_add_node (dfa, *token, 0);
2186 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2187 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2188 {
2189 *err = REG_ESPACE;
2190 return NULL;
2191 }
3c0fb574 2192 if (dfa->mb_cur_max > 1)
15a7d175 2193 dfa->has_mb_node = 1;
3b0bdc72
UD
2194 break;
2195 case OP_WORD:
e2b6bfa3 2196 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
bc15410e 2197 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2198 return NULL;
3b0bdc72
UD
2199 break;
2200 case OP_NOTWORD:
e2b6bfa3
UD
2201 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2202 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2203 return NULL;
2204 break;
2205 case OP_SPACE:
2206 tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2207 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2208 return NULL;
2209 break;
2210 case OP_NOTSPACE:
2211 tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
bc15410e 2212 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2213 return NULL;
3b0bdc72
UD
2214 break;
2215 case OP_ALT:
2216 case END_OF_RE:
2217 return NULL;
2218 case BACK_SLASH:
2219 *err = REG_EESCAPE;
2220 return NULL;
2221 default:
2222 /* Must not happen? */
2223#ifdef DEBUG
2224 assert (0);
2225#endif
2226 return NULL;
2227 }
2228 *token = fetch_token (regexp, syntax);
2229
2230 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2231 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72
UD
2232 {
2233 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
bc15410e 2234 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2235 return NULL;
c34bfc8d
UD
2236 /* In BRE consecutive duplications are not allowed. */
2237 if ((syntax & RE_CONTEXT_INVALID_DUP)
2238 && (token->type == OP_DUP_ASTERISK
2239 || token->type == OP_OPEN_DUP_NUM))
2240 {
2241 *err = REG_BADRPT;
2242 return NULL;
2243 }
0742e48e 2244 dfa->has_plural_match = 1;
3b0bdc72
UD
2245 }
2246
2247 return tree;
2248}
2249
2250/* This function build the following tree, from regular expression
2251 (<reg_exp>):
15a7d175
UD
2252 SUBEXP
2253 |
2254 <reg_exp>
3b0bdc72
UD
2255*/
2256
2257static bin_tree_t *
2258parse_sub_exp (regexp, preg, token, syntax, nest, err)
2259 re_string_t *regexp;
2260 regex_t *preg;
2261 re_token_t *token;
2262 reg_syntax_t syntax;
2263 int nest;
2264 reg_errcode_t *err;
2265{
2266 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
81c64d40 2267 bin_tree_t *tree, *left_par, *right_par;
3b0bdc72 2268 size_t cur_nsub;
81c64d40 2269 int new_idx;
3b0bdc72
UD
2270 cur_nsub = preg->re_nsub++;
2271 if (dfa->subexps_alloc < preg->re_nsub)
2272 {
2273 re_subexp_t *new_array;
2274 dfa->subexps_alloc *= 2;
2275 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
bc15410e 2276 if (BE (new_array == NULL, 0))
3b0bdc72
UD
2277 {
2278 dfa->subexps_alloc /= 2;
2279 *err = REG_ESPACE;
2280 return NULL;
2281 }
2282 dfa->subexps = new_array;
2283 }
2284 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2285 dfa->subexps[cur_nsub].end = -1;
81c64d40
UD
2286
2287 new_idx = re_dfa_add_node (dfa, *token, 0);
2288 left_par = create_tree (NULL, NULL, 0, new_idx);
2289 if (BE (new_idx == -1 || left_par == NULL, 0))
485d775d
UD
2290 {
2291 *err = REG_ESPACE;
2292 return NULL;
2293 }
81c64d40 2294 dfa->nodes[new_idx].opr.idx = cur_nsub;
134abcb5 2295 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2296
2297 /* The subexpression may be a null string. */
2298 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2299 tree = NULL;
3b0bdc72
UD
2300 else
2301 {
2302 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
bc15410e 2303 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2304 return NULL;
3b0bdc72 2305 }
81c64d40
UD
2306 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2307 {
2308 free_bin_tree (tree);
06e8303a 2309 *err = REG_EPAREN;
81c64d40
UD
2310 return NULL;
2311 }
2312 new_idx = re_dfa_add_node (dfa, *token, 0);
2313 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2314 right_par = create_tree (NULL, NULL, 0, new_idx);
2315 tree = ((tree == NULL) ? right_par
15a7d175 2316 : create_tree (tree, right_par, CONCAT, 0));
81c64d40
UD
2317 tree = create_tree (left_par, tree, CONCAT, 0);
2318 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
485d775d
UD
2319 {
2320 *err = REG_ESPACE;
2321 return NULL;
2322 }
81c64d40
UD
2323 dfa->nodes[new_idx].opr.idx = cur_nsub;
2324
3b0bdc72
UD
2325 return tree;
2326}
2327
2328/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2329
2330static bin_tree_t *
2331parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2332 bin_tree_t *dup_elem;
2333 re_string_t *regexp;
2334 re_dfa_t *dfa;
2335 re_token_t *token;
2336 reg_syntax_t syntax;
2337 reg_errcode_t *err;
2338{
2339 re_token_t dup_token;
2340 bin_tree_t *tree = dup_elem, *work_tree;
2341 int new_idx, start_idx = re_string_cur_idx (regexp);
2342 re_token_t start_token = *token;
2343 if (token->type == OP_OPEN_DUP_NUM)
2344 {
602c2f9d
UD
2345 int i;
2346 int end = 0;
2347 int start = fetch_number (regexp, token, syntax);
3b0bdc72
UD
2348 bin_tree_t *elem;
2349 if (start == -1)
15a7d175
UD
2350 {
2351 if (token->type == CHARACTER && token->opr.c == ',')
2352 start = 0; /* We treat "{,m}" as "{0,m}". */
2353 else
2354 {
2355 *err = REG_BADBR; /* <re>{} is invalid. */
2356 return NULL;
2357 }
2358 }
602c2f9d 2359 if (BE (start != -2, 1))
15a7d175
UD
2360 {
2361 /* We treat "{n}" as "{n,n}". */
2362 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2363 : ((token->type == CHARACTER && token->opr.c == ',')
2364 ? fetch_number (regexp, token, syntax) : -2));
2365 }
602c2f9d 2366 if (BE (start == -2 || end == -2, 0))
15a7d175
UD
2367 {
2368 /* Invalid sequence. */
2369 if (token->type == OP_CLOSE_DUP_NUM)
2370 goto parse_dup_op_invalid_interval;
2371 else
2372 goto parse_dup_op_ebrace;
2373 }
602c2f9d 2374 if (BE (start == 0 && end == 0, 0))
15a7d175
UD
2375 {
2376 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2377 *token = fetch_token (regexp, syntax);
2378 free_bin_tree (dup_elem);
2379 return NULL;
2380 }
602c2f9d 2381
3b0bdc72
UD
2382 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2383 elem = tree;
2384 for (i = 0; i < start; ++i)
15a7d175
UD
2385 if (i != 0)
2386 {
2387 work_tree = duplicate_tree (elem, dfa);
2388 tree = create_tree (tree, work_tree, CONCAT, 0);
2389 if (BE (work_tree == NULL || tree == NULL, 0))
2390 goto parse_dup_op_espace;
2391 }
3b0bdc72
UD
2392
2393 if (end == -1)
15a7d175
UD
2394 {
2395 /* We treat "<re>{0,}" as "<re>*". */
2396 dup_token.type = OP_DUP_ASTERISK;
2397 if (start > 0)
2398 {
2399 elem = duplicate_tree (elem, dfa);
2400 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2401 work_tree = create_tree (elem, NULL, 0, new_idx);
2402 tree = create_tree (tree, work_tree, CONCAT, 0);
2403 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2404 || tree == NULL, 0))
2405 goto parse_dup_op_espace;
2406 }
2407 else
2408 {
2409 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2410 tree = create_tree (elem, NULL, 0, new_idx);
2411 if (BE (new_idx == -1 || tree == NULL, 0))
2412 goto parse_dup_op_espace;
2413 }
2414 }
deeb4aae
UD
2415 else if (BE (start > end, 0))
2416 {
2417 /* First number greater than first. */
2418 *err = REG_BADBR;
2419 return NULL;
2420 }
3b0bdc72 2421 else if (end - start > 0)
15a7d175
UD
2422 {
2423 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2424 dup_token.type = OP_DUP_QUESTION;
2425 if (start > 0)
2426 {
2427 elem = duplicate_tree (elem, dfa);
2428 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2429 elem = create_tree (elem, NULL, 0, new_idx);
2430 tree = create_tree (tree, elem, CONCAT, 0);
2431 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2432 goto parse_dup_op_espace;
2433 }
2434 else
2435 {
2436 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2437 tree = elem = create_tree (elem, NULL, 0, new_idx);
2438 if (BE (new_idx == -1 || tree == NULL, 0))
2439 goto parse_dup_op_espace;
2440 }
2441 for (i = 1; i < end - start; ++i)
2442 {
2443 work_tree = duplicate_tree (elem, dfa);
2444 tree = create_tree (tree, work_tree, CONCAT, 0);
2445 if (BE (work_tree == NULL || tree == NULL, 0))
2446 {
2447 *err = REG_ESPACE;
2448 return NULL;
2449 }
2450 }
2451 }
3b0bdc72
UD
2452 }
2453 else
2454 {
2455 new_idx = re_dfa_add_node (dfa, *token, 0);
2456 tree = create_tree (tree, NULL, 0, new_idx);
bc15410e 2457 if (BE (new_idx == -1 || tree == NULL, 0))
15a7d175
UD
2458 {
2459 *err = REG_ESPACE;
2460 return NULL;
2461 }
3b0bdc72
UD
2462 }
2463 *token = fetch_token (regexp, syntax);
2464 return tree;
2465
2466 parse_dup_op_espace:
2467 free_bin_tree (tree);
2468 *err = REG_ESPACE;
2469 return NULL;
2470
602c2f9d 2471 parse_dup_op_ebrace:
bc15410e 2472 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
3b0bdc72
UD
2473 {
2474 *err = REG_EBRACE;
2475 return NULL;
2476 }
602c2f9d
UD
2477 goto parse_dup_op_rollback;
2478 parse_dup_op_invalid_interval:
2479 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2480 {
2481 *err = REG_BADBR;
2482 return NULL;
2483 }
2484 parse_dup_op_rollback:
3b0bdc72
UD
2485 re_string_set_index (regexp, start_idx);
2486 *token = start_token;
2487 token->type = CHARACTER;
2488 return dup_elem;
2489}
2490
2491/* Size of the names for collating symbol/equivalence_class/character_class.
2492 I'm not sure, but maybe enough. */
2493#define BRACKET_NAME_BUF_SIZE 32
2494
434d3784
UD
2495#ifndef _LIBC
2496 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2497 Build the range expression which starts from START_ELEM, and ends
2498 at END_ELEM. The result are written to MBCSET and SBCSET.
2499 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2500 mbcset->range_ends, is a pointer argument sinse we may
2501 update it. */
2502
2503static reg_errcode_t
c0a0f9a3
UD
2504# ifdef RE_ENABLE_I18N
2505build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
434d3784 2506 re_charset_t *mbcset;
434d3784 2507 int *range_alloc;
c0a0f9a3
UD
2508# else /* not RE_ENABLE_I18N */
2509build_range_exp (sbcset, start_elem, end_elem)
2510# endif /* not RE_ENABLE_I18N */
2511 re_bitset_ptr_t sbcset;
434d3784
UD
2512 bracket_elem_t *start_elem, *end_elem;
2513{
2514 unsigned int start_ch, end_ch;
2515 /* Equivalence Classes and Character Classes can't be a range start/end. */
2516 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2517 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2518 0))
434d3784
UD
2519 return REG_ERANGE;
2520
2521 /* We can handle no multi character collating elements without libc
2522 support. */
62439eac 2523 if (BE ((start_elem->type == COLL_SYM
15a7d175
UD
2524 && strlen ((char *) start_elem->opr.name) > 1)
2525 || (end_elem->type == COLL_SYM
2526 && strlen ((char *) end_elem->opr.name) > 1), 0))
434d3784
UD
2527 return REG_ECOLLATE;
2528
2529# ifdef RE_ENABLE_I18N
2530 {
2531 wchar_t wc, start_wc, end_wc;
2532 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2533
2534 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2535 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2536 : 0));
434d3784 2537 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2538 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2539 : 0));
434d3784 2540 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
15a7d175 2541 ? __btowc (start_ch) : start_elem->opr.wch);
434d3784 2542 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
15a7d175 2543 ? __btowc (end_ch) : end_elem->opr.wch);
434d3784
UD
2544 cmp_buf[0] = start_wc;
2545 cmp_buf[4] = end_wc;
2546 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2547 return REG_ERANGE;
2548
2549 /* Check the space of the arrays. */
2550 if (*range_alloc == mbcset->nranges)
2551 {
15a7d175
UD
2552 /* There are not enough space, need realloc. */
2553 wchar_t *new_array_start, *new_array_end;
2554 int new_nranges;
2555
2556 /* +1 in case of mbcset->nranges is 0. */
2557 new_nranges = 2 * mbcset->nranges + 1;
2558 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2559 are NULL if *range_alloc == 0. */
2560 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2561 new_nranges);
2562 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2563 new_nranges);
2564
2565 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2566 return REG_ESPACE;
2567
2568 mbcset->range_starts = new_array_start;
2569 mbcset->range_ends = new_array_end;
2570 *range_alloc = new_nranges;
434d3784
UD
2571 }
2572
2573 mbcset->range_starts[mbcset->nranges] = start_wc;
2574 mbcset->range_ends[mbcset->nranges++] = end_wc;
2575
2576 /* Build the table for single byte characters. */
2577 for (wc = 0; wc <= SBC_MAX; ++wc)
2578 {
15a7d175
UD
2579 cmp_buf[2] = wc;
2580 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2581 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2582 bitset_set (sbcset, wc);
434d3784
UD
2583 }
2584 }
2585# else /* not RE_ENABLE_I18N */
2586 {
2587 unsigned int ch;
2588 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2589 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2590 : 0));
434d3784 2591 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2592 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2593 : 0));
434d3784
UD
2594 if (start_ch > end_ch)
2595 return REG_ERANGE;
2596 /* Build the table for single byte characters. */
2597 for (ch = 0; ch <= SBC_MAX; ++ch)
2598 if (start_ch <= ch && ch <= end_ch)
15a7d175 2599 bitset_set (sbcset, ch);
434d3784
UD
2600 }
2601# endif /* not RE_ENABLE_I18N */
2602 return REG_NOERROR;
2603}
2604#endif /* not _LIBC */
2605
2606#ifndef _LIBC
2607/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2608 Build the collating element which is represented by NAME.
2609 The result are written to MBCSET and SBCSET.
2610 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2611 pointer argument since we may update it. */
2612
2613static reg_errcode_t
c0a0f9a3
UD
2614# ifdef RE_ENABLE_I18N
2615build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
434d3784 2616 re_charset_t *mbcset;
434d3784 2617 int *coll_sym_alloc;
c0a0f9a3
UD
2618# else /* not RE_ENABLE_I18N */
2619build_collating_symbol (sbcset, name)
2620# endif /* not RE_ENABLE_I18N */
2621 re_bitset_ptr_t sbcset;
62439eac 2622 const unsigned char *name;
434d3784 2623{
62439eac
UD
2624 size_t name_len = strlen ((const char *) name);
2625 if (BE (name_len != 1, 0))
434d3784
UD
2626 return REG_ECOLLATE;
2627 else
2628 {
62439eac 2629 bitset_set (sbcset, name[0]);
434d3784
UD
2630 return REG_NOERROR;
2631 }
2632}
2633#endif /* not _LIBC */
2634
3b0bdc72
UD
2635/* This function parse bracket expression like "[abc]", "[a-c]",
2636 "[[.a-a.]]" etc. */
2637
2638static bin_tree_t *
2639parse_bracket_exp (regexp, dfa, token, syntax, err)
2640 re_string_t *regexp;
2641 re_dfa_t *dfa;
2642 re_token_t *token;
2643 reg_syntax_t syntax;
2644 reg_errcode_t *err;
2645{
2646#ifdef _LIBC
62439eac
UD
2647 const unsigned char *collseqmb;
2648 const char *collseqwc;
3b0bdc72
UD
2649 uint32_t nrules;
2650 int32_t table_size;
2651 const int32_t *symb_table;
2652 const unsigned char *extra;
2653
434d3784 2654 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2655 Seek the collating symbol entry correspondings to NAME.
2656 Return the index of the symbol in the SYMB_TABLE. */
2657
2658 static inline int32_t
25337753 2659 __attribute ((always_inline))
3b0bdc72 2660 seek_collating_symbol_entry (name, name_len)
15a7d175
UD
2661 const unsigned char *name;
2662 size_t name_len;
3b0bdc72 2663 {
62439eac 2664 int32_t hash = elem_hash ((const char *) name, name_len);
3b0bdc72
UD
2665 int32_t elem = hash % table_size;
2666 int32_t second = hash % (table_size - 2);
2667 while (symb_table[2 * elem] != 0)
15a7d175
UD
2668 {
2669 /* First compare the hashing value. */
2670 if (symb_table[2 * elem] == hash
2671 /* Compare the length of the name. */
2672 && name_len == extra[symb_table[2 * elem + 1]]
2673 /* Compare the name. */
2674 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2675 name_len) == 0)
2676 {
2677 /* Yep, this is the entry. */
2678 break;
2679 }
2680
2681 /* Next entry. */
2682 elem += second;
2683 }
3b0bdc72
UD
2684 return elem;
2685 }
2686
434d3784 2687 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2688 Look up the collation sequence value of BR_ELEM.
2689 Return the value if succeeded, UINT_MAX otherwise. */
2690
2691 static inline unsigned int
25337753 2692 __attribute ((always_inline))
3b0bdc72 2693 lookup_collation_sequence_value (br_elem)
15a7d175 2694 bracket_elem_t *br_elem;
3b0bdc72
UD
2695 {
2696 if (br_elem->type == SB_CHAR)
15a7d175
UD
2697 {
2698 /*
2699 if (MB_CUR_MAX == 1)
2700 */
2701 if (nrules == 0)
2702 return collseqmb[br_elem->opr.ch];
2703 else
2704 {
2705 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2706 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2707 }
2708 }
3b0bdc72 2709 else if (br_elem->type == MB_CHAR)
15a7d175 2710 {
25337753 2711 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2712 }
3b0bdc72 2713 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2714 {
2715 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2716 if (nrules != 0)
2717 {
2718 int32_t elem, idx;
2719 elem = seek_collating_symbol_entry (br_elem->opr.name,
2720 sym_name_len);
2721 if (symb_table[2 * elem] != 0)
2722 {
2723 /* We found the entry. */
2724 idx = symb_table[2 * elem + 1];
2725 /* Skip the name of collating element name. */
2726 idx += 1 + extra[idx];
2727 /* Skip the byte sequence of the collating element. */
2728 idx += 1 + extra[idx];
2729 /* Adjust for the alignment. */
2730 idx = (idx + 3) & ~3;
2731 /* Skip the multibyte collation sequence value. */
2732 idx += sizeof (unsigned int);
2733 /* Skip the wide char sequence of the collating element. */
2734 idx += sizeof (unsigned int) *
2735 (1 + *(unsigned int *) (extra + idx));
2736 /* Return the collation sequence value. */
2737 return *(unsigned int *) (extra + idx);
2738 }
2739 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2740 {
2741 /* No valid character. Match it as a single byte
2742 character. */
2743 return collseqmb[br_elem->opr.name[0]];
2744 }
2745 }
2746 else if (sym_name_len == 1)
2747 return collseqmb[br_elem->opr.name[0]];
2748 }
3b0bdc72
UD
2749 return UINT_MAX;
2750 }
2751
434d3784 2752 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2753 Build the range expression which starts from START_ELEM, and ends
2754 at END_ELEM. The result are written to MBCSET and SBCSET.
2755 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2756 mbcset->range_ends, is a pointer argument sinse we may
2757 update it. */
2758
2759 static inline reg_errcode_t
25337753 2760 __attribute ((always_inline))
c0a0f9a3
UD
2761# ifdef RE_ENABLE_I18N
2762 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
15a7d175
UD
2763 re_charset_t *mbcset;
2764 int *range_alloc;
c0a0f9a3
UD
2765# else /* not RE_ENABLE_I18N */
2766 build_range_exp (sbcset, start_elem, end_elem)
2767# endif /* not RE_ENABLE_I18N */
15a7d175
UD
2768 re_bitset_ptr_t sbcset;
2769 bracket_elem_t *start_elem, *end_elem;
3b0bdc72
UD
2770 {
2771 unsigned int ch;
2772 uint32_t start_collseq;
2773 uint32_t end_collseq;
2774
c0a0f9a3 2775# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2776 /* Check the space of the arrays. */
2777 if (*range_alloc == mbcset->nranges)
15a7d175
UD
2778 {
2779 /* There are not enough space, need realloc. */
2780 uint32_t *new_array_start;
2781 uint32_t *new_array_end;
3b0bdc72
UD
2782 int new_nranges;
2783
15a7d175
UD
2784 /* +1 in case of mbcset->nranges is 0. */
2785 new_nranges = 2 * mbcset->nranges + 1;
a9388965 2786 /* Use realloc since mbcset->range_starts and mbcset->range_ends
15a7d175
UD
2787 are NULL if *range_alloc == 0. */
2788 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2789 new_nranges);
2790 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2791 new_nranges);
a9388965 2792
15a7d175
UD
2793 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2794 return REG_ESPACE;
3b0bdc72 2795
15a7d175
UD
2796 mbcset->range_starts = new_array_start;
2797 mbcset->range_ends = new_array_end;
3b0bdc72 2798 *range_alloc = new_nranges;
15a7d175 2799 }
c0a0f9a3 2800# endif /* RE_ENABLE_I18N */
3b0bdc72 2801
434d3784 2802 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2803 start/end. */
bc15410e 2804 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2805 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2806 0))
2807 return REG_ERANGE;
3b0bdc72
UD
2808
2809 start_collseq = lookup_collation_sequence_value (start_elem);
2810 end_collseq = lookup_collation_sequence_value (end_elem);
2811 /* Check start/end collation sequence values. */
bc15410e 2812 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
15a7d175 2813 return REG_ECOLLATE;
bc15410e 2814 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
15a7d175 2815 return REG_ERANGE;
3b0bdc72 2816
c0a0f9a3 2817# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2818 /* Got valid collation sequence values, add them as a new entry. */
2819 mbcset->range_starts[mbcset->nranges] = start_collseq;
2820 mbcset->range_ends[mbcset->nranges++] = end_collseq;
c0a0f9a3 2821# endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2822
2823 /* Build the table for single byte characters. */
2824 for (ch = 0; ch <= SBC_MAX; ch++)
15a7d175
UD
2825 {
2826 uint32_t ch_collseq;
2827 /*
2828 if (MB_CUR_MAX == 1)
2829 */
2830 if (nrules == 0)
2831 ch_collseq = collseqmb[ch];
2832 else
25337753 2833 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
2834 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2835 bitset_set (sbcset, ch);
2836 }
3b0bdc72
UD
2837 return REG_NOERROR;
2838 }
3b0bdc72 2839
434d3784 2840 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2841 Build the collating element which is represented by NAME.
2842 The result are written to MBCSET and SBCSET.
2843 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2844 pointer argument sinse we may update it. */
2845
2846 static inline reg_errcode_t
25337753 2847 __attribute ((always_inline))
c0a0f9a3
UD
2848# ifdef RE_ENABLE_I18N
2849 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
15a7d175
UD
2850 re_charset_t *mbcset;
2851 int *coll_sym_alloc;
c0a0f9a3
UD
2852# else /* not RE_ENABLE_I18N */
2853 build_collating_symbol (sbcset, name)
2854# endif /* not RE_ENABLE_I18N */
15a7d175
UD
2855 re_bitset_ptr_t sbcset;
2856 const unsigned char *name;
3b0bdc72 2857 {
3b0bdc72 2858 int32_t elem, idx;
62439eac 2859 size_t name_len = strlen ((const char *) name);
3b0bdc72 2860 if (nrules != 0)
15a7d175
UD
2861 {
2862 elem = seek_collating_symbol_entry (name, name_len);
2863 if (symb_table[2 * elem] != 0)
2864 {
2865 /* We found the entry. */
2866 idx = symb_table[2 * elem + 1];
2867 /* Skip the name of collating element name. */
2868 idx += 1 + extra[idx];
2869 }
2870 else if (symb_table[2 * elem] == 0 && name_len == 1)
2871 {
2872 /* No valid character, treat it as a normal
2873 character. */
2874 bitset_set (sbcset, name[0]);
2875 return REG_NOERROR;
2876 }
2877 else
2878 return REG_ECOLLATE;
3b0bdc72 2879
c0a0f9a3 2880# ifdef RE_ENABLE_I18N
15a7d175
UD
2881 /* Got valid collation sequence, add it as a new entry. */
2882 /* Check the space of the arrays. */
2883 if (*coll_sym_alloc == mbcset->ncoll_syms)
2884 {
2885 /* Not enough, realloc it. */
2886 /* +1 in case of mbcset->ncoll_syms is 0. */
2887 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2888 /* Use realloc since mbcset->coll_syms is NULL
2889 if *alloc == 0. */
2890 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2891 *coll_sym_alloc);
2892 if (BE (mbcset->coll_syms == NULL, 0))
2893 return REG_ESPACE;
2894 }
2895 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
c0a0f9a3 2896# endif /* RE_ENABLE_I18N */
15a7d175
UD
2897 return REG_NOERROR;
2898 }
3b0bdc72 2899 else
15a7d175
UD
2900 {
2901 if (BE (name_len != 1, 0))
2902 return REG_ECOLLATE;
2903 else
2904 {
2905 bitset_set (sbcset, name[0]);
2906 return REG_NOERROR;
2907 }
2908 }
3b0bdc72 2909 }
434d3784
UD
2910#endif
2911
3b0bdc72
UD
2912 re_token_t br_token;
2913 re_bitset_ptr_t sbcset;
c0a0f9a3 2914#ifdef RE_ENABLE_I18N
3b0bdc72 2915 re_charset_t *mbcset;
3b0bdc72
UD
2916 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2917 int equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3
UD
2918#else /* not RE_ENABLE_I18N */
2919 int non_match = 0;
2920#endif /* not RE_ENABLE_I18N */
2921 bin_tree_t *work_tree;
2922 int token_len, new_idx;
3b0bdc72 2923#ifdef _LIBC
62439eac
UD
2924 collseqmb = (const unsigned char *)
2925 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
2926 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2927 if (nrules)
2928 {
2929 /*
2930 if (MB_CUR_MAX > 1)
2931 */
15a7d175 2932 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
2933 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2934 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 2935 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 2936 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 2937 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
2938 }
2939#endif
2940 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 2941#ifdef RE_ENABLE_I18N
3b0bdc72 2942 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
2943#endif /* RE_ENABLE_I18N */
2944#ifdef RE_ENABLE_I18N
bc15410e 2945 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
2946#else
2947 if (BE (sbcset == NULL, 0))
2948#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2949 {
2950 *err = REG_ESPACE;
2951 return NULL;
2952 }
2953
2954 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2955 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2956 {
3b0bdc72 2957 *err = REG_BADPAT;
434d3784 2958 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2959 }
2960 if (token->type == OP_NON_MATCH_LIST)
2961 {
c0a0f9a3 2962#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2963 int i;
2964 mbcset->non_match = 1;
c0a0f9a3
UD
2965#else /* not RE_ENABLE_I18N */
2966 non_match = 1;
2967#endif /* not RE_ENABLE_I18N */
3b0bdc72 2968 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 2969 bitset_set (sbcset, '\0');
3b0bdc72
UD
2970 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2971 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2972 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
2973 {
2974 *err = REG_BADPAT;
2975 goto parse_bracket_exp_free_return;
2976 }
c0a0f9a3 2977#ifdef RE_ENABLE_I18N
3c0fb574 2978 if (dfa->mb_cur_max > 1)
15a7d175
UD
2979 for (i = 0; i < SBC_MAX; ++i)
2980 if (__btowc (i) == WEOF)
2981 bitset_set (sbcset, i);
c0a0f9a3 2982#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2983 }
2984
2985 /* We treat the first ']' as a normal character. */
2986 if (token->type == OP_CLOSE_BRACKET)
2987 token->type = CHARACTER;
2988
2989 while (1)
2990 {
2991 bracket_elem_t start_elem, end_elem;
62439eac
UD
2992 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2993 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72
UD
2994 reg_errcode_t ret;
2995 int token_len2 = 0, is_range_exp = 0;
2996 re_token_t token2;
2997
2998 start_elem.opr.name = start_name_buf;
2999 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
15a7d175 3000 syntax);
bc15410e 3001 if (BE (ret != REG_NOERROR, 0))
15a7d175
UD
3002 {
3003 *err = ret;
3004 goto parse_bracket_exp_free_return;
3005 }
3b0bdc72
UD
3006
3007 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 3008 if (BE (token->type == END_OF_RE, 0))
15a7d175 3009 {
f6953f16 3010 *err = REG_EBRACK;
15a7d175
UD
3011 goto parse_bracket_exp_free_return;
3012 }
3b0bdc72 3013 if (token->type == OP_CHARSET_RANGE)
15a7d175
UD
3014 {
3015 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3016 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3017 if (BE (token->type == END_OF_RE, 0))
3018 {
3019 *err = REG_BADPAT;
3020 goto parse_bracket_exp_free_return;
3021 }
3022 if (token2.type == OP_CLOSE_BRACKET)
3023 {
3024 /* We treat the last '-' as a normal character. */
3025 re_string_skip_bytes (regexp, -token_len);
3026 token->type = CHARACTER;
3027 }
3028 else
3029 is_range_exp = 1;
3030 }
3b0bdc72
UD
3031
3032 if (is_range_exp == 1)
15a7d175
UD
3033 {
3034 end_elem.opr.name = end_name_buf;
3035 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3036 dfa, syntax);
3037 if (BE (ret != REG_NOERROR, 0))
3038 {
3039 *err = ret;
3040 goto parse_bracket_exp_free_return;
3041 }
3042
3043 token_len = peek_token_bracket (token, regexp, syntax);
3044 if (BE (token->type == END_OF_RE, 0))
3045 {
3046 *err = REG_BADPAT;
3047 goto parse_bracket_exp_free_return;
3048 }
3049 *err = build_range_exp (sbcset,
c0a0f9a3 3050#ifdef RE_ENABLE_I18N
15a7d175 3051 mbcset, &range_alloc,
c0a0f9a3 3052#endif /* RE_ENABLE_I18N */
15a7d175
UD
3053 &start_elem, &end_elem);
3054 if (BE (*err != REG_NOERROR, 0))
3055 goto parse_bracket_exp_free_return;
3056 }
3b0bdc72 3057 else
15a7d175
UD
3058 {
3059 switch (start_elem.type)
3060 {
3061 case SB_CHAR:
3062 bitset_set (sbcset, start_elem.opr.ch);
3063 break;
c0a0f9a3 3064#ifdef RE_ENABLE_I18N
15a7d175
UD
3065 case MB_CHAR:
3066 /* Check whether the array has enough space. */
3067 if (mbchar_alloc == mbcset->nmbchars)
3068 {
3069 /* Not enough, realloc it. */
3070 /* +1 in case of mbcset->nmbchars is 0. */
3071 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3072 /* Use realloc since array is NULL if *alloc == 0. */
3073 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
3074 mbchar_alloc);
3075 if (BE (mbcset->mbchars == NULL, 0))
3076 goto parse_bracket_exp_espace;
3077 }
3078 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3079 break;
c0a0f9a3 3080#endif /* RE_ENABLE_I18N */
15a7d175
UD
3081 case EQUIV_CLASS:
3082 *err = build_equiv_class (sbcset,
c0a0f9a3 3083#ifdef RE_ENABLE_I18N
15a7d175 3084 mbcset, &equiv_class_alloc,
c0a0f9a3 3085#endif /* RE_ENABLE_I18N */
3b0bdc72 3086 start_elem.opr.name);
15a7d175
UD
3087 if (BE (*err != REG_NOERROR, 0))
3088 goto parse_bracket_exp_free_return;
3089 break;
3090 case COLL_SYM:
3091 *err = build_collating_symbol (sbcset,
c0a0f9a3 3092#ifdef RE_ENABLE_I18N
15a7d175 3093 mbcset, &coll_sym_alloc,
c0a0f9a3 3094#endif /* RE_ENABLE_I18N */
3b0bdc72 3095 start_elem.opr.name);
15a7d175
UD
3096 if (BE (*err != REG_NOERROR, 0))
3097 goto parse_bracket_exp_free_return;
3098 break;
3099 case CHAR_CLASS:
66b110e8 3100 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3101#ifdef RE_ENABLE_I18N
7b7b9e70 3102 mbcset, &char_class_alloc,
c0a0f9a3 3103#endif /* RE_ENABLE_I18N */
7b7b9e70
UD
3104 start_elem.opr.name, syntax);
3105 if (BE (*err != REG_NOERROR, 0))
3106 goto parse_bracket_exp_free_return;
15a7d175
UD
3107 break;
3108 default:
3109 assert (0);
3110 break;
3111 }
3112 }
3b0bdc72 3113 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3114 break;
3b0bdc72
UD
3115 }
3116
3117 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3118
3119 /* If it is non-matching list. */
c0a0f9a3 3120#ifdef RE_ENABLE_I18N
3b0bdc72 3121 if (mbcset->non_match)
c0a0f9a3
UD
3122#else /* not RE_ENABLE_I18N */
3123 if (non_match)
3124#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3125 bitset_not (sbcset);
3126
3127 /* Build a tree for simple bracket. */
3128 br_token.type = SIMPLE_BRACKET;
3129 br_token.opr.sbcset = sbcset;
3130 new_idx = re_dfa_add_node (dfa, br_token, 0);
3131 work_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3132 if (BE (new_idx == -1 || work_tree == NULL, 0))
3b0bdc72
UD
3133 goto parse_bracket_exp_espace;
3134
c0a0f9a3 3135#ifdef RE_ENABLE_I18N
3b0bdc72 3136 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3137 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3138 || mbcset->non_match)))
3b0bdc72
UD
3139 {
3140 re_token_t alt_token;
3141 bin_tree_t *mbc_tree;
3142 /* Build a tree for complex bracket. */
3143 br_token.type = COMPLEX_BRACKET;
3144 br_token.opr.mbcset = mbcset;
3145 dfa->has_mb_node = 1;
3146 new_idx = re_dfa_add_node (dfa, br_token, 0);
3147 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3148 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
15a7d175 3149 goto parse_bracket_exp_espace;
3b0bdc72 3150 /* Then join them by ALT node. */
0742e48e 3151 dfa->has_plural_match = 1;
3b0bdc72
UD
3152 alt_token.type = OP_ALT;
3153 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3154 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
bc15410e 3155 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
15a7d175 3156 return work_tree;
3b0bdc72
UD
3157 }
3158 else
3159 {
3160 free_charset (mbcset);
3161 return work_tree;
3162 }
c0a0f9a3
UD
3163#else /* not RE_ENABLE_I18N */
3164 return work_tree;
3165#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3166
3167 parse_bracket_exp_espace:
3b0bdc72 3168 *err = REG_ESPACE;
434d3784
UD
3169 parse_bracket_exp_free_return:
3170 re_free (sbcset);
c0a0f9a3 3171#ifdef RE_ENABLE_I18N
434d3784 3172 free_charset (mbcset);
c0a0f9a3 3173#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3174 return NULL;
3175}
3176
434d3784
UD
3177/* Parse an element in the bracket expression. */
3178
3b0bdc72
UD
3179static reg_errcode_t
3180parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3181 bracket_elem_t *elem;
3182 re_string_t *regexp;
3183 re_token_t *token;
3184 int token_len;
3185 re_dfa_t *dfa;
3186 reg_syntax_t syntax;
3187{
3188#ifdef RE_ENABLE_I18N
3189 int cur_char_size;
3190 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3191 if (cur_char_size > 1)
3192 {
3193 elem->type = MB_CHAR;
3194 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3195 re_string_skip_bytes (regexp, cur_char_size);
3196 return REG_NOERROR;
3197 }
3198#endif /* RE_ENABLE_I18N */
3199 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3200 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3201 || token->type == OP_OPEN_EQUIV_CLASS)
3202 return parse_bracket_symbol (elem, regexp, token);
3203 elem->type = SB_CHAR;
3204 elem->opr.ch = token->opr.c;
3205 return REG_NOERROR;
3206}
3207
434d3784
UD
3208/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3209 such as [:<character_class>:], [.<collating_element>.], and
3210 [=<equivalent_class>=]. */
3211
3b0bdc72
UD
3212static reg_errcode_t
3213parse_bracket_symbol (elem, regexp, token)
3214 bracket_elem_t *elem;
3215 re_string_t *regexp;
3216 re_token_t *token;
3217{
3218 unsigned char ch, delim = token->opr.c;
3219 int i = 0;
602c2f9d 3220 for (;; ++i)
3b0bdc72 3221 {
602c2f9d 3222 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3223 return REG_EBRACK;
3b0bdc72 3224 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3225 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3226 else
15a7d175 3227 ch = re_string_fetch_byte (regexp);
3b0bdc72 3228 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3229 break;
3b0bdc72
UD
3230 elem->opr.name[i] = ch;
3231 }
3232 re_string_skip_bytes (regexp, 1);
3233 elem->opr.name[i] = '\0';
3234 switch (token->type)
3235 {
3236 case OP_OPEN_COLL_ELEM:
3237 elem->type = COLL_SYM;
3238 break;
3239 case OP_OPEN_EQUIV_CLASS:
3240 elem->type = EQUIV_CLASS;
3241 break;
3242 case OP_OPEN_CHAR_CLASS:
3243 elem->type = CHAR_CLASS;
3244 break;
3245 default:
3246 break;
3247 }
3248 return REG_NOERROR;
3249}
3250
3251 /* Helper function for parse_bracket_exp.
3252 Build the equivalence class which is represented by NAME.
3253 The result are written to MBCSET and SBCSET.
3254 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3255 is a pointer argument sinse we may update it. */
3256
3257static reg_errcode_t
c0a0f9a3
UD
3258#ifdef RE_ENABLE_I18N
3259build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3b0bdc72 3260 re_charset_t *mbcset;
3b0bdc72 3261 int *equiv_class_alloc;
c0a0f9a3
UD
3262#else /* not RE_ENABLE_I18N */
3263build_equiv_class (sbcset, name)
3264#endif /* not RE_ENABLE_I18N */
3265 re_bitset_ptr_t sbcset;
62439eac 3266 const unsigned char *name;
3b0bdc72 3267{
c0a0f9a3 3268#if defined _LIBC && defined RE_ENABLE_I18N
3b0bdc72
UD
3269 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3270 if (nrules != 0)
3271 {
3272 const int32_t *table, *indirect;
3273 const unsigned char *weights, *extra, *cp;
62439eac 3274 unsigned char char_buf[2];
3b0bdc72
UD
3275 int32_t idx1, idx2;
3276 unsigned int ch;
3277 size_t len;
3278 /* This #include defines a local function! */
3279# include <locale/weight.h>
3280 /* Calculate the index for equivalence class. */
62439eac 3281 cp = name;
3b0bdc72
UD
3282 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3283 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3284 _NL_COLLATE_WEIGHTMB);
3285 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3286 _NL_COLLATE_EXTRAMB);
3b0bdc72 3287 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3288 _NL_COLLATE_INDIRECTMB);
3b0bdc72 3289 idx1 = findidx (&cp);
62439eac 3290 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
15a7d175
UD
3291 /* This isn't a valid character. */
3292 return REG_ECOLLATE;
3b0bdc72
UD
3293
3294 /* Build single byte matcing table for this equivalence class. */
62439eac 3295 char_buf[1] = (unsigned char) '\0';
3b0bdc72
UD
3296 len = weights[idx1];
3297 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3298 {
3299 char_buf[0] = ch;
3300 cp = char_buf;
3301 idx2 = findidx (&cp);
3b0bdc72 3302/*
15a7d175 3303 idx2 = table[ch];
3b0bdc72 3304*/
15a7d175
UD
3305 if (idx2 == 0)
3306 /* This isn't a valid character. */
3307 continue;
3308 if (len == weights[idx2])
3309 {
3310 int cnt = 0;
3311 while (cnt <= len &&
3312 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3313 ++cnt;
3314
3315 if (cnt > len)
3316 bitset_set (sbcset, ch);
3317 }
3318 }
a9388965
UD
3319 /* Check whether the array has enough space. */
3320 if (*equiv_class_alloc == mbcset->nequiv_classes)
15a7d175
UD
3321 {
3322 /* Not enough, realloc it. */
3323 /* +1 in case of mbcset->nequiv_classes is 0. */
3324 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3325 /* Use realloc since the array is NULL if *alloc == 0. */
3326 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3327 *equiv_class_alloc);
3328 if (BE (mbcset->equiv_classes == NULL, 0))
3329 return REG_ESPACE;
3330 }
3b0bdc72
UD
3331 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3332 }
3333 else
c0a0f9a3 3334#endif /* _LIBC && RE_ENABLE_I18N */
3b0bdc72 3335 {
62439eac 3336 if (BE (strlen ((const char *) name) != 1, 0))
15a7d175 3337 return REG_ECOLLATE;
62439eac 3338 bitset_set (sbcset, *name);
3b0bdc72
UD
3339 }
3340 return REG_NOERROR;
3341}
3342
3343 /* Helper function for parse_bracket_exp.
3344 Build the character class which is represented by NAME.
3345 The result are written to MBCSET and SBCSET.
3346 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3347 is a pointer argument sinse we may update it. */
3348
3349static reg_errcode_t
c0a0f9a3 3350#ifdef RE_ENABLE_I18N
66b110e8 3351build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3b0bdc72 3352 re_charset_t *mbcset;
3b0bdc72 3353 int *char_class_alloc;
c0a0f9a3 3354#else /* not RE_ENABLE_I18N */
66b110e8 3355build_charclass (trans, sbcset, class_name, syntax)
c0a0f9a3 3356#endif /* not RE_ENABLE_I18N */
66b110e8 3357 RE_TRANSLATE_TYPE trans;
c0a0f9a3 3358 re_bitset_ptr_t sbcset;
62439eac 3359 const unsigned char *class_name;
602c2f9d 3360 reg_syntax_t syntax;
3b0bdc72
UD
3361{
3362 int i;
62439eac 3363 const char *name = (const char *) class_name;
c0a0f9a3
UD
3364
3365 /* In case of REG_ICASE "upper" and "lower" match the both of
3366 upper and lower cases. */
3367 if ((syntax & RE_ICASE)
3368 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3369 name = "alpha";
3370
3371#ifdef RE_ENABLE_I18N
3b0bdc72 3372 /* Check the space of the arrays. */
a9388965
UD
3373 if (*char_class_alloc == mbcset->nchar_classes)
3374 {
3375 /* Not enough, realloc it. */
3376 /* +1 in case of mbcset->nchar_classes is 0. */
3377 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3378 /* Use realloc since array is NULL if *alloc == 0. */
3379 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
15a7d175 3380 *char_class_alloc);
bc15410e 3381 if (BE (mbcset->char_classes == NULL, 0))
15a7d175 3382 return REG_ESPACE;
a9388965 3383 }
3b0bdc72 3384 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3385#endif /* RE_ENABLE_I18N */
3b0bdc72 3386
66b110e8
UD
3387#define BUILD_CHARCLASS_LOOP(ctype_func) \
3388 for (i = 0; i < SBC_MAX; ++i) \
3389 { \
3390 if (ctype_func (i)) \
3391 { \
3392 int ch = trans ? trans[i] : i; \
3393 bitset_set (sbcset, ch); \
3394 } \
3b0bdc72
UD
3395 }
3396
3397 if (strcmp (name, "alnum") == 0)
3398 BUILD_CHARCLASS_LOOP (isalnum)
3399 else if (strcmp (name, "cntrl") == 0)
3400 BUILD_CHARCLASS_LOOP (iscntrl)
3401 else if (strcmp (name, "lower") == 0)
3402 BUILD_CHARCLASS_LOOP (islower)
3403 else if (strcmp (name, "space") == 0)
3404 BUILD_CHARCLASS_LOOP (isspace)
3405 else if (strcmp (name, "alpha") == 0)
3406 BUILD_CHARCLASS_LOOP (isalpha)
3407 else if (strcmp (name, "digit") == 0)
3408 BUILD_CHARCLASS_LOOP (isdigit)
3409 else if (strcmp (name, "print") == 0)
3410 BUILD_CHARCLASS_LOOP (isprint)
3411 else if (strcmp (name, "upper") == 0)
3412 BUILD_CHARCLASS_LOOP (isupper)
3413 else if (strcmp (name, "blank") == 0)
3414 BUILD_CHARCLASS_LOOP (isblank)
3415 else if (strcmp (name, "graph") == 0)
3416 BUILD_CHARCLASS_LOOP (isgraph)
3417 else if (strcmp (name, "punct") == 0)
3418 BUILD_CHARCLASS_LOOP (ispunct)
3419 else if (strcmp (name, "xdigit") == 0)
3420 BUILD_CHARCLASS_LOOP (isxdigit)
3421 else
3422 return REG_ECTYPE;
3423
3424 return REG_NOERROR;
3425}
3426
3427static bin_tree_t *
e2b6bfa3 3428build_charclass_op (dfa, trans, class_name, extra, not, err)
3b0bdc72 3429 re_dfa_t *dfa;
66b110e8 3430 RE_TRANSLATE_TYPE trans;
e2b6bfa3
UD
3431 const unsigned char *class_name;
3432 const unsigned char *extra;
3b0bdc72
UD
3433 int not;
3434 reg_errcode_t *err;
3435{
3436 re_bitset_ptr_t sbcset;
c0a0f9a3 3437#ifdef RE_ENABLE_I18N
3b0bdc72 3438 re_charset_t *mbcset;
c0a0f9a3
UD
3439 int alloc = 0;
3440#else /* not RE_ENABLE_I18N */
3441 int non_match = 0;
3442#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3443 reg_errcode_t ret;
3444 re_token_t br_token;
3445 bin_tree_t *tree;
c0a0f9a3 3446 int new_idx;
3b0bdc72
UD
3447
3448 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 3449#ifdef RE_ENABLE_I18N
3b0bdc72 3450 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3451#endif /* RE_ENABLE_I18N */
3452
3453#ifdef RE_ENABLE_I18N
bc15410e 3454 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3455#else /* not RE_ENABLE_I18N */
3456 if (BE (sbcset == NULL, 0))
3457#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3458 {
3459 *err = REG_ESPACE;
3460 return NULL;
3461 }
3462
3463 if (not)
3464 {
c0a0f9a3 3465#ifdef RE_ENABLE_I18N
3b0bdc72 3466 int i;
3b0bdc72
UD
3467 /*
3468 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 3469 bitset_set(cset->sbcset, '\0');
3b0bdc72 3470 */
c0a0f9a3 3471 mbcset->non_match = 1;
3c0fb574 3472 if (dfa->mb_cur_max > 1)
15a7d175
UD
3473 for (i = 0; i < SBC_MAX; ++i)
3474 if (__btowc (i) == WEOF)
3475 bitset_set (sbcset, i);
c0a0f9a3
UD
3476#else /* not RE_ENABLE_I18N */
3477 non_match = 1;
3478#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3479 }
3480
602c2f9d 3481 /* We don't care the syntax in this case. */
66b110e8 3482 ret = build_charclass (trans, sbcset,
c0a0f9a3 3483#ifdef RE_ENABLE_I18N
15a7d175 3484 mbcset, &alloc,
c0a0f9a3 3485#endif /* RE_ENABLE_I18N */
e2b6bfa3 3486 class_name, 0);
c0a0f9a3 3487
bc15410e 3488 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3489 {
3490 re_free (sbcset);
c0a0f9a3 3491#ifdef RE_ENABLE_I18N
3b0bdc72 3492 free_charset (mbcset);
c0a0f9a3 3493#endif /* RE_ENABLE_I18N */
7b7b9e70 3494 *err = ret;
3b0bdc72
UD
3495 return NULL;
3496 }
434d3784 3497 /* \w match '_' also. */
e2b6bfa3
UD
3498 for (; *extra; extra++)
3499 bitset_set (sbcset, *extra);
3b0bdc72
UD
3500
3501 /* If it is non-matching list. */
c0a0f9a3 3502#ifdef RE_ENABLE_I18N
3b0bdc72 3503 if (mbcset->non_match)
c0a0f9a3
UD
3504#else /* not RE_ENABLE_I18N */
3505 if (non_match)
3506#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3507 bitset_not (sbcset);
3508
3509 /* Build a tree for simple bracket. */
3510 br_token.type = SIMPLE_BRACKET;
3511 br_token.opr.sbcset = sbcset;
3512 new_idx = re_dfa_add_node (dfa, br_token, 0);
3513 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3514 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
3515 goto build_word_op_espace;
3516
c0a0f9a3 3517#ifdef RE_ENABLE_I18N
3c0fb574 3518 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
3519 {
3520 re_token_t alt_token;
3521 bin_tree_t *mbc_tree;
3522 /* Build a tree for complex bracket. */
3523 br_token.type = COMPLEX_BRACKET;
3524 br_token.opr.mbcset = mbcset;
3525 dfa->has_mb_node = 1;
3526 new_idx = re_dfa_add_node (dfa, br_token, 0);
3527 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3528 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
15a7d175 3529 goto build_word_op_espace;
3b0bdc72
UD
3530 /* Then join them by ALT node. */
3531 alt_token.type = OP_ALT;
3532 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3533 tree = create_tree (tree, mbc_tree, 0, new_idx);
bc15410e 3534 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
15a7d175 3535 return tree;
3b0bdc72
UD
3536 }
3537 else
3538 {
3539 free_charset (mbcset);
3540 return tree;
3541 }
c0a0f9a3
UD
3542#else /* not RE_ENABLE_I18N */
3543 return tree;
3544#endif /* not RE_ENABLE_I18N */
3545
3b0bdc72
UD
3546 build_word_op_espace:
3547 re_free (sbcset);
c0a0f9a3 3548#ifdef RE_ENABLE_I18N
3b0bdc72 3549 free_charset (mbcset);
c0a0f9a3 3550#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3551 *err = REG_ESPACE;
3552 return NULL;
3553}
3554
3555/* This is intended for the expressions like "a{1,3}".
3556 Fetch a number from `input', and return the number.
3557 Return -1, if the number field is empty like "{,1}".
3558 Return -2, If an error is occured. */
3559
3560static int
3561fetch_number (input, token, syntax)
3562 re_string_t *input;
3563 re_token_t *token;
3564 reg_syntax_t syntax;
3565{
3566 int num = -1;
3567 unsigned char c;
3568 while (1)
3569 {
3570 *token = fetch_token (input, syntax);
3571 c = token->opr.c;
602c2f9d 3572 if (BE (token->type == END_OF_RE, 0))
15a7d175 3573 return -2;
3b0bdc72 3574 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3575 break;
602c2f9d 3576 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
15a7d175 3577 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
602c2f9d 3578 num = (num > RE_DUP_MAX) ? -2 : num;
3b0bdc72 3579 }
3b0bdc72
UD
3580 return num;
3581}
3582\f
c0a0f9a3 3583#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3584static void
3585free_charset (re_charset_t *cset)
3586{
3587 re_free (cset->mbchars);
c0a0f9a3 3588# ifdef _LIBC
3b0bdc72
UD
3589 re_free (cset->coll_syms);
3590 re_free (cset->equiv_classes);
3591 re_free (cset->range_starts);
3592 re_free (cset->range_ends);
c0a0f9a3 3593# endif
3b0bdc72
UD
3594 re_free (cset->char_classes);
3595 re_free (cset);
3596}
c0a0f9a3 3597#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3598\f
3599/* Functions for binary tree operation. */
3600
3601/* Create a node of tree.
3602 Note: This function automatically free left and right if malloc fails. */
3603
3604static bin_tree_t *
3605create_tree (left, right, type, index)
3606 bin_tree_t *left;
3607 bin_tree_t *right;
3608 re_token_type_t type;
3609 int index;
3610{
3611 bin_tree_t *tree;
3612 tree = re_malloc (bin_tree_t, 1);
bc15410e 3613 if (BE (tree == NULL, 0))
3b0bdc72
UD
3614 {
3615 free_bin_tree (left);
3616 free_bin_tree (right);
3617 return NULL;
3618 }
3619 tree->parent = NULL;
3620 tree->left = left;
3621 tree->right = right;
3622 tree->type = type;
3623 tree->node_idx = index;
3624 tree->first = -1;
3625 tree->next = -1;
3626 re_node_set_init_empty (&tree->eclosure);
3627
3628 if (left != NULL)
3629 left->parent = tree;
3630 if (right != NULL)
3631 right->parent = tree;
3632 return tree;
3633}
3634
3635/* Free the sub tree pointed by TREE. */
3636
3637static void
3638free_bin_tree (tree)
3639 bin_tree_t *tree;
3640{
3641 if (tree == NULL)
3642 return;
3643 /*re_node_set_free (&tree->eclosure);*/
3644 free_bin_tree (tree->left);
3645 free_bin_tree (tree->right);
3646 re_free (tree);
3647}
3648
3649/* Duplicate the node SRC, and return new node. */
3650
3651static bin_tree_t *
3652duplicate_tree (src, dfa)
3653 const bin_tree_t *src;
3654 re_dfa_t *dfa;
3655{
3656 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3657 int new_node_idx;
3658 /* Since node indies must be according to Post-order of the tree,
3659 we must duplicate the left at first. */
3660 if (src->left != NULL)
3661 {
3662 left = duplicate_tree (src->left, dfa);
3663 if (left == NULL)
15a7d175 3664 return NULL;
3b0bdc72
UD
3665 }
3666
3667 /* Secondaly, duplicate the right. */
3668 if (src->right != NULL)
3669 {
3670 right = duplicate_tree (src->right, dfa);
3671 if (right == NULL)
15a7d175
UD
3672 {
3673 free_bin_tree (left);
3674 return NULL;
3675 }
3b0bdc72
UD
3676 }
3677
3678 /* At last, duplicate itself. */
3679 if (src->type == NON_TYPE)
3680 {
3681 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3682 dfa->nodes[new_node_idx].duplicated = 1;
bc15410e 3683 if (BE (new_node_idx == -1, 0))
15a7d175
UD
3684 {
3685 free_bin_tree (left);
3686 free_bin_tree (right);
3687 return NULL;
3688 }
3b0bdc72
UD
3689 }
3690 else
3691 new_node_idx = src->type;
3692
3693 new_tree = create_tree (left, right, src->type, new_node_idx);
bc15410e 3694 if (BE (new_tree == NULL, 0))
3b0bdc72
UD
3695 {
3696 free_bin_tree (left);
3697 free_bin_tree (right);
3698 }
3699 return new_tree;
3700}