]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
Update.
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
ca3c505e 2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
3b0bdc72 21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
15a7d175 22 int length, reg_syntax_t syntax);
3b0bdc72 23static void re_compile_fastmap_iter (regex_t *bufp,
15a7d175
UD
24 const re_dfastate_t *init_state,
25 char *fastmap);
3b0bdc72 26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
a9388965 27static reg_errcode_t init_word_char (re_dfa_t *dfa);
c0a0f9a3 28#ifdef RE_ENABLE_I18N
3b0bdc72 29static void free_charset (re_charset_t *cset);
c0a0f9a3 30#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
14744156
UD
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
3b0bdc72
UD
36static reg_errcode_t analyze (re_dfa_t *dfa);
37static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
485d775d 41static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
15a7d175
UD
42 int top_clone_node, int root_node,
43 unsigned int constraint);
a9388965 44static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
15a7d175 45 unsigned int constraint);
a7d5c291
UD
46static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
3b0bdc72 48static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965 49static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
15a7d175 50 int node, int root);
3b0bdc72
UD
51static void calc_inveclosure (re_dfa_t *dfa);
52static int fetch_number (re_string_t *input, re_token_t *token,
15a7d175 53 reg_syntax_t syntax);
f0d77aa8
UD
54static void fetch_token (re_token_t *result, re_string_t *input,
55 reg_syntax_t syntax);
3b0bdc72 56static int peek_token (re_token_t *token, re_string_t *input,
15a7d175 57 reg_syntax_t syntax);
3b0bdc72 58static int peek_token_bracket (re_token_t *token, re_string_t *input,
15a7d175 59 reg_syntax_t syntax);
3b0bdc72 60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
15a7d175 61 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
3b0bdc72 65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
15a7d175
UD
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
3b0bdc72 68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
15a7d175
UD
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
3b0bdc72 71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
15a7d175
UD
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
3b0bdc72 74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
15a7d175
UD
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
3b0bdc72 77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
15a7d175
UD
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
3b0bdc72 80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
15a7d175
UD
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
78d8b07a
UD
84 reg_syntax_t syntax,
85 int accept_hyphen);
3b0bdc72 86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
15a7d175
UD
87 re_string_t *regexp,
88 re_token_t *token);
434d3784 89#ifndef _LIBC
c0a0f9a3
UD
90# ifdef RE_ENABLE_I18N
91static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
92 re_charset_t *mbcset, int *range_alloc,
93 bracket_elem_t *start_elem,
94 bracket_elem_t *end_elem);
c0a0f9a3 95static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175
UD
96 re_charset_t *mbcset,
97 int *coll_sym_alloc,
98 const unsigned char *name);
c0a0f9a3
UD
99# else /* not RE_ENABLE_I18N */
100static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
15a7d175
UD
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
c0a0f9a3 103static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
15a7d175 104 const unsigned char *name);
c0a0f9a3 105# endif /* not RE_ENABLE_I18N */
434d3784 106#endif /* not _LIBC */
c0a0f9a3
UD
107#ifdef RE_ENABLE_I18N
108static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175
UD
109 re_charset_t *mbcset,
110 int *equiv_class_alloc,
111 const unsigned char *name);
66b110e8
UD
112static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
113 re_bitset_ptr_t sbcset,
15a7d175
UD
114 re_charset_t *mbcset,
115 int *char_class_alloc,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
c0a0f9a3
UD
118#else /* not RE_ENABLE_I18N */
119static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
15a7d175 120 const unsigned char *name);
66b110e8
UD
121static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
122 re_bitset_ptr_t sbcset,
15a7d175
UD
123 const unsigned char *class_name,
124 reg_syntax_t syntax);
c0a0f9a3 125#endif /* not RE_ENABLE_I18N */
e2b6bfa3 126static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
06e8303a 127 const unsigned char *class_name,
e2b6bfa3
UD
128 const unsigned char *extra, int not,
129 reg_errcode_t *err);
ee70274a
UD
130static bin_tree_t *create_tree (re_dfa_t *dfa,
131 bin_tree_t *left, bin_tree_t *right,
15a7d175 132 re_token_type_t type, int index);
ee70274a
UD
133static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
134 bin_tree_t *left, bin_tree_t *right,
c5c9c929 135 const re_token_t *token)
ee70274a 136 __attribute ((noinline));
3b0bdc72
UD
137static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
138\f
139/* This table gives an error message for each of the error codes listed
140 in regex.h. Obviously the order here has to be same as there.
141 POSIX doesn't require that we do anything for REG_NOERROR,
142 but why not be nice? */
143
6455d255 144const char __re_error_msgid[] attribute_hidden =
3b0bdc72
UD
145 {
146#define REG_NOERROR_IDX 0
147 gettext_noop ("Success") /* REG_NOERROR */
148 "\0"
149#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
150 gettext_noop ("No match") /* REG_NOMATCH */
151 "\0"
152#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
153 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
154 "\0"
155#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
156 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
157 "\0"
158#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
159 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
160 "\0"
161#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
162 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
163 "\0"
164#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
165 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
166 "\0"
167#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
168 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
169 "\0"
170#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
171 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
172 "\0"
173#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
174 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
175 "\0"
176#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
177 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
178 "\0"
179#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
180 gettext_noop ("Invalid range end") /* REG_ERANGE */
181 "\0"
182#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
183 gettext_noop ("Memory exhausted") /* REG_ESPACE */
184 "\0"
185#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
186 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
187 "\0"
188#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
189 gettext_noop ("Premature end of regular expression") /* REG_EEND */
190 "\0"
191#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
192 gettext_noop ("Regular expression too big") /* REG_ESIZE */
193 "\0"
194#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
195 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
196 };
197
6455d255 198const size_t __re_error_msgid_idx[] attribute_hidden =
3b0bdc72
UD
199 {
200 REG_NOERROR_IDX,
201 REG_NOMATCH_IDX,
202 REG_BADPAT_IDX,
203 REG_ECOLLATE_IDX,
204 REG_ECTYPE_IDX,
205 REG_EESCAPE_IDX,
206 REG_ESUBREG_IDX,
207 REG_EBRACK_IDX,
208 REG_EPAREN_IDX,
209 REG_EBRACE_IDX,
210 REG_BADBR_IDX,
211 REG_ERANGE_IDX,
212 REG_ESPACE_IDX,
213 REG_BADRPT_IDX,
214 REG_EEND_IDX,
215 REG_ESIZE_IDX,
216 REG_ERPAREN_IDX
217 };
218\f
219/* Entry points for GNU code. */
220
221/* re_compile_pattern is the GNU regular expression compiler: it
ac3d553b 222 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
3b0bdc72
UD
223 Returns 0 if the pattern was valid, otherwise an error string.
224
225 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
226 are set in BUFP on entry. */
227
228const char *
229re_compile_pattern (pattern, length, bufp)
230 const char *pattern;
231 size_t length;
232 struct re_pattern_buffer *bufp;
233{
234 reg_errcode_t ret;
235
3b0bdc72
UD
236 /* And GNU code determines whether or not to get register information
237 by passing null for the REGS argument to re_match, etc., not by
238 setting no_sub. */
239 bufp->no_sub = 0;
240
241 /* Match anchors at newline. */
242 bufp->newline_anchor = 1;
243
75e4a282 244 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
3b0bdc72
UD
245
246 if (!ret)
247 return NULL;
6455d255 248 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
249}
250#ifdef _LIBC
251weak_alias (__re_compile_pattern, re_compile_pattern)
252#endif
253
254/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
255 also be assigned to arbitrarily: each pattern buffer stores its own
256 syntax, so it can be changed between regex compilations. */
257/* This has no initializer because initialized variables in Emacs
258 become read-only after dumping. */
259reg_syntax_t re_syntax_options;
260
261
262/* Specify the precise syntax of regexps for compilation. This provides
263 for compatibility for various utilities which historically have
264 different, incompatible syntaxes.
265
266 The argument SYNTAX is a bit mask comprised of the various bits
267 defined in regex.h. We return the old syntax. */
268
269reg_syntax_t
270re_set_syntax (syntax)
271 reg_syntax_t syntax;
272{
273 reg_syntax_t ret = re_syntax_options;
274
275 re_syntax_options = syntax;
276 return ret;
277}
278#ifdef _LIBC
279weak_alias (__re_set_syntax, re_set_syntax)
280#endif
281
282int
283re_compile_fastmap (bufp)
284 struct re_pattern_buffer *bufp;
285{
286 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
287 char *fastmap = bufp->fastmap;
288
289 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
290 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
291 if (dfa->init_state != dfa->init_state_word)
292 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
293 if (dfa->init_state != dfa->init_state_nl)
294 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
295 if (dfa->init_state != dfa->init_state_begbuf)
296 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
297 bufp->fastmap_accurate = 1;
298 return 0;
299}
300#ifdef _LIBC
301weak_alias (__re_compile_fastmap, re_compile_fastmap)
302#endif
303
1b2c2628 304static inline void
25337753 305__attribute ((always_inline))
1b2c2628
UD
306re_set_fastmap (char *fastmap, int icase, int ch)
307{
308 fastmap[ch] = 1;
309 if (icase)
310 fastmap[tolower (ch)] = 1;
311}
312
3b0bdc72
UD
313/* Helper function for re_compile_fastmap.
314 Compile fastmap for the initial_state INIT_STATE. */
315
316static void
317re_compile_fastmap_iter (bufp, init_state, fastmap)
318 regex_t *bufp;
319 const re_dfastate_t *init_state;
320 char *fastmap;
321{
322 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
323 int node_cnt;
3c0fb574 324 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
3b0bdc72
UD
325 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
326 {
327 int node = init_state->nodes.elems[node_cnt];
328 re_token_type_t type = dfa->nodes[node].type;
3b0bdc72
UD
329
330 if (type == CHARACTER)
74e12fbc
UD
331 {
332 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
333#ifdef RE_ENABLE_I18N
14744156 334 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc 335 {
3c0fb574 336 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
74e12fbc
UD
337 wchar_t wc;
338 mbstate_t state;
339
340 p = buf;
341 *p++ = dfa->nodes[node].opr.c;
342 while (++node < dfa->nodes_len
343 && dfa->nodes[node].type == CHARACTER
344 && dfa->nodes[node].mb_partial)
345 *p++ = dfa->nodes[node].opr.c;
346 memset (&state, 0, sizeof (state));
347 if (mbrtowc (&wc, (const char *) buf, p - buf,
348 &state) == p - buf
349 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
350 re_set_fastmap (fastmap, 0, buf[0]);
351 }
352#endif
353 }
3b0bdc72 354 else if (type == SIMPLE_BRACKET)
15a7d175
UD
355 {
356 int i, j, ch;
357 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
358 for (j = 0; j < UINT_BITS; ++j, ++ch)
359 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
360 re_set_fastmap (fastmap, icase, ch);
361 }
c0a0f9a3 362#ifdef RE_ENABLE_I18N
3b0bdc72 363 else if (type == COMPLEX_BRACKET)
15a7d175
UD
364 {
365 int i;
366 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
367 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
368 || cset->nranges || cset->nchar_classes)
369 {
c0a0f9a3 370# ifdef _LIBC
15a7d175
UD
371 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
372 {
373 /* In this case we want to catch the bytes which are
374 the first byte of any collation elements.
375 e.g. In da_DK, we want to catch 'a' since "aa"
376 is a valid collation element, and don't catch
377 'b' since 'b' is the only collation element
378 which starts from 'b'. */
379 int j, ch;
380 const int32_t *table = (const int32_t *)
381 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
382 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
383 for (j = 0; j < UINT_BITS; ++j, ++ch)
384 if (table[ch] < 0)
385 re_set_fastmap (fastmap, icase, ch);
386 }
c0a0f9a3 387# else
3c0fb574 388 if (dfa->mb_cur_max > 1)
15a7d175
UD
389 for (i = 0; i < SBC_MAX; ++i)
390 if (__btowc (i) == WEOF)
391 re_set_fastmap (fastmap, icase, i);
c0a0f9a3 392# endif /* not _LIBC */
15a7d175
UD
393 }
394 for (i = 0; i < cset->nmbchars; ++i)
395 {
396 char buf[256];
397 mbstate_t state;
398 memset (&state, '\0', sizeof (state));
399 __wcrtomb (buf, cset->mbchars[i], &state);
400 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
14744156 401 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
74e12fbc
UD
402 {
403 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
404 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
405 }
15a7d175
UD
406 }
407 }
c0a0f9a3 408#endif /* RE_ENABLE_I18N */
ad7f28c2
UD
409 else if (type == END_OF_RE || type == OP_PERIOD
410 || type == OP_UTF8_PERIOD)
15a7d175
UD
411 {
412 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
413 if (type == END_OF_RE)
414 bufp->can_be_null = 1;
415 return;
416 }
3b0bdc72
UD
417 }
418}
419\f
420/* Entry point for POSIX code. */
421/* regcomp takes a regular expression as a string and compiles it.
422
423 PREG is a regex_t *. We do not expect any fields to be initialized,
424 since POSIX says we shouldn't. Thus, we set
425
426 `buffer' to the compiled pattern;
427 `used' to the length of the compiled pattern;
428 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
429 REG_EXTENDED bit in CFLAGS is set; otherwise, to
430 RE_SYNTAX_POSIX_BASIC;
431 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
432 `fastmap' to an allocated space for the fastmap;
433 `fastmap_accurate' to zero;
434 `re_nsub' to the number of subexpressions in PATTERN.
435
436 PATTERN is the address of the pattern string.
437
438 CFLAGS is a series of bits which affect compilation.
439
440 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
441 use POSIX basic syntax.
442
443 If REG_NEWLINE is set, then . and [^...] don't match newline.
444 Also, regexec will try a match beginning after every newline.
445
446 If REG_ICASE is set, then we considers upper- and lowercase
447 versions of letters to be equivalent when matching.
448
449 If REG_NOSUB is set, then when PREG is passed to regexec, that
450 routine will report only success or failure, and nothing about the
451 registers.
452
453 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
454 the return codes and their meanings.) */
455
456int
457regcomp (preg, pattern, cflags)
75e4a282
UD
458 regex_t *__restrict preg;
459 const char *__restrict pattern;
3b0bdc72
UD
460 int cflags;
461{
462 reg_errcode_t ret;
463 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
15a7d175 464 : RE_SYNTAX_POSIX_BASIC);
3b0bdc72
UD
465
466 preg->buffer = NULL;
467 preg->allocated = 0;
468 preg->used = 0;
469
470 /* Try to allocate space for the fastmap. */
471 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 472 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
473 return REG_ESPACE;
474
475 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
476
477 /* If REG_NEWLINE is set, newlines are treated differently. */
478 if (cflags & REG_NEWLINE)
479 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
480 syntax &= ~RE_DOT_NEWLINE;
481 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
482 /* It also changes the matching behavior. */
483 preg->newline_anchor = 1;
484 }
485 else
486 preg->newline_anchor = 0;
487 preg->no_sub = !!(cflags & REG_NOSUB);
488 preg->translate = NULL;
489
490 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
491
492 /* POSIX doesn't distinguish between an unmatched open-group and an
493 unmatched close-group: both are REG_EPAREN. */
494 if (ret == REG_ERPAREN)
495 ret = REG_EPAREN;
496
a9388965 497 /* We have already checked preg->fastmap != NULL. */
bc15410e 498 if (BE (ret == REG_NOERROR, 1))
71ccd330
UD
499 /* Compute the fastmap now, since regexec cannot modify the pattern
500 buffer. This function nevers fails in this implementation. */
83b038f2 501 (void) re_compile_fastmap (preg);
71ccd330 502 else
3b0bdc72 503 {
71ccd330
UD
504 /* Some error occurred while compiling the expression. */
505 re_free (preg->fastmap);
506 preg->fastmap = NULL;
3b0bdc72
UD
507 }
508
509 return (int) ret;
510}
511#ifdef _LIBC
512weak_alias (__regcomp, regcomp)
513#endif
514
515/* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
517
518size_t
519regerror (errcode, preg, errbuf, errbuf_size)
520 int errcode;
521 const regex_t *preg;
522 char *errbuf;
523 size_t errbuf_size;
524{
525 const char *msg;
526 size_t msg_size;
527
bc15410e 528 if (BE (errcode < 0
15a7d175
UD
529 || errcode >= (int) (sizeof (__re_error_msgid_idx)
530 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
531 /* Only error codes returned by the rest of the code should be passed
532 to this routine. If we are given anything else, or if other regex
533 code generates an invalid error code, then the program has a bug.
534 Dump core so we can fix it. */
535 abort ();
536
6455d255 537 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
538
539 msg_size = strlen (msg) + 1; /* Includes the null. */
540
bc15410e 541 if (BE (errbuf_size != 0, 1))
3b0bdc72 542 {
bc15410e 543 if (BE (msg_size > errbuf_size, 0))
15a7d175 544 {
3b0bdc72
UD
545#if defined HAVE_MEMPCPY || defined _LIBC
546 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
547#else
15a7d175
UD
548 memcpy (errbuf, msg, errbuf_size - 1);
549 errbuf[errbuf_size - 1] = 0;
3b0bdc72 550#endif
15a7d175 551 }
3b0bdc72 552 else
15a7d175 553 memcpy (errbuf, msg, msg_size);
3b0bdc72
UD
554 }
555
556 return msg_size;
557}
558#ifdef _LIBC
559weak_alias (__regerror, regerror)
560#endif
561
3b0bdc72 562
71ccd330
UD
563static void
564free_dfa_content (re_dfa_t *dfa)
3b0bdc72
UD
565{
566 int i, j;
3b0bdc72 567
71ccd330
UD
568 re_free (dfa->subexps);
569
ee70274a
UD
570 if (dfa->nodes)
571 for (i = 0; i < dfa->nodes_len; ++i)
572 {
573 re_token_t *node = dfa->nodes + i;
c0a0f9a3 574#ifdef RE_ENABLE_I18N
ee70274a
UD
575 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
576 free_charset (node->opr.mbcset);
577 else
c0a0f9a3 578#endif /* RE_ENABLE_I18N */
ee70274a
UD
579 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
580 re_free (node->opr.sbcset);
581 }
71ccd330
UD
582 re_free (dfa->nexts);
583 for (i = 0; i < dfa->nodes_len; ++i)
584 {
585 if (dfa->eclosures != NULL)
586 re_node_set_free (dfa->eclosures + i);
587 if (dfa->inveclosures != NULL)
588 re_node_set_free (dfa->inveclosures + i);
589 if (dfa->edests != NULL)
590 re_node_set_free (dfa->edests + i);
591 }
592 re_free (dfa->edests);
593 re_free (dfa->eclosures);
594 re_free (dfa->inveclosures);
595 re_free (dfa->nodes);
3b0bdc72 596
ee70274a
UD
597 if (dfa->state_table)
598 for (i = 0; i <= dfa->state_hash_mask; ++i)
599 {
600 struct re_state_table_entry *entry = dfa->state_table + i;
601 for (j = 0; j < entry->num; ++j)
602 {
603 re_dfastate_t *state = entry->array[j];
604 free_state (state);
605 }
606 re_free (entry->array);
607 }
71ccd330 608 re_free (dfa->state_table);
ee70274a 609 re_free (dfa->word_char);
0742e48e 610#ifdef DEBUG
71ccd330 611 re_free (dfa->re_str);
0742e48e 612#endif
71ccd330
UD
613
614 re_free (dfa);
615}
616
617
618/* Free dynamically allocated space used by PREG. */
619
620void
621regfree (preg)
622 regex_t *preg;
623{
624 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
625 if (BE (dfa != NULL, 1))
626 free_dfa_content (dfa);
627
3b0bdc72
UD
628 re_free (preg->fastmap);
629}
630#ifdef _LIBC
631weak_alias (__regfree, regfree)
632#endif
633\f
634/* Entry points compatible with 4.2 BSD regex library. We don't define
635 them unless specifically requested. */
636
637#if defined _REGEX_RE_COMP || defined _LIBC
638
639/* BSD has one and only one pattern buffer. */
640static struct re_pattern_buffer re_comp_buf;
641
642char *
643# ifdef _LIBC
644/* Make these definitions weak in libc, so POSIX programs can redefine
645 these names if they don't use our functions, and still use
646 regcomp/regexec above without link errors. */
647weak_function
648# endif
649re_comp (s)
650 const char *s;
651{
652 reg_errcode_t ret;
240e87c2 653 char *fastmap;
3b0bdc72
UD
654
655 if (!s)
656 {
657 if (!re_comp_buf.buffer)
658 return gettext ("No previous regular expression");
659 return 0;
660 }
661
240e87c2
RM
662 if (re_comp_buf.buffer)
663 {
664 fastmap = re_comp_buf.fastmap;
665 re_comp_buf.fastmap = NULL;
666 __regfree (&re_comp_buf);
1b2c2628 667 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
240e87c2
RM
668 re_comp_buf.fastmap = fastmap;
669 }
670
671 if (re_comp_buf.fastmap == NULL)
3b0bdc72
UD
672 {
673 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
674 if (re_comp_buf.fastmap == NULL)
6455d255
UD
675 return (char *) gettext (__re_error_msgid
676 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
677 }
678
679 /* Since `re_exec' always passes NULL for the `regs' argument, we
680 don't need to initialize the pattern buffer fields which affect it. */
681
682 /* Match anchors at newlines. */
683 re_comp_buf.newline_anchor = 1;
684
685 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
686
687 if (!ret)
688 return NULL;
689
690 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455d255 691 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72 692}
240e87c2
RM
693
694#ifdef _LIBC
c877418f 695libc_freeres_fn (free_mem)
240e87c2
RM
696{
697 __regfree (&re_comp_buf);
698}
240e87c2
RM
699#endif
700
3b0bdc72
UD
701#endif /* _REGEX_RE_COMP */
702\f
703/* Internal entry point.
704 Compile the regular expression PATTERN, whose length is LENGTH.
705 SYNTAX indicate regular expression's syntax. */
706
707static reg_errcode_t
708re_compile_internal (preg, pattern, length, syntax)
709 regex_t *preg;
710 const char * pattern;
711 int length;
712 reg_syntax_t syntax;
713{
714 reg_errcode_t err = REG_NOERROR;
715 re_dfa_t *dfa;
716 re_string_t regexp;
717
718 /* Initialize the pattern buffer. */
719 preg->fastmap_accurate = 0;
720 preg->syntax = syntax;
721 preg->not_bol = preg->not_eol = 0;
722 preg->used = 0;
723 preg->re_nsub = 0;
1b2c2628
UD
724 preg->can_be_null = 0;
725 preg->regs_allocated = REGS_UNALLOCATED;
3b0bdc72
UD
726
727 /* Initialize the dfa. */
728 dfa = (re_dfa_t *) preg->buffer;
951d6408 729 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3b0bdc72
UD
730 {
731 /* If zero allocated, but buffer is non-null, try to realloc
732 enough space. This loses if buffer's address is bogus, but
733 that is the user's responsibility. If ->buffer is NULL this
734 is a simple allocation. */
735 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
736 if (dfa == NULL)
737 return REG_ESPACE;
3b0bdc72 738 preg->allocated = sizeof (re_dfa_t);
951d6408 739 preg->buffer = (unsigned char *) dfa;
3b0bdc72 740 }
3b0bdc72
UD
741 preg->used = sizeof (re_dfa_t);
742
743 err = init_dfa (dfa, length);
bc15410e 744 if (BE (err != REG_NOERROR, 0))
3b0bdc72 745 {
ee70274a 746 free_dfa_content (dfa);
3b0bdc72 747 preg->buffer = NULL;
ca3c505e 748 preg->allocated = 0;
3b0bdc72
UD
749 return err;
750 }
0742e48e
UD
751#ifdef DEBUG
752 dfa->re_str = re_malloc (char, length + 1);
753 strncpy (dfa->re_str, pattern, length + 1);
754#endif
3b0bdc72 755
612546c6 756 err = re_string_construct (&regexp, pattern, length, preg->translate,
f0c7c524 757 syntax & RE_ICASE, dfa);
bc15410e 758 if (BE (err != REG_NOERROR, 0))
3b0bdc72 759 {
ee70274a
UD
760 re_compile_internal_free_return:
761 free_workarea_compile (preg);
762 re_string_destruct (&regexp);
763 free_dfa_content (dfa);
3b0bdc72 764 preg->buffer = NULL;
ca3c505e 765 preg->allocated = 0;
3b0bdc72
UD
766 return err;
767 }
768
769 /* Parse the regular expression, and build a structure tree. */
770 preg->re_nsub = 0;
771 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 772 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
773 goto re_compile_internal_free_return;
774
14744156
UD
775#ifdef RE_ENABLE_I18N
776 /* If possible, do searching in single byte encoding to speed things up. */
ad7f28c2 777 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
14744156
UD
778 optimize_utf8 (dfa);
779#endif
780
3b0bdc72
UD
781 /* Analyze the tree and collect information which is necessary to
782 create the dfa. */
783 err = analyze (dfa);
bc15410e 784 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
785 goto re_compile_internal_free_return;
786
787 /* Then create the initial state of the dfa. */
788 err = create_initial_state (dfa);
3b0bdc72 789
3b0bdc72
UD
790 /* Release work areas. */
791 free_workarea_compile (preg);
792 re_string_destruct (&regexp);
793
71ccd330
UD
794 if (BE (err != REG_NOERROR, 0))
795 {
71ccd330
UD
796 free_dfa_content (dfa);
797 preg->buffer = NULL;
ca3c505e 798 preg->allocated = 0;
71ccd330
UD
799 }
800
3b0bdc72
UD
801 return err;
802}
803
804/* Initialize DFA. We use the length of the regular expression PAT_LEN
805 as the initial length of some arrays. */
806
807static reg_errcode_t
808init_dfa (dfa, pat_len)
809 re_dfa_t *dfa;
810 int pat_len;
811{
812 int table_size;
81c64d40
UD
813
814 memset (dfa, '\0', sizeof (re_dfa_t));
815
ee70274a
UD
816 /* Force allocation of str_tree_storage the first time. */
817 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
818
3b0bdc72
UD
819 dfa->nodes_alloc = pat_len + 1;
820 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
821
822 dfa->states_alloc = pat_len + 1;
823
824 /* table_size = 2 ^ ceil(log pat_len) */
825 for (table_size = 1; table_size > 0; table_size <<= 1)
826 if (table_size > pat_len)
827 break;
828
829 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
830 dfa->state_hash_mask = table_size - 1;
831
832 dfa->subexps_alloc = 1;
833 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
834 dfa->word_char = NULL;
835
3c0fb574
UD
836 dfa->mb_cur_max = MB_CUR_MAX;
837#ifdef _LIBC
838 if (dfa->mb_cur_max > 1
839 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
840 dfa->is_utf8 = 1;
f0c7c524
UD
841 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
842 != 0);
3c0fb574
UD
843#endif
844
bc15410e 845 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
15a7d175 846 || dfa->subexps == NULL, 0))
ee70274a 847 return REG_ESPACE;
3b0bdc72
UD
848 return REG_NOERROR;
849}
850
851/* Initialize WORD_CHAR table, which indicate which character is
852 "word". In this case "word" means that it is the word construction
853 character used by some operators like "\<", "\>", etc. */
854
a9388965 855static reg_errcode_t
3b0bdc72
UD
856init_word_char (dfa)
857 re_dfa_t *dfa;
858{
859 int i, j, ch;
860 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
bc15410e 861 if (BE (dfa->word_char == NULL, 0))
a9388965 862 return REG_ESPACE;
3b0bdc72
UD
863 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
864 for (j = 0; j < UINT_BITS; ++j, ++ch)
865 if (isalnum (ch) || ch == '_')
15a7d175 866 dfa->word_char[i] |= 1 << j;
a9388965 867 return REG_NOERROR;
3b0bdc72
UD
868}
869
870/* Free the work area which are only used while compiling. */
871
872static void
873free_workarea_compile (preg)
874 regex_t *preg;
875{
876 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
ee70274a
UD
877 bin_tree_storage_t *storage, *next;
878 for (storage = dfa->str_tree_storage; storage; storage = next)
879 {
880 next = storage->next;
881 re_free (storage);
882 }
883 dfa->str_tree_storage = NULL;
884 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3b0bdc72 885 dfa->str_tree = NULL;
a7d5c291
UD
886 re_free (dfa->org_indices);
887 dfa->org_indices = NULL;
3b0bdc72
UD
888}
889
890/* Create initial states for all contexts. */
891
892static reg_errcode_t
893create_initial_state (dfa)
894 re_dfa_t *dfa;
895{
896 int first, i;
897 reg_errcode_t err;
898 re_node_set init_nodes;
899
900 /* Initial states have the epsilon closure of the node which is
901 the first node of the regular expression. */
902 first = dfa->str_tree->first;
903 dfa->init_node = first;
904 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 905 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
906 return err;
907
908 /* The back-references which are in initial states can epsilon transit,
909 since in this case all of the subexpressions can be null.
910 Then we add epsilon closures of the nodes which are the next nodes of
911 the back-references. */
912 if (dfa->nbackref > 0)
913 for (i = 0; i < init_nodes.nelem; ++i)
914 {
15a7d175
UD
915 int node_idx = init_nodes.elems[i];
916 re_token_type_t type = dfa->nodes[node_idx].type;
917
918 int clexp_idx;
919 if (type != OP_BACK_REF)
920 continue;
921 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
922 {
923 re_token_t *clexp_node;
924 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
925 if (clexp_node->type == OP_CLOSE_SUBEXP
926 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
927 break;
928 }
929 if (clexp_idx == init_nodes.nelem)
930 continue;
931
932 if (type == OP_BACK_REF)
933 {
934 int dest_idx = dfa->edests[node_idx].elems[0];
935 if (!re_node_set_contains (&init_nodes, dest_idx))
936 {
937 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
938 i = 0;
939 }
940 }
3b0bdc72
UD
941 }
942
943 /* It must be the first time to invoke acquire_state. */
a9388965
UD
944 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
945 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 946 if (BE (dfa->init_state == NULL, 0))
a9388965 947 return err;
3b0bdc72
UD
948 if (dfa->init_state->has_constraint)
949 {
a9388965 950 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 951 CONTEXT_WORD);
a9388965 952 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
15a7d175 953 CONTEXT_NEWLINE);
a9388965 954 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
15a7d175
UD
955 &init_nodes,
956 CONTEXT_NEWLINE
957 | CONTEXT_BEGBUF);
bc15410e 958 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
15a7d175
UD
959 || dfa->init_state_begbuf == NULL, 0))
960 return err;
3b0bdc72
UD
961 }
962 else
963 dfa->init_state_word = dfa->init_state_nl
964 = dfa->init_state_begbuf = dfa->init_state;
965
3b0bdc72
UD
966 re_node_set_free (&init_nodes);
967 return REG_NOERROR;
968}
969\f
14744156
UD
970#ifdef RE_ENABLE_I18N
971/* If it is possible to do searching in single byte encoding instead of UTF-8
972 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
973 DFA nodes where needed. */
974
975static void
976optimize_utf8 (dfa)
977 re_dfa_t *dfa;
978{
ad7f28c2 979 int node, i, mb_chars = 0, has_period = 0;
14744156
UD
980
981 for (node = 0; node < dfa->nodes_len; ++node)
982 switch (dfa->nodes[node].type)
983 {
984 case CHARACTER:
14744156 985 if (dfa->nodes[node].opr.c >= 0x80)
5f93cd52 986 mb_chars = 1;
14744156
UD
987 break;
988 case ANCHOR:
989 switch (dfa->nodes[node].opr.idx)
990 {
991 case LINE_FIRST:
992 case LINE_LAST:
993 case BUF_FIRST:
994 case BUF_LAST:
995 break;
996 default:
997 /* Word anchors etc. cannot be handled. */
998 return;
999 }
1000 break;
ad7f28c2
UD
1001 case OP_PERIOD:
1002 has_period = 1;
1003 break;
14744156
UD
1004 case OP_BACK_REF:
1005 case OP_ALT:
1006 case END_OF_RE:
14744156
UD
1007 case OP_DUP_ASTERISK:
1008 case OP_DUP_QUESTION:
1009 case OP_DUP_PLUS:
1010 case OP_OPEN_SUBEXP:
1011 case OP_CLOSE_SUBEXP:
1012 break;
a8067e8f
UD
1013 case SIMPLE_BRACKET:
1014 /* Just double check. */
1015 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1016 if (dfa->nodes[node].opr.sbcset[i])
1017 return;
1018 break;
14744156
UD
1019 default:
1020 return;
1021 }
1022
ad7f28c2 1023 if (mb_chars || has_period)
5f93cd52 1024 for (node = 0; node < dfa->nodes_len; ++node)
ad7f28c2
UD
1025 {
1026 if (dfa->nodes[node].type == CHARACTER
1027 && dfa->nodes[node].opr.c >= 0x80)
1028 dfa->nodes[node].mb_partial = 0;
1029 else if (dfa->nodes[node].type == OP_PERIOD)
1030 dfa->nodes[node].type = OP_UTF8_PERIOD;
1031 }
5f93cd52 1032
14744156
UD
1033 /* The search can be in single byte locale. */
1034 dfa->mb_cur_max = 1;
1035 dfa->is_utf8 = 0;
ad7f28c2 1036 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
14744156
UD
1037}
1038#endif
1039\f
3b0bdc72
UD
1040/* Analyze the structure tree, and calculate "first", "next", "edest",
1041 "eclosure", and "inveclosure". */
1042
1043static reg_errcode_t
1044analyze (dfa)
1045 re_dfa_t *dfa;
1046{
1047 int i;
1048 reg_errcode_t ret;
1049
1050 /* Allocate arrays. */
3b0bdc72 1051 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
a7d5c291 1052 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3b0bdc72
UD
1053 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1054 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1055 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
a7d5c291 1056 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
15a7d175 1057 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
3b0bdc72
UD
1058 return REG_ESPACE;
1059 /* Initialize them. */
1060 for (i = 0; i < dfa->nodes_len; ++i)
1061 {
3b0bdc72
UD
1062 dfa->nexts[i] = -1;
1063 re_node_set_init_empty (dfa->edests + i);
1064 re_node_set_init_empty (dfa->eclosures + i);
1065 re_node_set_init_empty (dfa->inveclosures + i);
1066 }
1067
1068 ret = analyze_tree (dfa, dfa->str_tree);
bc15410e 1069 if (BE (ret == REG_NOERROR, 1))
3b0bdc72
UD
1070 {
1071 ret = calc_eclosure (dfa);
1072 if (ret == REG_NOERROR)
1073 calc_inveclosure (dfa);
1074 }
1075 return ret;
1076}
1077
1078/* Helper functions for analyze.
1079 This function calculate "first", "next", and "edest" for the subtree
1080 whose root is NODE. */
1081
1082static reg_errcode_t
1083analyze_tree (dfa, node)
1084 re_dfa_t *dfa;
1085 bin_tree_t *node;
1086{
1087 reg_errcode_t ret;
1088 if (node->first == -1)
1089 calc_first (dfa, node);
1090 if (node->next == -1)
1091 calc_next (dfa, node);
1092 if (node->eclosure.nelem == 0)
1093 calc_epsdest (dfa, node);
1094 /* Calculate "first" etc. for the left child. */
1095 if (node->left != NULL)
1096 {
1097 ret = analyze_tree (dfa, node->left);
bc15410e 1098 if (BE (ret != REG_NOERROR, 0))
15a7d175 1099 return ret;
3b0bdc72
UD
1100 }
1101 /* Calculate "first" etc. for the right child. */
1102 if (node->right != NULL)
1103 {
1104 ret = analyze_tree (dfa, node->right);
bc15410e 1105 if (BE (ret != REG_NOERROR, 0))
15a7d175 1106 return ret;
3b0bdc72
UD
1107 }
1108 return REG_NOERROR;
1109}
1110
1111/* Calculate "first" for the node NODE. */
1112static void
1113calc_first (dfa, node)
1114 re_dfa_t *dfa;
1115 bin_tree_t *node;
1116{
1117 int idx, type;
1118 idx = node->node_idx;
1119 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1120
1121 switch (type)
1122 {
1123#ifdef DEBUG
3b0bdc72
UD
1124 case OP_OPEN_BRACKET:
1125 case OP_CLOSE_BRACKET:
1126 case OP_OPEN_DUP_NUM:
1127 case OP_CLOSE_DUP_NUM:
1128 case OP_NON_MATCH_LIST:
1129 case OP_OPEN_COLL_ELEM:
1130 case OP_CLOSE_COLL_ELEM:
1131 case OP_OPEN_EQUIV_CLASS:
1132 case OP_CLOSE_EQUIV_CLASS:
1133 case OP_OPEN_CHAR_CLASS:
1134 case OP_CLOSE_CHAR_CLASS:
1135 /* These must not be appeared here. */
1136 assert (0);
1137#endif
1138 case END_OF_RE:
1139 case CHARACTER:
1140 case OP_PERIOD:
1141 case OP_DUP_ASTERISK:
1142 case OP_DUP_QUESTION:
c0a0f9a3 1143#ifdef RE_ENABLE_I18N
ad7f28c2 1144 case OP_UTF8_PERIOD:
3b0bdc72 1145 case COMPLEX_BRACKET:
c0a0f9a3 1146#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1147 case SIMPLE_BRACKET:
1148 case OP_BACK_REF:
1149 case ANCHOR:
81c64d40
UD
1150 case OP_OPEN_SUBEXP:
1151 case OP_CLOSE_SUBEXP:
3b0bdc72
UD
1152 node->first = idx;
1153 break;
1154 case OP_DUP_PLUS:
1155#ifdef DEBUG
1156 assert (node->left != NULL);
1157#endif
1158 if (node->left->first == -1)
15a7d175 1159 calc_first (dfa, node->left);
3b0bdc72
UD
1160 node->first = node->left->first;
1161 break;
1162 case OP_ALT:
1163 node->first = idx;
1164 break;
3b0bdc72
UD
1165 /* else fall through */
1166 default:
1167#ifdef DEBUG
1168 assert (node->left != NULL);
1169#endif
1170 if (node->left->first == -1)
15a7d175 1171 calc_first (dfa, node->left);
3b0bdc72
UD
1172 node->first = node->left->first;
1173 break;
1174 }
3b0bdc72
UD
1175}
1176
1177/* Calculate "next" for the node NODE. */
1178
1179static void
1180calc_next (dfa, node)
1181 re_dfa_t *dfa;
1182 bin_tree_t *node;
1183{
1184 int idx, type;
1185 bin_tree_t *parent = node->parent;
1186 if (parent == NULL)
1187 {
1188 node->next = -1;
1189 idx = node->node_idx;
1190 if (node->type == 0)
15a7d175 1191 dfa->nexts[idx] = node->next;
3b0bdc72
UD
1192 return;
1193 }
1194
1195 idx = parent->node_idx;
1196 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1197
1198 switch (type)
1199 {
1200 case OP_DUP_ASTERISK:
1201 case OP_DUP_PLUS:
1202 node->next = idx;
1203 break;
1204 case CONCAT:
1205 if (parent->left == node)
15a7d175
UD
1206 {
1207 if (parent->right->first == -1)
1208 calc_first (dfa, parent->right);
1209 node->next = parent->right->first;
1210 break;
1211 }
3b0bdc72
UD
1212 /* else fall through */
1213 default:
1214 if (parent->next == -1)
15a7d175 1215 calc_next (dfa, parent);
3b0bdc72
UD
1216 node->next = parent->next;
1217 break;
1218 }
1219 idx = node->node_idx;
1220 if (node->type == 0)
1221 dfa->nexts[idx] = node->next;
1222}
1223
1224/* Calculate "edest" for the node NODE. */
1225
1226static void
1227calc_epsdest (dfa, node)
1228 re_dfa_t *dfa;
1229 bin_tree_t *node;
1230{
1231 int idx;
1232 idx = node->node_idx;
1233 if (node->type == 0)
1234 {
1235 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
15a7d175
UD
1236 || dfa->nodes[idx].type == OP_DUP_PLUS
1237 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1238 {
1239 if (node->left->first == -1)
1240 calc_first (dfa, node->left);
1241 if (node->next == -1)
1242 calc_next (dfa, node);
1243 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1244 node->next);
1245 }
3b0bdc72 1246 else if (dfa->nodes[idx].type == OP_ALT)
15a7d175
UD
1247 {
1248 int left, right;
1249 if (node->left != NULL)
1250 {
1251 if (node->left->first == -1)
1252 calc_first (dfa, node->left);
1253 left = node->left->first;
1254 }
1255 else
1256 {
1257 if (node->next == -1)
1258 calc_next (dfa, node);
1259 left = node->next;
1260 }
1261 if (node->right != NULL)
1262 {
1263 if (node->right->first == -1)
1264 calc_first (dfa, node->right);
1265 right = node->right->first;
1266 }
1267 else
1268 {
1269 if (node->next == -1)
1270 calc_next (dfa, node);
1271 right = node->next;
1272 }
1273 re_node_set_init_2 (dfa->edests + idx, left, right);
1274 }
81c64d40 1275 else if (dfa->nodes[idx].type == ANCHOR
15a7d175
UD
1276 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1277 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1278 || dfa->nodes[idx].type == OP_BACK_REF)
1279 re_node_set_init_1 (dfa->edests + idx, node->next);
1769a73f
UD
1280 else
1281 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
3b0bdc72
UD
1282 }
1283}
1284
485d775d
UD
1285/* Duplicate the epsilon closure of the node ROOT_NODE.
1286 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1287 to their own constraint. */
1288
1289static reg_errcode_t
1290duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
15a7d175 1291 init_constraint)
485d775d
UD
1292 re_dfa_t *dfa;
1293 int top_org_node, top_clone_node, root_node;
1294 unsigned int init_constraint;
1295{
1296 reg_errcode_t err;
1297 int org_node, clone_node, ret;
1298 unsigned int constraint = init_constraint;
1299 for (org_node = top_org_node, clone_node = top_clone_node;;)
1300 {
1301 int org_dest, clone_dest;
1302 if (dfa->nodes[org_node].type == OP_BACK_REF)
15a7d175 1303 {
485d775d
UD
1304 /* If the back reference epsilon-transit, its destination must
1305 also have the constraint. Then duplicate the epsilon closure
1306 of the destination of the back reference, and store it in
1307 edests of the back reference. */
15a7d175
UD
1308 org_dest = dfa->nexts[org_node];
1309 re_node_set_empty (dfa->edests + clone_node);
1310 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1311 if (BE (err != REG_NOERROR, 0))
1312 return err;
1313 dfa->nexts[clone_node] = dfa->nexts[org_node];
1314 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1315 if (BE (ret < 0, 0))
1316 return REG_ESPACE;
1317 }
485d775d 1318 else if (dfa->edests[org_node].nelem == 0)
15a7d175 1319 {
485d775d
UD
1320 /* In case of the node can't epsilon-transit, don't duplicate the
1321 destination and store the original destination as the
1322 destination of the node. */
15a7d175
UD
1323 dfa->nexts[clone_node] = dfa->nexts[org_node];
1324 break;
1325 }
485d775d 1326 else if (dfa->edests[org_node].nelem == 1)
15a7d175 1327 {
485d775d
UD
1328 /* In case of the node can epsilon-transit, and it has only one
1329 destination. */
15a7d175
UD
1330 org_dest = dfa->edests[org_node].elems[0];
1331 re_node_set_empty (dfa->edests + clone_node);
1332 if (dfa->nodes[org_node].type == ANCHOR)
1333 {
485d775d 1334 /* In case of the node has another constraint, append it. */
15a7d175
UD
1335 if (org_node == root_node && clone_node != org_node)
1336 {
485d775d
UD
1337 /* ...but if the node is root_node itself, it means the
1338 epsilon closure have a loop, then tie it to the
1339 destination of the root_node. */
15a7d175
UD
1340 ret = re_node_set_insert (dfa->edests + clone_node,
1341 org_dest);
1342 if (BE (ret < 0, 0))
1343 return REG_ESPACE;
1344 break;
1345 }
1346 constraint |= dfa->nodes[org_node].opr.ctx_type;
1347 }
1348 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1349 if (BE (err != REG_NOERROR, 0))
1350 return err;
1351 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1352 if (BE (ret < 0, 0))
1353 return REG_ESPACE;
1354 }
485d775d 1355 else /* dfa->edests[org_node].nelem == 2 */
15a7d175 1356 {
485d775d 1357 /* In case of the node can epsilon-transit, and it has two
a7d5c291 1358 destinations. E.g. '|', '*', '+', '?'. */
15a7d175
UD
1359 org_dest = dfa->edests[org_node].elems[0];
1360 re_node_set_empty (dfa->edests + clone_node);
a7d5c291
UD
1361 /* Search for a duplicated node which satisfies the constraint. */
1362 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1363 if (clone_dest == -1)
1364 {
1365 /* There are no such a duplicated node, create a new one. */
1366 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1367 if (BE (err != REG_NOERROR, 0))
1368 return err;
1369 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1370 if (BE (ret < 0, 0))
1371 return REG_ESPACE;
1372 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1373 root_node, constraint);
1374 if (BE (err != REG_NOERROR, 0))
1375 return err;
1376 }
1377 else
1378 {
1379 /* There are a duplicated node which satisfy the constraint,
1380 use it to avoid infinite loop. */
1381 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1382 if (BE (ret < 0, 0))
1383 return REG_ESPACE;
1384 }
15a7d175
UD
1385
1386 org_dest = dfa->edests[org_node].elems[1];
1387 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1388 if (BE (err != REG_NOERROR, 0))
1389 return err;
1390 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1391 if (BE (ret < 0, 0))
1392 return REG_ESPACE;
1393 }
485d775d
UD
1394 org_node = org_dest;
1395 clone_node = clone_dest;
1396 }
1397 return REG_NOERROR;
1398}
1399
a7d5c291
UD
1400/* Search for a node which is duplicated from the node ORG_NODE, and
1401 satisfies the constraint CONSTRAINT. */
1402
1403static int
1404search_duplicated_node (dfa, org_node, constraint)
1405 re_dfa_t *dfa;
1406 int org_node;
1407 unsigned int constraint;
1408{
1409 int idx;
1410 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1411 {
1412 if (org_node == dfa->org_indices[idx]
1413 && constraint == dfa->nodes[idx].constraint)
1414 return idx; /* Found. */
1415 }
1416 return -1; /* Not found. */
1417}
1418
a9388965
UD
1419/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1420 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1421 otherwise return the error code. */
1422
1423static reg_errcode_t
1424duplicate_node (new_idx, dfa, org_idx, constraint)
3b0bdc72 1425 re_dfa_t *dfa;
a9388965 1426 int *new_idx, org_idx;
3b0bdc72
UD
1427 unsigned int constraint;
1428{
c5c9c929 1429 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
485d775d 1430 if (BE (dup_idx == -1, 0))
a9388965 1431 return REG_ESPACE;
485d775d
UD
1432 dfa->nodes[dup_idx].constraint = constraint;
1433 if (dfa->nodes[org_idx].type == ANCHOR)
1434 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
a9388965 1435 dfa->nodes[dup_idx].duplicated = 1;
485d775d
UD
1436 re_node_set_init_empty (dfa->edests + dup_idx);
1437 re_node_set_init_empty (dfa->eclosures + dup_idx);
1438 re_node_set_init_empty (dfa->inveclosures + dup_idx);
3b0bdc72 1439
a7d5c291
UD
1440 /* Store the index of the original node. */
1441 dfa->org_indices[dup_idx] = org_idx;
a9388965
UD
1442 *new_idx = dup_idx;
1443 return REG_NOERROR;
3b0bdc72
UD
1444}
1445
1446static void
1447calc_inveclosure (dfa)
1448 re_dfa_t *dfa;
1449{
485d775d 1450 int src, idx, dest;
3b0bdc72
UD
1451 for (src = 0; src < dfa->nodes_len; ++src)
1452 {
1453 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
15a7d175
UD
1454 {
1455 dest = dfa->eclosures[src].elems[idx];
1456 re_node_set_insert (dfa->inveclosures + dest, src);
1457 }
3b0bdc72
UD
1458 }
1459}
1460
1461/* Calculate "eclosure" for all the node in DFA. */
1462
1463static reg_errcode_t
1464calc_eclosure (dfa)
1465 re_dfa_t *dfa;
1466{
485d775d 1467 int node_idx, incomplete;
3b0bdc72
UD
1468#ifdef DEBUG
1469 assert (dfa->nodes_len > 0);
1470#endif
485d775d 1471 incomplete = 0;
3b0bdc72 1472 /* For each nodes, calculate epsilon closure. */
485d775d 1473 for (node_idx = 0; ; ++node_idx)
3b0bdc72 1474 {
a9388965 1475 reg_errcode_t err;
3b0bdc72 1476 re_node_set eclosure_elem;
485d775d 1477 if (node_idx == dfa->nodes_len)
15a7d175
UD
1478 {
1479 if (!incomplete)
1480 break;
1481 incomplete = 0;
1482 node_idx = 0;
1483 }
3b0bdc72
UD
1484
1485#ifdef DEBUG
3b0bdc72
UD
1486 assert (dfa->eclosures[node_idx].nelem != -1);
1487#endif
1488 /* If we have already calculated, skip it. */
1489 if (dfa->eclosures[node_idx].nelem != 0)
15a7d175 1490 continue;
3b0bdc72 1491 /* Calculate epsilon closure of `node_idx'. */
a9388965 1492 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
bc15410e 1493 if (BE (err != REG_NOERROR, 0))
15a7d175 1494 return err;
3b0bdc72
UD
1495
1496 if (dfa->eclosures[node_idx].nelem == 0)
15a7d175
UD
1497 {
1498 incomplete = 1;
1499 re_node_set_free (&eclosure_elem);
1500 }
3b0bdc72 1501 }
3b0bdc72
UD
1502 return REG_NOERROR;
1503}
1504
1505/* Calculate epsilon closure of NODE. */
1506
a9388965
UD
1507static reg_errcode_t
1508calc_eclosure_iter (new_set, dfa, node, root)
1509 re_node_set *new_set;
3b0bdc72
UD
1510 re_dfa_t *dfa;
1511 int node, root;
1512{
a9388965 1513 reg_errcode_t err;
3b0bdc72 1514 unsigned int constraint;
485d775d 1515 int i, incomplete;
3b0bdc72 1516 re_node_set eclosure;
485d775d 1517 incomplete = 0;
a9388965 1518 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1519 if (BE (err != REG_NOERROR, 0))
a9388965 1520 return err;
3b0bdc72
UD
1521
1522 /* This indicates that we are calculating this node now.
1523 We reference this value to avoid infinite loop. */
1524 dfa->eclosures[node].nelem = -1;
1525
1526 constraint = ((dfa->nodes[node].type == ANCHOR)
15a7d175 1527 ? dfa->nodes[node].opr.ctx_type : 0);
485d775d
UD
1528 /* If the current node has constraints, duplicate all nodes.
1529 Since they must inherit the constraints. */
1530 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1531 {
1532 int org_node, cur_node;
1533 org_node = cur_node = node;
1534 err = duplicate_node_closure (dfa, node, node, node, constraint);
1535 if (BE (err != REG_NOERROR, 0))
15a7d175 1536 return err;
485d775d 1537 }
3b0bdc72
UD
1538
1539 /* Expand each epsilon destination nodes. */
485d775d 1540 if (IS_EPSILON_NODE(dfa->nodes[node].type))
3b0bdc72
UD
1541 for (i = 0; i < dfa->edests[node].nelem; ++i)
1542 {
15a7d175
UD
1543 re_node_set eclosure_elem;
1544 int edest = dfa->edests[node].elems[i];
1545 /* If calculating the epsilon closure of `edest' is in progress,
1546 return intermediate result. */
1547 if (dfa->eclosures[edest].nelem == -1)
1548 {
1549 incomplete = 1;
1550 continue;
1551 }
1552 /* If we haven't calculated the epsilon closure of `edest' yet,
1553 calculate now. Otherwise use calculated epsilon closure. */
1554 if (dfa->eclosures[edest].nelem == 0)
1555 {
1556 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1557 if (BE (err != REG_NOERROR, 0))
1558 return err;
1559 }
1560 else
1561 eclosure_elem = dfa->eclosures[edest];
1562 /* Merge the epsilon closure of `edest'. */
1563 re_node_set_merge (&eclosure, &eclosure_elem);
1564 /* If the epsilon closure of `edest' is incomplete,
1565 the epsilon closure of this node is also incomplete. */
1566 if (dfa->eclosures[edest].nelem == 0)
1567 {
1568 incomplete = 1;
1569 re_node_set_free (&eclosure_elem);
1570 }
3b0bdc72
UD
1571 }
1572
3b0bdc72
UD
1573 /* Epsilon closures include itself. */
1574 re_node_set_insert (&eclosure, node);
1575 if (incomplete && !root)
1576 dfa->eclosures[node].nelem = 0;
1577 else
1578 dfa->eclosures[node] = eclosure;
a9388965
UD
1579 *new_set = eclosure;
1580 return REG_NOERROR;
3b0bdc72
UD
1581}
1582\f
1583/* Functions for token which are used in the parser. */
1584
1585/* Fetch a token from INPUT.
1586 We must not use this function inside bracket expressions. */
1587
f0d77aa8
UD
1588static void
1589fetch_token (result, input, syntax)
1590 re_token_t *result;
3b0bdc72
UD
1591 re_string_t *input;
1592 reg_syntax_t syntax;
1593{
f0d77aa8 1594 re_string_skip_bytes (input, peek_token (result, input, syntax));
3b0bdc72
UD
1595}
1596
1597/* Peek a token from INPUT, and return the length of the token.
1598 We must not use this function inside bracket expressions. */
1599
1600static int
1601peek_token (token, input, syntax)
1602 re_token_t *token;
1603 re_string_t *input;
1604 reg_syntax_t syntax;
1605{
1606 unsigned char c;
1607
1608 if (re_string_eoi (input))
1609 {
1610 token->type = END_OF_RE;
1611 return 0;
1612 }
1613
1614 c = re_string_peek_byte (input, 0);
1615 token->opr.c = c;
1616
1617#ifdef RE_ENABLE_I18N
1618 token->mb_partial = 0;
3c0fb574 1619 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1620 !re_string_first_byte (input, re_string_cur_idx (input)))
1621 {
1622 token->type = CHARACTER;
1623 token->mb_partial = 1;
1624 return 1;
1625 }
1626#endif
1627 if (c == '\\')
1628 {
1629 unsigned char c2;
1630 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
15a7d175
UD
1631 {
1632 token->type = BACK_SLASH;
1633 return 1;
1634 }
3b0bdc72
UD
1635
1636 c2 = re_string_peek_byte_case (input, 1);
1637 token->opr.c = c2;
1638 token->type = CHARACTER;
1639 switch (c2)
15a7d175
UD
1640 {
1641 case '|':
1642 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1643 token->type = OP_ALT;
1644 break;
1645 case '1': case '2': case '3': case '4': case '5':
1646 case '6': case '7': case '8': case '9':
1647 if (!(syntax & RE_NO_BK_REFS))
1648 {
1649 token->type = OP_BACK_REF;
1650 token->opr.idx = c2 - '0';
1651 }
1652 break;
1653 case '<':
1654 if (!(syntax & RE_NO_GNU_OPS))
1655 {
1656 token->type = ANCHOR;
1657 token->opr.idx = WORD_FIRST;
1658 }
1659 break;
1660 case '>':
1661 if (!(syntax & RE_NO_GNU_OPS))
1662 {
1663 token->type = ANCHOR;
1664 token->opr.idx = WORD_LAST;
1665 }
1666 break;
1667 case 'b':
1668 if (!(syntax & RE_NO_GNU_OPS))
1669 {
1670 token->type = ANCHOR;
1671 token->opr.idx = WORD_DELIM;
1672 }
1673 break;
1674 case 'B':
1675 if (!(syntax & RE_NO_GNU_OPS))
1676 {
1677 token->type = ANCHOR;
1678 token->opr.idx = INSIDE_WORD;
1679 }
1680 break;
1681 case 'w':
1682 if (!(syntax & RE_NO_GNU_OPS))
1683 token->type = OP_WORD;
1684 break;
1685 case 'W':
1686 if (!(syntax & RE_NO_GNU_OPS))
1687 token->type = OP_NOTWORD;
1688 break;
e2b6bfa3
UD
1689 case 's':
1690 if (!(syntax & RE_NO_GNU_OPS))
1691 token->type = OP_SPACE;
1692 break;
1693 case 'S':
1694 if (!(syntax & RE_NO_GNU_OPS))
1695 token->type = OP_NOTSPACE;
1696 break;
15a7d175
UD
1697 case '`':
1698 if (!(syntax & RE_NO_GNU_OPS))
1699 {
1700 token->type = ANCHOR;
1701 token->opr.idx = BUF_FIRST;
1702 }
1703 break;
1704 case '\'':
1705 if (!(syntax & RE_NO_GNU_OPS))
1706 {
1707 token->type = ANCHOR;
1708 token->opr.idx = BUF_LAST;
1709 }
1710 break;
1711 case '(':
1712 if (!(syntax & RE_NO_BK_PARENS))
1713 token->type = OP_OPEN_SUBEXP;
1714 break;
1715 case ')':
1716 if (!(syntax & RE_NO_BK_PARENS))
1717 token->type = OP_CLOSE_SUBEXP;
1718 break;
1719 case '+':
1720 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1721 token->type = OP_DUP_PLUS;
1722 break;
1723 case '?':
1724 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1725 token->type = OP_DUP_QUESTION;
1726 break;
1727 case '{':
1728 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1729 token->type = OP_OPEN_DUP_NUM;
1730 break;
1731 case '}':
1732 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1733 token->type = OP_CLOSE_DUP_NUM;
1734 break;
1735 default:
1736 break;
1737 }
3b0bdc72
UD
1738 return 2;
1739 }
1740
1741 token->type = CHARACTER;
1742 switch (c)
1743 {
1744 case '\n':
1745 if (syntax & RE_NEWLINE_ALT)
15a7d175 1746 token->type = OP_ALT;
3b0bdc72
UD
1747 break;
1748 case '|':
1749 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
15a7d175 1750 token->type = OP_ALT;
3b0bdc72
UD
1751 break;
1752 case '*':
1753 token->type = OP_DUP_ASTERISK;
1754 break;
1755 case '+':
1756 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1757 token->type = OP_DUP_PLUS;
3b0bdc72
UD
1758 break;
1759 case '?':
1760 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
15a7d175 1761 token->type = OP_DUP_QUESTION;
3b0bdc72
UD
1762 break;
1763 case '{':
1764 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1765 token->type = OP_OPEN_DUP_NUM;
3b0bdc72
UD
1766 break;
1767 case '}':
1768 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
15a7d175 1769 token->type = OP_CLOSE_DUP_NUM;
3b0bdc72
UD
1770 break;
1771 case '(':
1772 if (syntax & RE_NO_BK_PARENS)
15a7d175 1773 token->type = OP_OPEN_SUBEXP;
3b0bdc72
UD
1774 break;
1775 case ')':
1776 if (syntax & RE_NO_BK_PARENS)
15a7d175 1777 token->type = OP_CLOSE_SUBEXP;
3b0bdc72
UD
1778 break;
1779 case '[':
1780 token->type = OP_OPEN_BRACKET;
1781 break;
1782 case '.':
1783 token->type = OP_PERIOD;
1784 break;
1785 case '^':
134abcb5 1786 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
15a7d175
UD
1787 re_string_cur_idx (input) != 0)
1788 {
1789 char prev = re_string_peek_byte (input, -1);
134abcb5 1790 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
15a7d175
UD
1791 break;
1792 }
3b0bdc72
UD
1793 token->type = ANCHOR;
1794 token->opr.idx = LINE_FIRST;
1795 break;
1796 case '$':
1797 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
15a7d175
UD
1798 re_string_cur_idx (input) + 1 != re_string_length (input))
1799 {
1800 re_token_t next;
1801 re_string_skip_bytes (input, 1);
1802 peek_token (&next, input, syntax);
1803 re_string_skip_bytes (input, -1);
1804 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1805 break;
1806 }
3b0bdc72
UD
1807 token->type = ANCHOR;
1808 token->opr.idx = LINE_LAST;
1809 break;
1810 default:
1811 break;
1812 }
1813 return 1;
1814}
1815
1816/* Peek a token from INPUT, and return the length of the token.
1817 We must not use this function out of bracket expressions. */
1818
1819static int
1820peek_token_bracket (token, input, syntax)
1821 re_token_t *token;
1822 re_string_t *input;
1823 reg_syntax_t syntax;
1824{
1825 unsigned char c;
1826 if (re_string_eoi (input))
1827 {
1828 token->type = END_OF_RE;
1829 return 0;
1830 }
1831 c = re_string_peek_byte (input, 0);
1832 token->opr.c = c;
1833
1834#ifdef RE_ENABLE_I18N
3c0fb574 1835 if (input->mb_cur_max > 1 &&
3b0bdc72
UD
1836 !re_string_first_byte (input, re_string_cur_idx (input)))
1837 {
1838 token->type = CHARACTER;
1839 return 1;
1840 }
1841#endif /* RE_ENABLE_I18N */
1842
1843 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1844 {
1845 /* In this case, '\' escape a character. */
1846 unsigned char c2;
82bbb29e
UD
1847 re_string_skip_bytes (input, 1);
1848 c2 = re_string_peek_byte (input, 0);
3b0bdc72
UD
1849 token->opr.c = c2;
1850 token->type = CHARACTER;
1851 return 1;
1852 }
1853 if (c == '[') /* '[' is a special char in a bracket exps. */
1854 {
1855 unsigned char c2;
1856 int token_len;
1857 c2 = re_string_peek_byte (input, 1);
1858 token->opr.c = c2;
1859 token_len = 2;
1860 switch (c2)
15a7d175
UD
1861 {
1862 case '.':
1863 token->type = OP_OPEN_COLL_ELEM;
1864 break;
1865 case '=':
1866 token->type = OP_OPEN_EQUIV_CLASS;
1867 break;
1868 case ':':
1869 if (syntax & RE_CHAR_CLASSES)
1870 {
1871 token->type = OP_OPEN_CHAR_CLASS;
1872 break;
1873 }
1874 /* else fall through. */
1875 default:
1876 token->type = CHARACTER;
1877 token->opr.c = c;
1878 token_len = 1;
1879 break;
1880 }
3b0bdc72
UD
1881 return token_len;
1882 }
1883 switch (c)
1884 {
1885 case '-':
1886 token->type = OP_CHARSET_RANGE;
1887 break;
1888 case ']':
1889 token->type = OP_CLOSE_BRACKET;
1890 break;
1891 case '^':
1892 token->type = OP_NON_MATCH_LIST;
1893 break;
1894 default:
1895 token->type = CHARACTER;
1896 }
1897 return 1;
1898}
1899\f
1900/* Functions for parser. */
1901
1902/* Entry point of the parser.
1903 Parse the regular expression REGEXP and return the structure tree.
1904 If an error is occured, ERR is set by error code, and return NULL.
1905 This function build the following tree, from regular expression <reg_exp>:
15a7d175
UD
1906 CAT
1907 / \
1908 / \
3b0bdc72
UD
1909 <reg_exp> EOR
1910
1911 CAT means concatenation.
1912 EOR means end of regular expression. */
1913
1914static bin_tree_t *
1915parse (regexp, preg, syntax, err)
1916 re_string_t *regexp;
1917 regex_t *preg;
1918 reg_syntax_t syntax;
1919 reg_errcode_t *err;
1920{
1921 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1922 bin_tree_t *tree, *eor, *root;
1923 re_token_t current_token;
f0d77aa8 1924 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 1925 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 1926 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72 1927 return NULL;
c5c9c929 1928 eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
3b0bdc72 1929 if (tree != NULL)
ee70274a 1930 root = create_tree (dfa, tree, eor, CONCAT, 0);
3b0bdc72
UD
1931 else
1932 root = eor;
ee70274a 1933 if (BE (eor == NULL || root == NULL, 0))
485d775d
UD
1934 {
1935 *err = REG_ESPACE;
1936 return NULL;
1937 }
3b0bdc72
UD
1938 return root;
1939}
1940
1941/* This function build the following tree, from regular expression
1942 <branch1>|<branch2>:
15a7d175
UD
1943 ALT
1944 / \
1945 / \
3b0bdc72
UD
1946 <branch1> <branch2>
1947
1948 ALT means alternative, which represents the operator `|'. */
1949
1950static bin_tree_t *
1951parse_reg_exp (regexp, preg, token, syntax, nest, err)
1952 re_string_t *regexp;
1953 regex_t *preg;
1954 re_token_t *token;
1955 reg_syntax_t syntax;
1956 int nest;
1957 reg_errcode_t *err;
1958{
1959 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1960 bin_tree_t *tree, *branch = NULL;
3b0bdc72 1961 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 1962 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1963 return NULL;
1964
1965 while (token->type == OP_ALT)
1966 {
1967 re_token_t alt_token = *token;
f0d77aa8 1968 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72 1969 if (token->type != OP_ALT && token->type != END_OF_RE
15a7d175
UD
1970 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1971 {
1972 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1973 if (BE (*err != REG_NOERROR && branch == NULL, 0))
ee70274a 1974 return NULL;
15a7d175 1975 }
9b88fc16
UD
1976 else
1977 branch = NULL;
c5c9c929 1978 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
ee70274a 1979 if (BE (tree == NULL, 0))
15a7d175
UD
1980 {
1981 *err = REG_ESPACE;
1982 return NULL;
1983 }
0742e48e 1984 dfa->has_plural_match = 1;
3b0bdc72
UD
1985 }
1986 return tree;
1987}
1988
1989/* This function build the following tree, from regular expression
1990 <exp1><exp2>:
15a7d175
UD
1991 CAT
1992 / \
3b0bdc72
UD
1993 / \
1994 <exp1> <exp2>
1995
1996 CAT means concatenation. */
1997
1998static bin_tree_t *
1999parse_branch (regexp, preg, token, syntax, nest, err)
2000 re_string_t *regexp;
2001 regex_t *preg;
2002 re_token_t *token;
2003 reg_syntax_t syntax;
2004 int nest;
2005 reg_errcode_t *err;
2006{
2007 bin_tree_t *tree, *exp;
ee70274a 2008 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3b0bdc72 2009 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2010 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2011 return NULL;
2012
2013 while (token->type != OP_ALT && token->type != END_OF_RE
15a7d175 2014 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
3b0bdc72
UD
2015 {
2016 exp = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 2017 if (BE (*err != REG_NOERROR && exp == NULL, 0))
15a7d175 2018 {
15a7d175
UD
2019 return NULL;
2020 }
3b0bdc72 2021 if (tree != NULL && exp != NULL)
15a7d175 2022 {
ee70274a 2023 tree = create_tree (dfa, tree, exp, CONCAT, 0);
15a7d175
UD
2024 if (tree == NULL)
2025 {
2026 *err = REG_ESPACE;
2027 return NULL;
2028 }
2029 }
3b0bdc72 2030 else if (tree == NULL)
15a7d175 2031 tree = exp;
3b0bdc72
UD
2032 /* Otherwise exp == NULL, we don't need to create new tree. */
2033 }
2034 return tree;
2035}
2036
2037/* This function build the following tree, from regular expression a*:
15a7d175
UD
2038 *
2039 |
2040 a
3b0bdc72
UD
2041*/
2042
2043static bin_tree_t *
2044parse_expression (regexp, preg, token, syntax, nest, err)
2045 re_string_t *regexp;
2046 regex_t *preg;
2047 re_token_t *token;
2048 reg_syntax_t syntax;
2049 int nest;
2050 reg_errcode_t *err;
2051{
2052 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2053 bin_tree_t *tree;
3b0bdc72
UD
2054 switch (token->type)
2055 {
2056 case CHARACTER:
c5c9c929 2057 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2058 if (BE (tree == NULL, 0))
15a7d175
UD
2059 {
2060 *err = REG_ESPACE;
2061 return NULL;
2062 }
3b0bdc72 2063#ifdef RE_ENABLE_I18N
3c0fb574 2064 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
2065 {
2066 while (!re_string_eoi (regexp)
2067 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2068 {
15a7d175 2069 bin_tree_t *mbc_remain;
f0d77aa8 2070 fetch_token (token, regexp, syntax);
c5c9c929 2071 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a
UD
2072 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2073 if (BE (mbc_remain == NULL || tree == NULL, 0))
54e1cabc
UD
2074 {
2075 *err = REG_ESPACE;
2076 return NULL;
2077 }
15a7d175 2078 }
3b0bdc72
UD
2079 }
2080#endif
2081 break;
2082 case OP_OPEN_SUBEXP:
2083 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 2084 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2085 return NULL;
3b0bdc72
UD
2086 break;
2087 case OP_OPEN_BRACKET:
2088 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 2089 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2090 return NULL;
3b0bdc72
UD
2091 break;
2092 case OP_BACK_REF:
bc15410e 2093 if (BE (preg->re_nsub < token->opr.idx
15a7d175
UD
2094 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2095 {
2096 *err = REG_ESUBREG;
2097 return NULL;
2098 }
6291ee3c 2099 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
c5c9c929 2100 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2101 if (BE (tree == NULL, 0))
15a7d175
UD
2102 {
2103 *err = REG_ESPACE;
2104 return NULL;
2105 }
3b0bdc72
UD
2106 ++dfa->nbackref;
2107 dfa->has_mb_node = 1;
2108 break;
06e8303a
UD
2109 case OP_OPEN_DUP_NUM:
2110 if (syntax & RE_CONTEXT_INVALID_DUP)
2111 {
2112 *err = REG_BADRPT;
2113 return NULL;
2114 }
2115 /* FALLTHROUGH */
3b0bdc72
UD
2116 case OP_DUP_ASTERISK:
2117 case OP_DUP_PLUS:
2118 case OP_DUP_QUESTION:
3b0bdc72 2119 if (syntax & RE_CONTEXT_INVALID_OPS)
15a7d175
UD
2120 {
2121 *err = REG_BADRPT;
2122 return NULL;
2123 }
3b0bdc72 2124 else if (syntax & RE_CONTEXT_INDEP_OPS)
15a7d175 2125 {
f0d77aa8 2126 fetch_token (token, regexp, syntax);
15a7d175
UD
2127 return parse_expression (regexp, preg, token, syntax, nest, err);
2128 }
3b0bdc72
UD
2129 /* else fall through */
2130 case OP_CLOSE_SUBEXP:
2131 if ((token->type == OP_CLOSE_SUBEXP) &&
15a7d175
UD
2132 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2133 {
2134 *err = REG_ERPAREN;
2135 return NULL;
2136 }
3b0bdc72
UD
2137 /* else fall through */
2138 case OP_CLOSE_DUP_NUM:
2139 /* We treat it as a normal character. */
2140
2141 /* Then we can these characters as normal characters. */
2142 token->type = CHARACTER;
c5c9c929 2143 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2144 if (BE (tree == NULL, 0))
15a7d175
UD
2145 {
2146 *err = REG_ESPACE;
2147 return NULL;
2148 }
3b0bdc72
UD
2149 break;
2150 case ANCHOR:
2151 if (dfa->word_char == NULL)
15a7d175
UD
2152 {
2153 *err = init_word_char (dfa);
2154 if (BE (*err != REG_NOERROR, 0))
2155 return NULL;
2156 }
3b0bdc72 2157 if (token->opr.ctx_type == WORD_DELIM)
15a7d175
UD
2158 {
2159 bin_tree_t *tree_first, *tree_last;
15a7d175 2160 token->opr.ctx_type = WORD_FIRST;
c5c9c929 2161 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
15a7d175 2162 token->opr.ctx_type = WORD_LAST;
c5c9c929 2163 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
15a7d175 2164 token->type = OP_ALT;
c5c9c929 2165 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
ee70274a 2166 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
15a7d175
UD
2167 {
2168 *err = REG_ESPACE;
2169 return NULL;
2170 }
2171 }
3b0bdc72 2172 else
15a7d175 2173 {
c5c9c929 2174 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2175 if (BE (tree == NULL, 0))
54e1cabc
UD
2176 {
2177 *err = REG_ESPACE;
2178 return NULL;
2179 }
15a7d175 2180 }
3b0bdc72 2181 /* We must return here, since ANCHORs can't be followed
15a7d175
UD
2182 by repetition operators.
2183 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2184 it must not be "<ANCHOR(^)><REPEAT(*)>". */
f0d77aa8 2185 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2186 return tree;
2187 case OP_PERIOD:
c5c9c929 2188 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2189 if (BE (tree == NULL, 0))
15a7d175
UD
2190 {
2191 *err = REG_ESPACE;
2192 return NULL;
2193 }
3c0fb574 2194 if (dfa->mb_cur_max > 1)
15a7d175 2195 dfa->has_mb_node = 1;
3b0bdc72
UD
2196 break;
2197 case OP_WORD:
e2b6bfa3 2198 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
bc15410e 2199 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2200 return NULL;
3b0bdc72
UD
2201 break;
2202 case OP_NOTWORD:
e2b6bfa3
UD
2203 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2204 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2205 return NULL;
2206 break;
2207 case OP_SPACE:
2208 tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2209 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2210 return NULL;
2211 break;
2212 case OP_NOTSPACE:
2213 tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
bc15410e 2214 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2215 return NULL;
3b0bdc72
UD
2216 break;
2217 case OP_ALT:
2218 case END_OF_RE:
2219 return NULL;
2220 case BACK_SLASH:
2221 *err = REG_EESCAPE;
2222 return NULL;
2223 default:
2224 /* Must not happen? */
2225#ifdef DEBUG
2226 assert (0);
2227#endif
2228 return NULL;
2229 }
f0d77aa8 2230 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2231
2232 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
15a7d175 2233 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
3b0bdc72
UD
2234 {
2235 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
bc15410e 2236 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2237 return NULL;
c34bfc8d
UD
2238 /* In BRE consecutive duplications are not allowed. */
2239 if ((syntax & RE_CONTEXT_INVALID_DUP)
2240 && (token->type == OP_DUP_ASTERISK
2241 || token->type == OP_OPEN_DUP_NUM))
2242 {
2243 *err = REG_BADRPT;
2244 return NULL;
2245 }
0742e48e 2246 dfa->has_plural_match = 1;
3b0bdc72
UD
2247 }
2248
2249 return tree;
2250}
2251
2252/* This function build the following tree, from regular expression
2253 (<reg_exp>):
15a7d175
UD
2254 SUBEXP
2255 |
2256 <reg_exp>
3b0bdc72
UD
2257*/
2258
2259static bin_tree_t *
2260parse_sub_exp (regexp, preg, token, syntax, nest, err)
2261 re_string_t *regexp;
2262 regex_t *preg;
2263 re_token_t *token;
2264 reg_syntax_t syntax;
2265 int nest;
2266 reg_errcode_t *err;
2267{
2268 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
81c64d40 2269 bin_tree_t *tree, *left_par, *right_par;
3b0bdc72
UD
2270 size_t cur_nsub;
2271 cur_nsub = preg->re_nsub++;
951d6408 2272 if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
3b0bdc72
UD
2273 {
2274 re_subexp_t *new_array;
2275 dfa->subexps_alloc *= 2;
2276 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
bc15410e 2277 if (BE (new_array == NULL, 0))
3b0bdc72
UD
2278 {
2279 dfa->subexps_alloc /= 2;
2280 *err = REG_ESPACE;
2281 return NULL;
2282 }
2283 dfa->subexps = new_array;
2284 }
2285 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2286 dfa->subexps[cur_nsub].end = -1;
81c64d40 2287
c5c9c929 2288 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
ee70274a 2289 if (BE (left_par == NULL, 0))
485d775d
UD
2290 {
2291 *err = REG_ESPACE;
2292 return NULL;
2293 }
ee70274a 2294 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
f0d77aa8 2295 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
3b0bdc72
UD
2296
2297 /* The subexpression may be a null string. */
2298 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2299 tree = NULL;
3b0bdc72
UD
2300 else
2301 {
2302 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
bc15410e 2303 if (BE (*err != REG_NOERROR && tree == NULL, 0))
15a7d175 2304 return NULL;
3b0bdc72 2305 }
81c64d40
UD
2306 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2307 {
06e8303a 2308 *err = REG_EPAREN;
81c64d40
UD
2309 return NULL;
2310 }
c5c9c929 2311 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
81c64d40 2312 dfa->subexps[cur_nsub].end = dfa->nodes_len;
81c64d40 2313 tree = ((tree == NULL) ? right_par
ee70274a
UD
2314 : create_tree (dfa, tree, right_par, CONCAT, 0));
2315 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2316 if (BE (right_par == NULL || tree == NULL, 0))
485d775d
UD
2317 {
2318 *err = REG_ESPACE;
2319 return NULL;
2320 }
ee70274a 2321 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
81c64d40 2322
3b0bdc72
UD
2323 return tree;
2324}
2325
2326/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2327
2328static bin_tree_t *
2329parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2330 bin_tree_t *dup_elem;
2331 re_string_t *regexp;
2332 re_dfa_t *dfa;
2333 re_token_t *token;
2334 reg_syntax_t syntax;
2335 reg_errcode_t *err;
2336{
2337 re_token_t dup_token;
2338 bin_tree_t *tree = dup_elem, *work_tree;
ee70274a 2339 int start_idx = re_string_cur_idx (regexp);
3b0bdc72
UD
2340 re_token_t start_token = *token;
2341 if (token->type == OP_OPEN_DUP_NUM)
2342 {
602c2f9d
UD
2343 int i;
2344 int end = 0;
2345 int start = fetch_number (regexp, token, syntax);
3b0bdc72
UD
2346 bin_tree_t *elem;
2347 if (start == -1)
15a7d175
UD
2348 {
2349 if (token->type == CHARACTER && token->opr.c == ',')
2350 start = 0; /* We treat "{,m}" as "{0,m}". */
2351 else
2352 {
2353 *err = REG_BADBR; /* <re>{} is invalid. */
2354 return NULL;
2355 }
2356 }
602c2f9d 2357 if (BE (start != -2, 1))
15a7d175
UD
2358 {
2359 /* We treat "{n}" as "{n,n}". */
2360 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2361 : ((token->type == CHARACTER && token->opr.c == ',')
2362 ? fetch_number (regexp, token, syntax) : -2));
2363 }
602c2f9d 2364 if (BE (start == -2 || end == -2, 0))
15a7d175
UD
2365 {
2366 /* Invalid sequence. */
2367 if (token->type == OP_CLOSE_DUP_NUM)
2368 goto parse_dup_op_invalid_interval;
2369 else
2370 goto parse_dup_op_ebrace;
2371 }
602c2f9d 2372 if (BE (start == 0 && end == 0, 0))
15a7d175
UD
2373 {
2374 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
f0d77aa8 2375 fetch_token (token, regexp, syntax);
15a7d175
UD
2376 return NULL;
2377 }
602c2f9d 2378
3b0bdc72
UD
2379 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2380 elem = tree;
2381 for (i = 0; i < start; ++i)
15a7d175
UD
2382 if (i != 0)
2383 {
2384 work_tree = duplicate_tree (elem, dfa);
ee70274a 2385 tree = create_tree (dfa, tree, work_tree, CONCAT, 0);
15a7d175
UD
2386 if (BE (work_tree == NULL || tree == NULL, 0))
2387 goto parse_dup_op_espace;
2388 }
3b0bdc72
UD
2389
2390 if (end == -1)
15a7d175
UD
2391 {
2392 /* We treat "<re>{0,}" as "<re>*". */
2393 dup_token.type = OP_DUP_ASTERISK;
2394 if (start > 0)
2395 {
2396 elem = duplicate_tree (elem, dfa);
c5c9c929 2397 work_tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
ee70274a
UD
2398 tree = create_tree (dfa, tree, work_tree, CONCAT, 0);
2399 if (BE (elem == NULL || work_tree == NULL || tree == NULL, 0))
15a7d175
UD
2400 goto parse_dup_op_espace;
2401 }
2402 else
2403 {
c5c9c929 2404 tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
ee70274a 2405 if (BE (tree == NULL, 0))
15a7d175
UD
2406 goto parse_dup_op_espace;
2407 }
2408 }
deeb4aae
UD
2409 else if (BE (start > end, 0))
2410 {
2411 /* First number greater than first. */
2412 *err = REG_BADBR;
2413 return NULL;
2414 }
3b0bdc72 2415 else if (end - start > 0)
15a7d175
UD
2416 {
2417 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2418 dup_token.type = OP_DUP_QUESTION;
2419 if (start > 0)
2420 {
2421 elem = duplicate_tree (elem, dfa);
c5c9c929 2422 elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
ee70274a
UD
2423 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2424 if (BE (elem == NULL || tree == NULL, 0))
15a7d175
UD
2425 goto parse_dup_op_espace;
2426 }
2427 else
2428 {
c5c9c929 2429 tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
ee70274a 2430 if (BE (tree == NULL, 0))
15a7d175
UD
2431 goto parse_dup_op_espace;
2432 }
2433 for (i = 1; i < end - start; ++i)
2434 {
2435 work_tree = duplicate_tree (elem, dfa);
ee70274a 2436 tree = create_tree (dfa, tree, work_tree, CONCAT, 0);
15a7d175
UD
2437 if (BE (work_tree == NULL || tree == NULL, 0))
2438 {
2439 *err = REG_ESPACE;
2440 return NULL;
2441 }
2442 }
2443 }
3b0bdc72
UD
2444 }
2445 else
2446 {
c5c9c929 2447 tree = re_dfa_add_tree_node (dfa, tree, NULL, token);
ee70274a 2448 if (BE (tree == NULL, 0))
15a7d175
UD
2449 {
2450 *err = REG_ESPACE;
2451 return NULL;
2452 }
3b0bdc72 2453 }
f0d77aa8 2454 fetch_token (token, regexp, syntax);
3b0bdc72
UD
2455 return tree;
2456
2457 parse_dup_op_espace:
3b0bdc72
UD
2458 *err = REG_ESPACE;
2459 return NULL;
2460
602c2f9d 2461 parse_dup_op_ebrace:
bc15410e 2462 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
3b0bdc72
UD
2463 {
2464 *err = REG_EBRACE;
2465 return NULL;
2466 }
602c2f9d
UD
2467 goto parse_dup_op_rollback;
2468 parse_dup_op_invalid_interval:
2469 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2470 {
2471 *err = REG_BADBR;
2472 return NULL;
2473 }
2474 parse_dup_op_rollback:
3b0bdc72
UD
2475 re_string_set_index (regexp, start_idx);
2476 *token = start_token;
2477 token->type = CHARACTER;
2478 return dup_elem;
2479}
2480
2481/* Size of the names for collating symbol/equivalence_class/character_class.
2482 I'm not sure, but maybe enough. */
2483#define BRACKET_NAME_BUF_SIZE 32
2484
434d3784
UD
2485#ifndef _LIBC
2486 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2487 Build the range expression which starts from START_ELEM, and ends
2488 at END_ELEM. The result are written to MBCSET and SBCSET.
2489 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2490 mbcset->range_ends, is a pointer argument sinse we may
2491 update it. */
2492
2493static reg_errcode_t
c0a0f9a3
UD
2494# ifdef RE_ENABLE_I18N
2495build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
434d3784 2496 re_charset_t *mbcset;
434d3784 2497 int *range_alloc;
c0a0f9a3
UD
2498# else /* not RE_ENABLE_I18N */
2499build_range_exp (sbcset, start_elem, end_elem)
2500# endif /* not RE_ENABLE_I18N */
2501 re_bitset_ptr_t sbcset;
434d3784
UD
2502 bracket_elem_t *start_elem, *end_elem;
2503{
2504 unsigned int start_ch, end_ch;
2505 /* Equivalence Classes and Character Classes can't be a range start/end. */
2506 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2507 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2508 0))
434d3784
UD
2509 return REG_ERANGE;
2510
2511 /* We can handle no multi character collating elements without libc
2512 support. */
62439eac 2513 if (BE ((start_elem->type == COLL_SYM
15a7d175
UD
2514 && strlen ((char *) start_elem->opr.name) > 1)
2515 || (end_elem->type == COLL_SYM
2516 && strlen ((char *) end_elem->opr.name) > 1), 0))
434d3784
UD
2517 return REG_ECOLLATE;
2518
2519# ifdef RE_ENABLE_I18N
2520 {
2521 wchar_t wc, start_wc, end_wc;
2522 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2523
2524 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
15a7d175
UD
2525 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2526 : 0));
434d3784 2527 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
15a7d175
UD
2528 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2529 : 0));
434d3784 2530 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
15a7d175 2531 ? __btowc (start_ch) : start_elem->opr.wch);
434d3784 2532 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
15a7d175 2533 ? __btowc (end_ch) : end_elem->opr.wch);
434d3784
UD
2534 cmp_buf[0] = start_wc;
2535 cmp_buf[4] = end_wc;
2536 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2537 return REG_ERANGE;
2538
2539 /* Check the space of the arrays. */
951d6408 2540 if (BE (*range_alloc == mbcset->nranges, 0))
434d3784 2541 {
15a7d175
UD
2542 /* There are not enough space, need realloc. */
2543 wchar_t *new_array_start, *new_array_end;
2544 int new_nranges;
2545
2546 /* +1 in case of mbcset->nranges is 0. */
2547 new_nranges = 2 * mbcset->nranges + 1;
2548 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2549 are NULL if *range_alloc == 0. */
2550 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2551 new_nranges);
2552 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2553 new_nranges);
2554
2555 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2556 return REG_ESPACE;
2557
2558 mbcset->range_starts = new_array_start;
2559 mbcset->range_ends = new_array_end;
2560 *range_alloc = new_nranges;
434d3784
UD
2561 }
2562
2563 mbcset->range_starts[mbcset->nranges] = start_wc;
2564 mbcset->range_ends[mbcset->nranges++] = end_wc;
2565
2566 /* Build the table for single byte characters. */
2567 for (wc = 0; wc <= SBC_MAX; ++wc)
2568 {
15a7d175
UD
2569 cmp_buf[2] = wc;
2570 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2571 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2572 bitset_set (sbcset, wc);
434d3784
UD
2573 }
2574 }
2575# else /* not RE_ENABLE_I18N */
2576 {
2577 unsigned int ch;
2578 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
15a7d175
UD
2579 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2580 : 0));
434d3784 2581 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
15a7d175
UD
2582 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2583 : 0));
434d3784
UD
2584 if (start_ch > end_ch)
2585 return REG_ERANGE;
2586 /* Build the table for single byte characters. */
2587 for (ch = 0; ch <= SBC_MAX; ++ch)
2588 if (start_ch <= ch && ch <= end_ch)
15a7d175 2589 bitset_set (sbcset, ch);
434d3784
UD
2590 }
2591# endif /* not RE_ENABLE_I18N */
2592 return REG_NOERROR;
2593}
2594#endif /* not _LIBC */
2595
2596#ifndef _LIBC
2597/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2598 Build the collating element which is represented by NAME.
2599 The result are written to MBCSET and SBCSET.
2600 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2601 pointer argument since we may update it. */
2602
2603static reg_errcode_t
c0a0f9a3
UD
2604# ifdef RE_ENABLE_I18N
2605build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
434d3784 2606 re_charset_t *mbcset;
434d3784 2607 int *coll_sym_alloc;
c0a0f9a3
UD
2608# else /* not RE_ENABLE_I18N */
2609build_collating_symbol (sbcset, name)
2610# endif /* not RE_ENABLE_I18N */
2611 re_bitset_ptr_t sbcset;
62439eac 2612 const unsigned char *name;
434d3784 2613{
62439eac
UD
2614 size_t name_len = strlen ((const char *) name);
2615 if (BE (name_len != 1, 0))
434d3784
UD
2616 return REG_ECOLLATE;
2617 else
2618 {
62439eac 2619 bitset_set (sbcset, name[0]);
434d3784
UD
2620 return REG_NOERROR;
2621 }
2622}
2623#endif /* not _LIBC */
2624
3b0bdc72
UD
2625/* This function parse bracket expression like "[abc]", "[a-c]",
2626 "[[.a-a.]]" etc. */
2627
2628static bin_tree_t *
2629parse_bracket_exp (regexp, dfa, token, syntax, err)
2630 re_string_t *regexp;
2631 re_dfa_t *dfa;
2632 re_token_t *token;
2633 reg_syntax_t syntax;
2634 reg_errcode_t *err;
2635{
2636#ifdef _LIBC
62439eac
UD
2637 const unsigned char *collseqmb;
2638 const char *collseqwc;
3b0bdc72
UD
2639 uint32_t nrules;
2640 int32_t table_size;
2641 const int32_t *symb_table;
2642 const unsigned char *extra;
2643
434d3784 2644 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2645 Seek the collating symbol entry correspondings to NAME.
2646 Return the index of the symbol in the SYMB_TABLE. */
2647
2648 static inline int32_t
25337753 2649 __attribute ((always_inline))
3b0bdc72 2650 seek_collating_symbol_entry (name, name_len)
15a7d175
UD
2651 const unsigned char *name;
2652 size_t name_len;
3b0bdc72 2653 {
62439eac 2654 int32_t hash = elem_hash ((const char *) name, name_len);
3b0bdc72
UD
2655 int32_t elem = hash % table_size;
2656 int32_t second = hash % (table_size - 2);
2657 while (symb_table[2 * elem] != 0)
15a7d175
UD
2658 {
2659 /* First compare the hashing value. */
2660 if (symb_table[2 * elem] == hash
2661 /* Compare the length of the name. */
2662 && name_len == extra[symb_table[2 * elem + 1]]
2663 /* Compare the name. */
2664 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2665 name_len) == 0)
2666 {
2667 /* Yep, this is the entry. */
2668 break;
2669 }
2670
2671 /* Next entry. */
2672 elem += second;
2673 }
3b0bdc72
UD
2674 return elem;
2675 }
2676
434d3784 2677 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2678 Look up the collation sequence value of BR_ELEM.
2679 Return the value if succeeded, UINT_MAX otherwise. */
2680
2681 static inline unsigned int
25337753 2682 __attribute ((always_inline))
3b0bdc72 2683 lookup_collation_sequence_value (br_elem)
15a7d175 2684 bracket_elem_t *br_elem;
3b0bdc72
UD
2685 {
2686 if (br_elem->type == SB_CHAR)
15a7d175
UD
2687 {
2688 /*
2689 if (MB_CUR_MAX == 1)
2690 */
2691 if (nrules == 0)
2692 return collseqmb[br_elem->opr.ch];
2693 else
2694 {
2695 wint_t wc = __btowc (br_elem->opr.ch);
25337753 2696 return __collseq_table_lookup (collseqwc, wc);
15a7d175
UD
2697 }
2698 }
3b0bdc72 2699 else if (br_elem->type == MB_CHAR)
15a7d175 2700 {
25337753 2701 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
15a7d175 2702 }
3b0bdc72 2703 else if (br_elem->type == COLL_SYM)
15a7d175
UD
2704 {
2705 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2706 if (nrules != 0)
2707 {
2708 int32_t elem, idx;
2709 elem = seek_collating_symbol_entry (br_elem->opr.name,
2710 sym_name_len);
2711 if (symb_table[2 * elem] != 0)
2712 {
2713 /* We found the entry. */
2714 idx = symb_table[2 * elem + 1];
2715 /* Skip the name of collating element name. */
2716 idx += 1 + extra[idx];
2717 /* Skip the byte sequence of the collating element. */
2718 idx += 1 + extra[idx];
2719 /* Adjust for the alignment. */
2720 idx = (idx + 3) & ~3;
2721 /* Skip the multibyte collation sequence value. */
2722 idx += sizeof (unsigned int);
2723 /* Skip the wide char sequence of the collating element. */
2724 idx += sizeof (unsigned int) *
2725 (1 + *(unsigned int *) (extra + idx));
2726 /* Return the collation sequence value. */
2727 return *(unsigned int *) (extra + idx);
2728 }
2729 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2730 {
2731 /* No valid character. Match it as a single byte
2732 character. */
2733 return collseqmb[br_elem->opr.name[0]];
2734 }
2735 }
2736 else if (sym_name_len == 1)
2737 return collseqmb[br_elem->opr.name[0]];
2738 }
3b0bdc72
UD
2739 return UINT_MAX;
2740 }
2741
434d3784 2742 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2743 Build the range expression which starts from START_ELEM, and ends
2744 at END_ELEM. The result are written to MBCSET and SBCSET.
2745 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2746 mbcset->range_ends, is a pointer argument sinse we may
2747 update it. */
2748
2749 static inline reg_errcode_t
25337753 2750 __attribute ((always_inline))
c0a0f9a3
UD
2751# ifdef RE_ENABLE_I18N
2752 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
15a7d175
UD
2753 re_charset_t *mbcset;
2754 int *range_alloc;
c0a0f9a3
UD
2755# else /* not RE_ENABLE_I18N */
2756 build_range_exp (sbcset, start_elem, end_elem)
2757# endif /* not RE_ENABLE_I18N */
15a7d175
UD
2758 re_bitset_ptr_t sbcset;
2759 bracket_elem_t *start_elem, *end_elem;
3b0bdc72
UD
2760 {
2761 unsigned int ch;
2762 uint32_t start_collseq;
2763 uint32_t end_collseq;
2764
c0a0f9a3 2765# ifdef RE_ENABLE_I18N
3b0bdc72 2766 /* Check the space of the arrays. */
951d6408 2767 if (BE (*range_alloc == mbcset->nranges, 0))
15a7d175
UD
2768 {
2769 /* There are not enough space, need realloc. */
2770 uint32_t *new_array_start;
2771 uint32_t *new_array_end;
3b0bdc72
UD
2772 int new_nranges;
2773
15a7d175
UD
2774 /* +1 in case of mbcset->nranges is 0. */
2775 new_nranges = 2 * mbcset->nranges + 1;
a9388965 2776 /* Use realloc since mbcset->range_starts and mbcset->range_ends
15a7d175
UD
2777 are NULL if *range_alloc == 0. */
2778 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2779 new_nranges);
2780 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2781 new_nranges);
a9388965 2782
15a7d175
UD
2783 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2784 return REG_ESPACE;
3b0bdc72 2785
15a7d175
UD
2786 mbcset->range_starts = new_array_start;
2787 mbcset->range_ends = new_array_end;
3b0bdc72 2788 *range_alloc = new_nranges;
15a7d175 2789 }
c0a0f9a3 2790# endif /* RE_ENABLE_I18N */
3b0bdc72 2791
434d3784 2792 /* Equivalence Classes and Character Classes can't be a range
15a7d175 2793 start/end. */
bc15410e 2794 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
15a7d175
UD
2795 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2796 0))
2797 return REG_ERANGE;
3b0bdc72
UD
2798
2799 start_collseq = lookup_collation_sequence_value (start_elem);
2800 end_collseq = lookup_collation_sequence_value (end_elem);
2801 /* Check start/end collation sequence values. */
bc15410e 2802 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
15a7d175 2803 return REG_ECOLLATE;
bc15410e 2804 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
15a7d175 2805 return REG_ERANGE;
3b0bdc72 2806
c0a0f9a3 2807# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2808 /* Got valid collation sequence values, add them as a new entry. */
2809 mbcset->range_starts[mbcset->nranges] = start_collseq;
2810 mbcset->range_ends[mbcset->nranges++] = end_collseq;
c0a0f9a3 2811# endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2812
2813 /* Build the table for single byte characters. */
2814 for (ch = 0; ch <= SBC_MAX; ch++)
15a7d175
UD
2815 {
2816 uint32_t ch_collseq;
2817 /*
2818 if (MB_CUR_MAX == 1)
2819 */
2820 if (nrules == 0)
2821 ch_collseq = collseqmb[ch];
2822 else
25337753 2823 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
15a7d175
UD
2824 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2825 bitset_set (sbcset, ch);
2826 }
3b0bdc72
UD
2827 return REG_NOERROR;
2828 }
3b0bdc72 2829
434d3784 2830 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2831 Build the collating element which is represented by NAME.
2832 The result are written to MBCSET and SBCSET.
2833 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2834 pointer argument sinse we may update it. */
2835
2836 static inline reg_errcode_t
25337753 2837 __attribute ((always_inline))
c0a0f9a3
UD
2838# ifdef RE_ENABLE_I18N
2839 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
15a7d175
UD
2840 re_charset_t *mbcset;
2841 int *coll_sym_alloc;
c0a0f9a3
UD
2842# else /* not RE_ENABLE_I18N */
2843 build_collating_symbol (sbcset, name)
2844# endif /* not RE_ENABLE_I18N */
15a7d175
UD
2845 re_bitset_ptr_t sbcset;
2846 const unsigned char *name;
3b0bdc72 2847 {
3b0bdc72 2848 int32_t elem, idx;
62439eac 2849 size_t name_len = strlen ((const char *) name);
3b0bdc72 2850 if (nrules != 0)
15a7d175
UD
2851 {
2852 elem = seek_collating_symbol_entry (name, name_len);
2853 if (symb_table[2 * elem] != 0)
2854 {
2855 /* We found the entry. */
2856 idx = symb_table[2 * elem + 1];
2857 /* Skip the name of collating element name. */
2858 idx += 1 + extra[idx];
2859 }
2860 else if (symb_table[2 * elem] == 0 && name_len == 1)
2861 {
2862 /* No valid character, treat it as a normal
2863 character. */
2864 bitset_set (sbcset, name[0]);
2865 return REG_NOERROR;
2866 }
2867 else
2868 return REG_ECOLLATE;
3b0bdc72 2869
c0a0f9a3 2870# ifdef RE_ENABLE_I18N
15a7d175
UD
2871 /* Got valid collation sequence, add it as a new entry. */
2872 /* Check the space of the arrays. */
951d6408 2873 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
15a7d175
UD
2874 {
2875 /* Not enough, realloc it. */
2876 /* +1 in case of mbcset->ncoll_syms is 0. */
951d6408 2877 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
15a7d175
UD
2878 /* Use realloc since mbcset->coll_syms is NULL
2879 if *alloc == 0. */
951d6408
UD
2880 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2881 new_coll_sym_alloc);
2882 if (BE (new_coll_syms == NULL, 0))
15a7d175 2883 return REG_ESPACE;
951d6408
UD
2884 mbcset->coll_syms = new_coll_syms;
2885 *coll_sym_alloc = new_coll_sym_alloc;
15a7d175
UD
2886 }
2887 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
c0a0f9a3 2888# endif /* RE_ENABLE_I18N */
15a7d175
UD
2889 return REG_NOERROR;
2890 }
3b0bdc72 2891 else
15a7d175
UD
2892 {
2893 if (BE (name_len != 1, 0))
2894 return REG_ECOLLATE;
2895 else
2896 {
2897 bitset_set (sbcset, name[0]);
2898 return REG_NOERROR;
2899 }
2900 }
3b0bdc72 2901 }
434d3784
UD
2902#endif
2903
3b0bdc72
UD
2904 re_token_t br_token;
2905 re_bitset_ptr_t sbcset;
c0a0f9a3 2906#ifdef RE_ENABLE_I18N
3b0bdc72 2907 re_charset_t *mbcset;
3b0bdc72
UD
2908 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2909 int equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3
UD
2910#else /* not RE_ENABLE_I18N */
2911 int non_match = 0;
2912#endif /* not RE_ENABLE_I18N */
2913 bin_tree_t *work_tree;
ee70274a 2914 int token_len;
3b0bdc72 2915#ifdef _LIBC
62439eac
UD
2916 collseqmb = (const unsigned char *)
2917 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3b0bdc72
UD
2918 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2919 if (nrules)
2920 {
2921 /*
2922 if (MB_CUR_MAX > 1)
2923 */
15a7d175 2924 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3b0bdc72
UD
2925 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2926 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 2927 _NL_COLLATE_SYMB_TABLEMB);
3b0bdc72 2928 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 2929 _NL_COLLATE_SYMB_EXTRAMB);
3b0bdc72
UD
2930 }
2931#endif
2932 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 2933#ifdef RE_ENABLE_I18N
3b0bdc72 2934 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
2935#endif /* RE_ENABLE_I18N */
2936#ifdef RE_ENABLE_I18N
bc15410e 2937 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
2938#else
2939 if (BE (sbcset == NULL, 0))
2940#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2941 {
2942 *err = REG_ESPACE;
2943 return NULL;
2944 }
2945
2946 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2947 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2948 {
3b0bdc72 2949 *err = REG_BADPAT;
434d3784 2950 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2951 }
2952 if (token->type == OP_NON_MATCH_LIST)
2953 {
c0a0f9a3 2954#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2955 int i;
2956 mbcset->non_match = 1;
c0a0f9a3
UD
2957#else /* not RE_ENABLE_I18N */
2958 non_match = 1;
2959#endif /* not RE_ENABLE_I18N */
3b0bdc72 2960 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 2961 bitset_set (sbcset, '\0');
3b0bdc72
UD
2962 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2963 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2964 if (BE (token->type == END_OF_RE, 0))
15a7d175
UD
2965 {
2966 *err = REG_BADPAT;
2967 goto parse_bracket_exp_free_return;
2968 }
c0a0f9a3 2969#ifdef RE_ENABLE_I18N
3c0fb574 2970 if (dfa->mb_cur_max > 1)
15a7d175
UD
2971 for (i = 0; i < SBC_MAX; ++i)
2972 if (__btowc (i) == WEOF)
2973 bitset_set (sbcset, i);
c0a0f9a3 2974#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2975 }
2976
2977 /* We treat the first ']' as a normal character. */
2978 if (token->type == OP_CLOSE_BRACKET)
2979 token->type = CHARACTER;
2980
78d8b07a 2981 int first_round = 1;
3b0bdc72
UD
2982 while (1)
2983 {
2984 bracket_elem_t start_elem, end_elem;
62439eac
UD
2985 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2986 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3b0bdc72
UD
2987 reg_errcode_t ret;
2988 int token_len2 = 0, is_range_exp = 0;
2989 re_token_t token2;
2990
2991 start_elem.opr.name = start_name_buf;
2992 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
78d8b07a 2993 syntax, first_round);
bc15410e 2994 if (BE (ret != REG_NOERROR, 0))
15a7d175
UD
2995 {
2996 *err = ret;
2997 goto parse_bracket_exp_free_return;
2998 }
78d8b07a 2999 first_round = 0;
3b0bdc72 3000
78d8b07a 3001 /* Get information about the next token. We need it in any case. */
3b0bdc72 3002 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a
UD
3003
3004 /* Do not check for ranges if we know they are not allowed. */
3005 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
15a7d175 3006 {
15a7d175
UD
3007 if (BE (token->type == END_OF_RE, 0))
3008 {
78d8b07a 3009 *err = REG_EBRACK;
15a7d175
UD
3010 goto parse_bracket_exp_free_return;
3011 }
78d8b07a 3012 if (token->type == OP_CHARSET_RANGE)
15a7d175 3013 {
78d8b07a
UD
3014 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3015 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3016 if (BE (token2.type == END_OF_RE, 0))
3017 {
3018 *err = REG_EBRACK;
3019 goto parse_bracket_exp_free_return;
3020 }
3021 if (token2.type == OP_CLOSE_BRACKET)
3022 {
3023 /* We treat the last '-' as a normal character. */
3024 re_string_skip_bytes (regexp, -token_len);
3025 token->type = CHARACTER;
3026 }
3027 else
3028 is_range_exp = 1;
15a7d175 3029 }
15a7d175 3030 }
3b0bdc72
UD
3031
3032 if (is_range_exp == 1)
15a7d175
UD
3033 {
3034 end_elem.opr.name = end_name_buf;
3035 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
78d8b07a 3036 dfa, syntax, 1);
15a7d175
UD
3037 if (BE (ret != REG_NOERROR, 0))
3038 {
3039 *err = ret;
3040 goto parse_bracket_exp_free_return;
3041 }
3042
3043 token_len = peek_token_bracket (token, regexp, syntax);
78d8b07a 3044
15a7d175 3045 *err = build_range_exp (sbcset,
c0a0f9a3 3046#ifdef RE_ENABLE_I18N
15a7d175 3047 mbcset, &range_alloc,
c0a0f9a3 3048#endif /* RE_ENABLE_I18N */
15a7d175
UD
3049 &start_elem, &end_elem);
3050 if (BE (*err != REG_NOERROR, 0))
3051 goto parse_bracket_exp_free_return;
3052 }
3b0bdc72 3053 else
15a7d175
UD
3054 {
3055 switch (start_elem.type)
3056 {
3057 case SB_CHAR:
3058 bitset_set (sbcset, start_elem.opr.ch);
3059 break;
c0a0f9a3 3060#ifdef RE_ENABLE_I18N
15a7d175
UD
3061 case MB_CHAR:
3062 /* Check whether the array has enough space. */
951d6408 3063 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
15a7d175 3064 {
951d6408 3065 wchar_t *new_mbchars;
15a7d175
UD
3066 /* Not enough, realloc it. */
3067 /* +1 in case of mbcset->nmbchars is 0. */
3068 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3069 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3070 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3071 mbchar_alloc);
3072 if (BE (new_mbchars == NULL, 0))
15a7d175 3073 goto parse_bracket_exp_espace;
951d6408 3074 mbcset->mbchars = new_mbchars;
15a7d175
UD
3075 }
3076 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3077 break;
c0a0f9a3 3078#endif /* RE_ENABLE_I18N */
15a7d175
UD
3079 case EQUIV_CLASS:
3080 *err = build_equiv_class (sbcset,
c0a0f9a3 3081#ifdef RE_ENABLE_I18N
15a7d175 3082 mbcset, &equiv_class_alloc,
c0a0f9a3 3083#endif /* RE_ENABLE_I18N */
3b0bdc72 3084 start_elem.opr.name);
15a7d175
UD
3085 if (BE (*err != REG_NOERROR, 0))
3086 goto parse_bracket_exp_free_return;
3087 break;
3088 case COLL_SYM:
3089 *err = build_collating_symbol (sbcset,
c0a0f9a3 3090#ifdef RE_ENABLE_I18N
15a7d175 3091 mbcset, &coll_sym_alloc,
c0a0f9a3 3092#endif /* RE_ENABLE_I18N */
3b0bdc72 3093 start_elem.opr.name);
15a7d175
UD
3094 if (BE (*err != REG_NOERROR, 0))
3095 goto parse_bracket_exp_free_return;
3096 break;
3097 case CHAR_CLASS:
66b110e8 3098 *err = build_charclass (regexp->trans, sbcset,
c0a0f9a3 3099#ifdef RE_ENABLE_I18N
7b7b9e70 3100 mbcset, &char_class_alloc,
c0a0f9a3 3101#endif /* RE_ENABLE_I18N */
7b7b9e70
UD
3102 start_elem.opr.name, syntax);
3103 if (BE (*err != REG_NOERROR, 0))
3104 goto parse_bracket_exp_free_return;
15a7d175
UD
3105 break;
3106 default:
3107 assert (0);
3108 break;
3109 }
3110 }
78d8b07a
UD
3111 if (BE (token->type == END_OF_RE, 0))
3112 {
3113 *err = REG_EBRACK;
3114 goto parse_bracket_exp_free_return;
3115 }
3b0bdc72 3116 if (token->type == OP_CLOSE_BRACKET)
15a7d175 3117 break;
3b0bdc72
UD
3118 }
3119
3120 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3121
3122 /* If it is non-matching list. */
c0a0f9a3 3123#ifdef RE_ENABLE_I18N
3b0bdc72 3124 if (mbcset->non_match)
c0a0f9a3
UD
3125#else /* not RE_ENABLE_I18N */
3126 if (non_match)
3127#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3128 bitset_not (sbcset);
3129
3130 /* Build a tree for simple bracket. */
3131 br_token.type = SIMPLE_BRACKET;
3132 br_token.opr.sbcset = sbcset;
c5c9c929 3133 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3134 if (BE (work_tree == NULL, 0))
3b0bdc72
UD
3135 goto parse_bracket_exp_espace;
3136
c0a0f9a3 3137#ifdef RE_ENABLE_I18N
3b0bdc72 3138 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3c0fb574
UD
3139 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3140 || mbcset->non_match)))
3b0bdc72
UD
3141 {
3142 re_token_t alt_token;
3143 bin_tree_t *mbc_tree;
ad7f28c2 3144 int sbc_idx;
3b0bdc72 3145 /* Build a tree for complex bracket. */
ad7f28c2
UD
3146 dfa->has_mb_node = 1;
3147 dfa->has_plural_match = 1;
3148 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3149 if (sbcset[sbc_idx])
3150 break;
3151 /* If there are no bits set in sbcset, there is no point
3152 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3153 if (sbc_idx == BITSET_UINTS)
3154 {
3155 re_free (sbcset);
ee70274a
UD
3156 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3157 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
ad7f28c2
UD
3158 return work_tree;
3159 }
3b0bdc72
UD
3160 br_token.type = COMPLEX_BRACKET;
3161 br_token.opr.mbcset = mbcset;
c5c9c929 3162 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3163 if (BE (mbc_tree == NULL, 0))
15a7d175 3164 goto parse_bracket_exp_espace;
3b0bdc72
UD
3165 /* Then join them by ALT node. */
3166 alt_token.type = OP_ALT;
c5c9c929 3167 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
ee70274a 3168 if (BE (mbc_tree != NULL, 1))
15a7d175 3169 return work_tree;
3b0bdc72
UD
3170 }
3171 else
3172 {
3173 free_charset (mbcset);
3174 return work_tree;
3175 }
c0a0f9a3
UD
3176#else /* not RE_ENABLE_I18N */
3177 return work_tree;
3178#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3179
3180 parse_bracket_exp_espace:
3b0bdc72 3181 *err = REG_ESPACE;
434d3784
UD
3182 parse_bracket_exp_free_return:
3183 re_free (sbcset);
c0a0f9a3 3184#ifdef RE_ENABLE_I18N
434d3784 3185 free_charset (mbcset);
c0a0f9a3 3186#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3187 return NULL;
3188}
3189
434d3784
UD
3190/* Parse an element in the bracket expression. */
3191
3b0bdc72 3192static reg_errcode_t
78d8b07a
UD
3193parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3194 accept_hyphen)
3b0bdc72
UD
3195 bracket_elem_t *elem;
3196 re_string_t *regexp;
3197 re_token_t *token;
3198 int token_len;
3199 re_dfa_t *dfa;
3200 reg_syntax_t syntax;
78d8b07a 3201 int accept_hyphen;
3b0bdc72
UD
3202{
3203#ifdef RE_ENABLE_I18N
3204 int cur_char_size;
3205 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3206 if (cur_char_size > 1)
3207 {
3208 elem->type = MB_CHAR;
3209 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3210 re_string_skip_bytes (regexp, cur_char_size);
3211 return REG_NOERROR;
3212 }
3213#endif /* RE_ENABLE_I18N */
3214 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3215 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3216 || token->type == OP_OPEN_EQUIV_CLASS)
3217 return parse_bracket_symbol (elem, regexp, token);
78d8b07a
UD
3218 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3219 {
3220 /* A '-' must only appear as anything but a range indicator before
3221 the closing bracket. Everything else is an error. */
3222 re_token_t token2;
3223 (void) peek_token_bracket (&token2, regexp, syntax);
3224 if (token2.type != OP_CLOSE_BRACKET)
3225 /* The actual error value is not standardized since this whole
3226 case is undefined. But ERANGE makes good sense. */
3227 return REG_ERANGE;
3228 }
3b0bdc72
UD
3229 elem->type = SB_CHAR;
3230 elem->opr.ch = token->opr.c;
3231 return REG_NOERROR;
3232}
3233
434d3784
UD
3234/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3235 such as [:<character_class>:], [.<collating_element>.], and
3236 [=<equivalent_class>=]. */
3237
3b0bdc72
UD
3238static reg_errcode_t
3239parse_bracket_symbol (elem, regexp, token)
3240 bracket_elem_t *elem;
3241 re_string_t *regexp;
3242 re_token_t *token;
3243{
3244 unsigned char ch, delim = token->opr.c;
3245 int i = 0;
602c2f9d 3246 for (;; ++i)
3b0bdc72 3247 {
602c2f9d 3248 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
15a7d175 3249 return REG_EBRACK;
3b0bdc72 3250 if (token->type == OP_OPEN_CHAR_CLASS)
15a7d175 3251 ch = re_string_fetch_byte_case (regexp);
3b0bdc72 3252 else
15a7d175 3253 ch = re_string_fetch_byte (regexp);
3b0bdc72 3254 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
15a7d175 3255 break;
3b0bdc72
UD
3256 elem->opr.name[i] = ch;
3257 }
3258 re_string_skip_bytes (regexp, 1);
3259 elem->opr.name[i] = '\0';
3260 switch (token->type)
3261 {
3262 case OP_OPEN_COLL_ELEM:
3263 elem->type = COLL_SYM;
3264 break;
3265 case OP_OPEN_EQUIV_CLASS:
3266 elem->type = EQUIV_CLASS;
3267 break;
3268 case OP_OPEN_CHAR_CLASS:
3269 elem->type = CHAR_CLASS;
3270 break;
3271 default:
3272 break;
3273 }
3274 return REG_NOERROR;
3275}
3276
3277 /* Helper function for parse_bracket_exp.
3278 Build the equivalence class which is represented by NAME.
3279 The result are written to MBCSET and SBCSET.
3280 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3281 is a pointer argument sinse we may update it. */
3282
3283static reg_errcode_t
c0a0f9a3
UD
3284#ifdef RE_ENABLE_I18N
3285build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3b0bdc72 3286 re_charset_t *mbcset;
3b0bdc72 3287 int *equiv_class_alloc;
c0a0f9a3
UD
3288#else /* not RE_ENABLE_I18N */
3289build_equiv_class (sbcset, name)
3290#endif /* not RE_ENABLE_I18N */
3291 re_bitset_ptr_t sbcset;
62439eac 3292 const unsigned char *name;
3b0bdc72 3293{
c0a0f9a3 3294#if defined _LIBC && defined RE_ENABLE_I18N
3b0bdc72
UD
3295 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3296 if (nrules != 0)
3297 {
3298 const int32_t *table, *indirect;
3299 const unsigned char *weights, *extra, *cp;
62439eac 3300 unsigned char char_buf[2];
3b0bdc72
UD
3301 int32_t idx1, idx2;
3302 unsigned int ch;
3303 size_t len;
3304 /* This #include defines a local function! */
3305# include <locale/weight.h>
3306 /* Calculate the index for equivalence class. */
62439eac 3307 cp = name;
3b0bdc72
UD
3308 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3309 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3310 _NL_COLLATE_WEIGHTMB);
3311 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
15a7d175 3312 _NL_COLLATE_EXTRAMB);
3b0bdc72 3313 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
15a7d175 3314 _NL_COLLATE_INDIRECTMB);
3b0bdc72 3315 idx1 = findidx (&cp);
62439eac 3316 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
15a7d175
UD
3317 /* This isn't a valid character. */
3318 return REG_ECOLLATE;
3b0bdc72
UD
3319
3320 /* Build single byte matcing table for this equivalence class. */
62439eac 3321 char_buf[1] = (unsigned char) '\0';
3b0bdc72
UD
3322 len = weights[idx1];
3323 for (ch = 0; ch < SBC_MAX; ++ch)
15a7d175
UD
3324 {
3325 char_buf[0] = ch;
3326 cp = char_buf;
3327 idx2 = findidx (&cp);
3b0bdc72 3328/*
15a7d175 3329 idx2 = table[ch];
3b0bdc72 3330*/
15a7d175
UD
3331 if (idx2 == 0)
3332 /* This isn't a valid character. */
3333 continue;
3334 if (len == weights[idx2])
3335 {
3336 int cnt = 0;
3337 while (cnt <= len &&
3338 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3339 ++cnt;
3340
3341 if (cnt > len)
3342 bitset_set (sbcset, ch);
3343 }
3344 }
a9388965 3345 /* Check whether the array has enough space. */
951d6408 3346 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
15a7d175
UD
3347 {
3348 /* Not enough, realloc it. */
3349 /* +1 in case of mbcset->nequiv_classes is 0. */
951d6408 3350 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
15a7d175 3351 /* Use realloc since the array is NULL if *alloc == 0. */
951d6408
UD
3352 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3353 int32_t,
3354 new_equiv_class_alloc);
3355 if (BE (new_equiv_classes == NULL, 0))
15a7d175 3356 return REG_ESPACE;
951d6408
UD
3357 mbcset->equiv_classes = new_equiv_classes;
3358 *equiv_class_alloc = new_equiv_class_alloc;
15a7d175 3359 }
3b0bdc72
UD
3360 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3361 }
3362 else
c0a0f9a3 3363#endif /* _LIBC && RE_ENABLE_I18N */
3b0bdc72 3364 {
62439eac 3365 if (BE (strlen ((const char *) name) != 1, 0))
15a7d175 3366 return REG_ECOLLATE;
62439eac 3367 bitset_set (sbcset, *name);
3b0bdc72
UD
3368 }
3369 return REG_NOERROR;
3370}
3371
3372 /* Helper function for parse_bracket_exp.
3373 Build the character class which is represented by NAME.
3374 The result are written to MBCSET and SBCSET.
3375 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3376 is a pointer argument sinse we may update it. */
3377
3378static reg_errcode_t
c0a0f9a3 3379#ifdef RE_ENABLE_I18N
66b110e8 3380build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3b0bdc72 3381 re_charset_t *mbcset;
3b0bdc72 3382 int *char_class_alloc;
c0a0f9a3 3383#else /* not RE_ENABLE_I18N */
66b110e8 3384build_charclass (trans, sbcset, class_name, syntax)
c0a0f9a3 3385#endif /* not RE_ENABLE_I18N */
66b110e8 3386 RE_TRANSLATE_TYPE trans;
c0a0f9a3 3387 re_bitset_ptr_t sbcset;
62439eac 3388 const unsigned char *class_name;
602c2f9d 3389 reg_syntax_t syntax;
3b0bdc72
UD
3390{
3391 int i;
62439eac 3392 const char *name = (const char *) class_name;
c0a0f9a3
UD
3393
3394 /* In case of REG_ICASE "upper" and "lower" match the both of
3395 upper and lower cases. */
3396 if ((syntax & RE_ICASE)
3397 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3398 name = "alpha";
3399
3400#ifdef RE_ENABLE_I18N
3b0bdc72 3401 /* Check the space of the arrays. */
951d6408 3402 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
a9388965
UD
3403 {
3404 /* Not enough, realloc it. */
3405 /* +1 in case of mbcset->nchar_classes is 0. */
951d6408 3406 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
a9388965 3407 /* Use realloc since array is NULL if *alloc == 0. */
951d6408
UD
3408 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3409 new_char_class_alloc);
3410 if (BE (new_char_classes == NULL, 0))
15a7d175 3411 return REG_ESPACE;
951d6408
UD
3412 mbcset->char_classes = new_char_classes;
3413 *char_class_alloc = new_char_class_alloc;
a9388965 3414 }
3b0bdc72 3415 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3416#endif /* RE_ENABLE_I18N */
3b0bdc72 3417
66b110e8
UD
3418#define BUILD_CHARCLASS_LOOP(ctype_func) \
3419 for (i = 0; i < SBC_MAX; ++i) \
3420 { \
3421 if (ctype_func (i)) \
3422 { \
3423 int ch = trans ? trans[i] : i; \
3424 bitset_set (sbcset, ch); \
3425 } \
3b0bdc72
UD
3426 }
3427
3428 if (strcmp (name, "alnum") == 0)
3429 BUILD_CHARCLASS_LOOP (isalnum)
3430 else if (strcmp (name, "cntrl") == 0)
3431 BUILD_CHARCLASS_LOOP (iscntrl)
3432 else if (strcmp (name, "lower") == 0)
3433 BUILD_CHARCLASS_LOOP (islower)
3434 else if (strcmp (name, "space") == 0)
3435 BUILD_CHARCLASS_LOOP (isspace)
3436 else if (strcmp (name, "alpha") == 0)
3437 BUILD_CHARCLASS_LOOP (isalpha)
3438 else if (strcmp (name, "digit") == 0)
3439 BUILD_CHARCLASS_LOOP (isdigit)
3440 else if (strcmp (name, "print") == 0)
3441 BUILD_CHARCLASS_LOOP (isprint)
3442 else if (strcmp (name, "upper") == 0)
3443 BUILD_CHARCLASS_LOOP (isupper)
3444 else if (strcmp (name, "blank") == 0)
3445 BUILD_CHARCLASS_LOOP (isblank)
3446 else if (strcmp (name, "graph") == 0)
3447 BUILD_CHARCLASS_LOOP (isgraph)
3448 else if (strcmp (name, "punct") == 0)
3449 BUILD_CHARCLASS_LOOP (ispunct)
3450 else if (strcmp (name, "xdigit") == 0)
3451 BUILD_CHARCLASS_LOOP (isxdigit)
3452 else
3453 return REG_ECTYPE;
3454
3455 return REG_NOERROR;
3456}
3457
3458static bin_tree_t *
e2b6bfa3 3459build_charclass_op (dfa, trans, class_name, extra, not, err)
3b0bdc72 3460 re_dfa_t *dfa;
66b110e8 3461 RE_TRANSLATE_TYPE trans;
e2b6bfa3
UD
3462 const unsigned char *class_name;
3463 const unsigned char *extra;
3b0bdc72
UD
3464 int not;
3465 reg_errcode_t *err;
3466{
3467 re_bitset_ptr_t sbcset;
c0a0f9a3 3468#ifdef RE_ENABLE_I18N
3b0bdc72 3469 re_charset_t *mbcset;
c0a0f9a3
UD
3470 int alloc = 0;
3471#else /* not RE_ENABLE_I18N */
3472 int non_match = 0;
3473#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3474 reg_errcode_t ret;
3475 re_token_t br_token;
3476 bin_tree_t *tree;
3b0bdc72
UD
3477
3478 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 3479#ifdef RE_ENABLE_I18N
3b0bdc72 3480 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3481#endif /* RE_ENABLE_I18N */
3482
3483#ifdef RE_ENABLE_I18N
bc15410e 3484 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3485#else /* not RE_ENABLE_I18N */
3486 if (BE (sbcset == NULL, 0))
3487#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3488 {
3489 *err = REG_ESPACE;
3490 return NULL;
3491 }
3492
3493 if (not)
3494 {
c0a0f9a3 3495#ifdef RE_ENABLE_I18N
3b0bdc72 3496 int i;
3b0bdc72
UD
3497 /*
3498 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
15a7d175 3499 bitset_set(cset->sbcset, '\0');
3b0bdc72 3500 */
c0a0f9a3 3501 mbcset->non_match = 1;
3c0fb574 3502 if (dfa->mb_cur_max > 1)
15a7d175
UD
3503 for (i = 0; i < SBC_MAX; ++i)
3504 if (__btowc (i) == WEOF)
3505 bitset_set (sbcset, i);
c0a0f9a3
UD
3506#else /* not RE_ENABLE_I18N */
3507 non_match = 1;
3508#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3509 }
3510
602c2f9d 3511 /* We don't care the syntax in this case. */
66b110e8 3512 ret = build_charclass (trans, sbcset,
c0a0f9a3 3513#ifdef RE_ENABLE_I18N
15a7d175 3514 mbcset, &alloc,
c0a0f9a3 3515#endif /* RE_ENABLE_I18N */
e2b6bfa3 3516 class_name, 0);
c0a0f9a3 3517
bc15410e 3518 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3519 {
3520 re_free (sbcset);
c0a0f9a3 3521#ifdef RE_ENABLE_I18N
3b0bdc72 3522 free_charset (mbcset);
c0a0f9a3 3523#endif /* RE_ENABLE_I18N */
7b7b9e70 3524 *err = ret;
3b0bdc72
UD
3525 return NULL;
3526 }
434d3784 3527 /* \w match '_' also. */
e2b6bfa3
UD
3528 for (; *extra; extra++)
3529 bitset_set (sbcset, *extra);
3b0bdc72
UD
3530
3531 /* If it is non-matching list. */
c0a0f9a3 3532#ifdef RE_ENABLE_I18N
3b0bdc72 3533 if (mbcset->non_match)
c0a0f9a3
UD
3534#else /* not RE_ENABLE_I18N */
3535 if (non_match)
3536#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3537 bitset_not (sbcset);
3538
3539 /* Build a tree for simple bracket. */
3540 br_token.type = SIMPLE_BRACKET;
3541 br_token.opr.sbcset = sbcset;
c5c9c929 3542 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3543 if (BE (tree == NULL, 0))
3b0bdc72
UD
3544 goto build_word_op_espace;
3545
c0a0f9a3 3546#ifdef RE_ENABLE_I18N
3c0fb574 3547 if (dfa->mb_cur_max > 1)
3b0bdc72
UD
3548 {
3549 re_token_t alt_token;
3550 bin_tree_t *mbc_tree;
3551 /* Build a tree for complex bracket. */
3552 br_token.type = COMPLEX_BRACKET;
3553 br_token.opr.mbcset = mbcset;
3554 dfa->has_mb_node = 1;
c5c9c929 3555 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
ee70274a 3556 if (BE (mbc_tree == NULL, 0))
15a7d175 3557 goto build_word_op_espace;
3b0bdc72
UD
3558 /* Then join them by ALT node. */
3559 alt_token.type = OP_ALT;
c5c9c929 3560 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
ee70274a 3561 if (BE (mbc_tree != NULL, 1))
15a7d175 3562 return tree;
3b0bdc72
UD
3563 }
3564 else
3565 {
3566 free_charset (mbcset);
3567 return tree;
3568 }
c0a0f9a3
UD
3569#else /* not RE_ENABLE_I18N */
3570 return tree;
3571#endif /* not RE_ENABLE_I18N */
3572
3b0bdc72
UD
3573 build_word_op_espace:
3574 re_free (sbcset);
c0a0f9a3 3575#ifdef RE_ENABLE_I18N
3b0bdc72 3576 free_charset (mbcset);
c0a0f9a3 3577#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3578 *err = REG_ESPACE;
3579 return NULL;
3580}
3581
3582/* This is intended for the expressions like "a{1,3}".
3583 Fetch a number from `input', and return the number.
3584 Return -1, if the number field is empty like "{,1}".
3585 Return -2, If an error is occured. */
3586
3587static int
3588fetch_number (input, token, syntax)
3589 re_string_t *input;
3590 re_token_t *token;
3591 reg_syntax_t syntax;
3592{
3593 int num = -1;
3594 unsigned char c;
3595 while (1)
3596 {
f0d77aa8 3597 fetch_token (token, input, syntax);
3b0bdc72 3598 c = token->opr.c;
602c2f9d 3599 if (BE (token->type == END_OF_RE, 0))
15a7d175 3600 return -2;
3b0bdc72 3601 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
15a7d175 3602 break;
602c2f9d 3603 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
15a7d175 3604 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
602c2f9d 3605 num = (num > RE_DUP_MAX) ? -2 : num;
3b0bdc72 3606 }
3b0bdc72
UD
3607 return num;
3608}
3609\f
c0a0f9a3 3610#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3611static void
3612free_charset (re_charset_t *cset)
3613{
3614 re_free (cset->mbchars);
c0a0f9a3 3615# ifdef _LIBC
3b0bdc72
UD
3616 re_free (cset->coll_syms);
3617 re_free (cset->equiv_classes);
3618 re_free (cset->range_starts);
3619 re_free (cset->range_ends);
c0a0f9a3 3620# endif
3b0bdc72
UD
3621 re_free (cset->char_classes);
3622 re_free (cset);
3623}
c0a0f9a3 3624#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3625\f
3626/* Functions for binary tree operation. */
3627
ee70274a 3628/* Create a tree node. */
3b0bdc72
UD
3629
3630static bin_tree_t *
ee70274a
UD
3631create_tree (dfa, left, right, type, index)
3632 re_dfa_t *dfa;
3b0bdc72
UD
3633 bin_tree_t *left;
3634 bin_tree_t *right;
3635 re_token_type_t type;
3636 int index;
3637{
3638 bin_tree_t *tree;
ee70274a 3639 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3b0bdc72 3640 {
ee70274a
UD
3641 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3642
3643 if (storage == NULL)
3644 return NULL;
3645 storage->next = dfa->str_tree_storage;
3646 dfa->str_tree_storage = storage;
3647 dfa->str_tree_storage_idx = 0;
3b0bdc72 3648 }
ee70274a
UD
3649 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3650
3b0bdc72
UD
3651 tree->parent = NULL;
3652 tree->left = left;
3653 tree->right = right;
3654 tree->type = type;
3655 tree->node_idx = index;
3656 tree->first = -1;
3657 tree->next = -1;
3658 re_node_set_init_empty (&tree->eclosure);
3659
3660 if (left != NULL)
3661 left->parent = tree;
3662 if (right != NULL)
3663 right->parent = tree;
3664 return tree;
3665}
3666
ee70274a 3667/* Create both a DFA node and a tree for it. */
3b0bdc72 3668
ee70274a
UD
3669static bin_tree_t *
3670re_dfa_add_tree_node (dfa, left, right, token)
3671 re_dfa_t *dfa;
3672 bin_tree_t *left;
3673 bin_tree_t *right;
c5c9c929 3674 const re_token_t *token;
f0d77aa8 3675{
c5c9c929 3676 int new_idx = re_dfa_add_node (dfa, *token, 0);
ee70274a
UD
3677
3678 if (new_idx == -1)
3679 return NULL;
3680
3681 return create_tree (dfa, left, right, 0, new_idx);
3b0bdc72
UD
3682}
3683
ee70274a 3684
3b0bdc72
UD
3685/* Duplicate the node SRC, and return new node. */
3686
3687static bin_tree_t *
3688duplicate_tree (src, dfa)
3689 const bin_tree_t *src;
3690 re_dfa_t *dfa;
3691{
3692 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3693 int new_node_idx;
3694 /* Since node indies must be according to Post-order of the tree,
3695 we must duplicate the left at first. */
3696 if (src->left != NULL)
3697 {
3698 left = duplicate_tree (src->left, dfa);
3699 if (left == NULL)
15a7d175 3700 return NULL;
3b0bdc72
UD
3701 }
3702
3703 /* Secondaly, duplicate the right. */
3704 if (src->right != NULL)
3705 {
3706 right = duplicate_tree (src->right, dfa);
3707 if (right == NULL)
ee70274a 3708 return NULL;
3b0bdc72
UD
3709 }
3710
3711 /* At last, duplicate itself. */
3712 if (src->type == NON_TYPE)
3713 {
3714 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3715 dfa->nodes[new_node_idx].duplicated = 1;
bc15410e 3716 if (BE (new_node_idx == -1, 0))
ee70274a 3717 return NULL;
3b0bdc72
UD
3718 }
3719 else
3720 new_node_idx = src->type;
3721
ee70274a 3722 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3b0bdc72
UD
3723 return new_tree;
3724}