]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
Update.
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (re_dfa_t *dfa);
37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
42 int top_clone_node, int root_node,
43 unsigned int constraint);
44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
45 unsigned int constraint);
46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
50 int node, int root);
51 static void calc_inveclosure (re_dfa_t *dfa);
52 static int fetch_number (re_string_t *input, re_token_t *token,
53 reg_syntax_t syntax);
54 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
55 static int peek_token (re_token_t *token, re_string_t *input,
56 reg_syntax_t syntax);
57 static int peek_token_bracket (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax);
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 int accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifndef _LIBC
89 # ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
91 re_charset_t *mbcset, int *range_alloc,
92 bracket_elem_t *start_elem,
93 bracket_elem_t *end_elem);
94 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
95 re_charset_t *mbcset,
96 int *coll_sym_alloc,
97 const unsigned char *name);
98 # else /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
100 bracket_elem_t *start_elem,
101 bracket_elem_t *end_elem);
102 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
103 const unsigned char *name);
104 # endif /* not RE_ENABLE_I18N */
105 #endif /* not _LIBC */
106 #ifdef RE_ENABLE_I18N
107 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
108 re_charset_t *mbcset,
109 int *equiv_class_alloc,
110 const unsigned char *name);
111 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
112 re_bitset_ptr_t sbcset,
113 re_charset_t *mbcset,
114 int *char_class_alloc,
115 const unsigned char *class_name,
116 reg_syntax_t syntax);
117 #else /* not RE_ENABLE_I18N */
118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119 const unsigned char *name);
120 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
121 re_bitset_ptr_t sbcset,
122 const unsigned char *class_name,
123 reg_syntax_t syntax);
124 #endif /* not RE_ENABLE_I18N */
125 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
126 const unsigned char *class_name,
127 const unsigned char *extra, int not,
128 reg_errcode_t *err);
129 static void free_bin_tree (bin_tree_t *tree);
130 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
131 re_token_type_t type, int index);
132 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
133 \f
134 /* This table gives an error message for each of the error codes listed
135 in regex.h. Obviously the order here has to be same as there.
136 POSIX doesn't require that we do anything for REG_NOERROR,
137 but why not be nice? */
138
139 const char __re_error_msgid[] attribute_hidden =
140 {
141 #define REG_NOERROR_IDX 0
142 gettext_noop ("Success") /* REG_NOERROR */
143 "\0"
144 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
145 gettext_noop ("No match") /* REG_NOMATCH */
146 "\0"
147 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
148 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
149 "\0"
150 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
151 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
152 "\0"
153 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
154 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
155 "\0"
156 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
157 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
158 "\0"
159 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
160 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
161 "\0"
162 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
163 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
164 "\0"
165 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
166 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
167 "\0"
168 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
169 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
170 "\0"
171 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
172 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
173 "\0"
174 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
175 gettext_noop ("Invalid range end") /* REG_ERANGE */
176 "\0"
177 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
178 gettext_noop ("Memory exhausted") /* REG_ESPACE */
179 "\0"
180 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
181 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
182 "\0"
183 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
184 gettext_noop ("Premature end of regular expression") /* REG_EEND */
185 "\0"
186 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
187 gettext_noop ("Regular expression too big") /* REG_ESIZE */
188 "\0"
189 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
190 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
191 };
192
193 const size_t __re_error_msgid_idx[] attribute_hidden =
194 {
195 REG_NOERROR_IDX,
196 REG_NOMATCH_IDX,
197 REG_BADPAT_IDX,
198 REG_ECOLLATE_IDX,
199 REG_ECTYPE_IDX,
200 REG_EESCAPE_IDX,
201 REG_ESUBREG_IDX,
202 REG_EBRACK_IDX,
203 REG_EPAREN_IDX,
204 REG_EBRACE_IDX,
205 REG_BADBR_IDX,
206 REG_ERANGE_IDX,
207 REG_ESPACE_IDX,
208 REG_BADRPT_IDX,
209 REG_EEND_IDX,
210 REG_ESIZE_IDX,
211 REG_ERPAREN_IDX
212 };
213 \f
214 /* Entry points for GNU code. */
215
216 /* re_compile_pattern is the GNU regular expression compiler: it
217 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
218 Returns 0 if the pattern was valid, otherwise an error string.
219
220 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
221 are set in BUFP on entry. */
222
223 const char *
224 re_compile_pattern (pattern, length, bufp)
225 const char *pattern;
226 size_t length;
227 struct re_pattern_buffer *bufp;
228 {
229 reg_errcode_t ret;
230
231 /* And GNU code determines whether or not to get register information
232 by passing null for the REGS argument to re_match, etc., not by
233 setting no_sub. */
234 bufp->no_sub = 0;
235
236 /* Match anchors at newline. */
237 bufp->newline_anchor = 1;
238
239 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
240
241 if (!ret)
242 return NULL;
243 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
244 }
245 #ifdef _LIBC
246 weak_alias (__re_compile_pattern, re_compile_pattern)
247 #endif
248
249 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
250 also be assigned to arbitrarily: each pattern buffer stores its own
251 syntax, so it can be changed between regex compilations. */
252 /* This has no initializer because initialized variables in Emacs
253 become read-only after dumping. */
254 reg_syntax_t re_syntax_options;
255
256
257 /* Specify the precise syntax of regexps for compilation. This provides
258 for compatibility for various utilities which historically have
259 different, incompatible syntaxes.
260
261 The argument SYNTAX is a bit mask comprised of the various bits
262 defined in regex.h. We return the old syntax. */
263
264 reg_syntax_t
265 re_set_syntax (syntax)
266 reg_syntax_t syntax;
267 {
268 reg_syntax_t ret = re_syntax_options;
269
270 re_syntax_options = syntax;
271 return ret;
272 }
273 #ifdef _LIBC
274 weak_alias (__re_set_syntax, re_set_syntax)
275 #endif
276
277 int
278 re_compile_fastmap (bufp)
279 struct re_pattern_buffer *bufp;
280 {
281 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
282 char *fastmap = bufp->fastmap;
283
284 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
285 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
286 if (dfa->init_state != dfa->init_state_word)
287 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
288 if (dfa->init_state != dfa->init_state_nl)
289 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
290 if (dfa->init_state != dfa->init_state_begbuf)
291 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
292 bufp->fastmap_accurate = 1;
293 return 0;
294 }
295 #ifdef _LIBC
296 weak_alias (__re_compile_fastmap, re_compile_fastmap)
297 #endif
298
299 static inline void
300 __attribute ((always_inline))
301 re_set_fastmap (char *fastmap, int icase, int ch)
302 {
303 fastmap[ch] = 1;
304 if (icase)
305 fastmap[tolower (ch)] = 1;
306 }
307
308 /* Helper function for re_compile_fastmap.
309 Compile fastmap for the initial_state INIT_STATE. */
310
311 static void
312 re_compile_fastmap_iter (bufp, init_state, fastmap)
313 regex_t *bufp;
314 const re_dfastate_t *init_state;
315 char *fastmap;
316 {
317 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
318 int node_cnt;
319 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
320 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
321 {
322 int node = init_state->nodes.elems[node_cnt];
323 re_token_type_t type = dfa->nodes[node].type;
324
325 if (type == CHARACTER)
326 {
327 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
328 #ifdef RE_ENABLE_I18N
329 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
330 {
331 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
332 wchar_t wc;
333 mbstate_t state;
334
335 p = buf;
336 *p++ = dfa->nodes[node].opr.c;
337 while (++node < dfa->nodes_len
338 && dfa->nodes[node].type == CHARACTER
339 && dfa->nodes[node].mb_partial)
340 *p++ = dfa->nodes[node].opr.c;
341 memset (&state, 0, sizeof (state));
342 if (mbrtowc (&wc, (const char *) buf, p - buf,
343 &state) == p - buf
344 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
345 re_set_fastmap (fastmap, 0, buf[0]);
346 }
347 #endif
348 }
349 else if (type == SIMPLE_BRACKET)
350 {
351 int i, j, ch;
352 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
353 for (j = 0; j < UINT_BITS; ++j, ++ch)
354 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
355 re_set_fastmap (fastmap, icase, ch);
356 }
357 #ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
359 {
360 int i;
361 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
362 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
363 || cset->nranges || cset->nchar_classes)
364 {
365 # ifdef _LIBC
366 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
367 {
368 /* In this case we want to catch the bytes which are
369 the first byte of any collation elements.
370 e.g. In da_DK, we want to catch 'a' since "aa"
371 is a valid collation element, and don't catch
372 'b' since 'b' is the only collation element
373 which starts from 'b'. */
374 int j, ch;
375 const int32_t *table = (const int32_t *)
376 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
378 for (j = 0; j < UINT_BITS; ++j, ++ch)
379 if (table[ch] < 0)
380 re_set_fastmap (fastmap, icase, ch);
381 }
382 # else
383 if (dfa->mb_cur_max > 1)
384 for (i = 0; i < SBC_MAX; ++i)
385 if (__btowc (i) == WEOF)
386 re_set_fastmap (fastmap, icase, i);
387 # endif /* not _LIBC */
388 }
389 for (i = 0; i < cset->nmbchars; ++i)
390 {
391 char buf[256];
392 mbstate_t state;
393 memset (&state, '\0', sizeof (state));
394 __wcrtomb (buf, cset->mbchars[i], &state);
395 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
396 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
397 {
398 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
399 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
400 }
401 }
402 }
403 #endif /* RE_ENABLE_I18N */
404 else if (type == END_OF_RE || type == OP_PERIOD)
405 {
406 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
407 if (type == END_OF_RE)
408 bufp->can_be_null = 1;
409 return;
410 }
411 }
412 }
413 \f
414 /* Entry point for POSIX code. */
415 /* regcomp takes a regular expression as a string and compiles it.
416
417 PREG is a regex_t *. We do not expect any fields to be initialized,
418 since POSIX says we shouldn't. Thus, we set
419
420 `buffer' to the compiled pattern;
421 `used' to the length of the compiled pattern;
422 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
423 REG_EXTENDED bit in CFLAGS is set; otherwise, to
424 RE_SYNTAX_POSIX_BASIC;
425 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
426 `fastmap' to an allocated space for the fastmap;
427 `fastmap_accurate' to zero;
428 `re_nsub' to the number of subexpressions in PATTERN.
429
430 PATTERN is the address of the pattern string.
431
432 CFLAGS is a series of bits which affect compilation.
433
434 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
435 use POSIX basic syntax.
436
437 If REG_NEWLINE is set, then . and [^...] don't match newline.
438 Also, regexec will try a match beginning after every newline.
439
440 If REG_ICASE is set, then we considers upper- and lowercase
441 versions of letters to be equivalent when matching.
442
443 If REG_NOSUB is set, then when PREG is passed to regexec, that
444 routine will report only success or failure, and nothing about the
445 registers.
446
447 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
448 the return codes and their meanings.) */
449
450 int
451 regcomp (preg, pattern, cflags)
452 regex_t *__restrict preg;
453 const char *__restrict pattern;
454 int cflags;
455 {
456 reg_errcode_t ret;
457 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
458 : RE_SYNTAX_POSIX_BASIC);
459
460 preg->buffer = NULL;
461 preg->allocated = 0;
462 preg->used = 0;
463
464 /* Try to allocate space for the fastmap. */
465 preg->fastmap = re_malloc (char, SBC_MAX);
466 if (BE (preg->fastmap == NULL, 0))
467 return REG_ESPACE;
468
469 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
470
471 /* If REG_NEWLINE is set, newlines are treated differently. */
472 if (cflags & REG_NEWLINE)
473 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
474 syntax &= ~RE_DOT_NEWLINE;
475 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
476 /* It also changes the matching behavior. */
477 preg->newline_anchor = 1;
478 }
479 else
480 preg->newline_anchor = 0;
481 preg->no_sub = !!(cflags & REG_NOSUB);
482 preg->translate = NULL;
483
484 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
485
486 /* POSIX doesn't distinguish between an unmatched open-group and an
487 unmatched close-group: both are REG_EPAREN. */
488 if (ret == REG_ERPAREN)
489 ret = REG_EPAREN;
490
491 /* We have already checked preg->fastmap != NULL. */
492 if (BE (ret == REG_NOERROR, 1))
493 /* Compute the fastmap now, since regexec cannot modify the pattern
494 buffer. This function nevers fails in this implementation. */
495 (void) re_compile_fastmap (preg);
496 else
497 {
498 /* Some error occurred while compiling the expression. */
499 re_free (preg->fastmap);
500 preg->fastmap = NULL;
501 }
502
503 return (int) ret;
504 }
505 #ifdef _LIBC
506 weak_alias (__regcomp, regcomp)
507 #endif
508
509 /* Returns a message corresponding to an error code, ERRCODE, returned
510 from either regcomp or regexec. We don't use PREG here. */
511
512 size_t
513 regerror (errcode, preg, errbuf, errbuf_size)
514 int errcode;
515 const regex_t *preg;
516 char *errbuf;
517 size_t errbuf_size;
518 {
519 const char *msg;
520 size_t msg_size;
521
522 if (BE (errcode < 0
523 || errcode >= (int) (sizeof (__re_error_msgid_idx)
524 / sizeof (__re_error_msgid_idx[0])), 0))
525 /* Only error codes returned by the rest of the code should be passed
526 to this routine. If we are given anything else, or if other regex
527 code generates an invalid error code, then the program has a bug.
528 Dump core so we can fix it. */
529 abort ();
530
531 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
532
533 msg_size = strlen (msg) + 1; /* Includes the null. */
534
535 if (BE (errbuf_size != 0, 1))
536 {
537 if (BE (msg_size > errbuf_size, 0))
538 {
539 #if defined HAVE_MEMPCPY || defined _LIBC
540 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
541 #else
542 memcpy (errbuf, msg, errbuf_size - 1);
543 errbuf[errbuf_size - 1] = 0;
544 #endif
545 }
546 else
547 memcpy (errbuf, msg, msg_size);
548 }
549
550 return msg_size;
551 }
552 #ifdef _LIBC
553 weak_alias (__regerror, regerror)
554 #endif
555
556
557 static void
558 free_dfa_content (re_dfa_t *dfa)
559 {
560 int i, j;
561
562 re_free (dfa->subexps);
563
564 for (i = 0; i < dfa->nodes_len; ++i)
565 {
566 re_token_t *node = dfa->nodes + i;
567 #ifdef RE_ENABLE_I18N
568 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
569 free_charset (node->opr.mbcset);
570 else
571 #endif /* RE_ENABLE_I18N */
572 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
573 re_free (node->opr.sbcset);
574 }
575 re_free (dfa->nexts);
576 for (i = 0; i < dfa->nodes_len; ++i)
577 {
578 if (dfa->eclosures != NULL)
579 re_node_set_free (dfa->eclosures + i);
580 if (dfa->inveclosures != NULL)
581 re_node_set_free (dfa->inveclosures + i);
582 if (dfa->edests != NULL)
583 re_node_set_free (dfa->edests + i);
584 }
585 re_free (dfa->edests);
586 re_free (dfa->eclosures);
587 re_free (dfa->inveclosures);
588 re_free (dfa->nodes);
589
590 for (i = 0; i <= dfa->state_hash_mask; ++i)
591 {
592 struct re_state_table_entry *entry = dfa->state_table + i;
593 for (j = 0; j < entry->num; ++j)
594 {
595 re_dfastate_t *state = entry->array[j];
596 free_state (state);
597 }
598 re_free (entry->array);
599 }
600 re_free (dfa->state_table);
601
602 if (dfa->word_char != NULL)
603 re_free (dfa->word_char);
604 #ifdef DEBUG
605 re_free (dfa->re_str);
606 #endif
607
608 re_free (dfa);
609 }
610
611
612 /* Free dynamically allocated space used by PREG. */
613
614 void
615 regfree (preg)
616 regex_t *preg;
617 {
618 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
619 if (BE (dfa != NULL, 1))
620 free_dfa_content (dfa);
621
622 re_free (preg->fastmap);
623 }
624 #ifdef _LIBC
625 weak_alias (__regfree, regfree)
626 #endif
627 \f
628 /* Entry points compatible with 4.2 BSD regex library. We don't define
629 them unless specifically requested. */
630
631 #if defined _REGEX_RE_COMP || defined _LIBC
632
633 /* BSD has one and only one pattern buffer. */
634 static struct re_pattern_buffer re_comp_buf;
635
636 char *
637 # ifdef _LIBC
638 /* Make these definitions weak in libc, so POSIX programs can redefine
639 these names if they don't use our functions, and still use
640 regcomp/regexec above without link errors. */
641 weak_function
642 # endif
643 re_comp (s)
644 const char *s;
645 {
646 reg_errcode_t ret;
647 char *fastmap;
648
649 if (!s)
650 {
651 if (!re_comp_buf.buffer)
652 return gettext ("No previous regular expression");
653 return 0;
654 }
655
656 if (re_comp_buf.buffer)
657 {
658 fastmap = re_comp_buf.fastmap;
659 re_comp_buf.fastmap = NULL;
660 __regfree (&re_comp_buf);
661 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
662 re_comp_buf.fastmap = fastmap;
663 }
664
665 if (re_comp_buf.fastmap == NULL)
666 {
667 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
668 if (re_comp_buf.fastmap == NULL)
669 return (char *) gettext (__re_error_msgid
670 + __re_error_msgid_idx[(int) REG_ESPACE]);
671 }
672
673 /* Since `re_exec' always passes NULL for the `regs' argument, we
674 don't need to initialize the pattern buffer fields which affect it. */
675
676 /* Match anchors at newlines. */
677 re_comp_buf.newline_anchor = 1;
678
679 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
680
681 if (!ret)
682 return NULL;
683
684 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
685 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
686 }
687
688 #ifdef _LIBC
689 libc_freeres_fn (free_mem)
690 {
691 __regfree (&re_comp_buf);
692 }
693 #endif
694
695 #endif /* _REGEX_RE_COMP */
696 \f
697 /* Internal entry point.
698 Compile the regular expression PATTERN, whose length is LENGTH.
699 SYNTAX indicate regular expression's syntax. */
700
701 static reg_errcode_t
702 re_compile_internal (preg, pattern, length, syntax)
703 regex_t *preg;
704 const char * pattern;
705 int length;
706 reg_syntax_t syntax;
707 {
708 reg_errcode_t err = REG_NOERROR;
709 re_dfa_t *dfa;
710 re_string_t regexp;
711
712 /* Initialize the pattern buffer. */
713 preg->fastmap_accurate = 0;
714 preg->syntax = syntax;
715 preg->not_bol = preg->not_eol = 0;
716 preg->used = 0;
717 preg->re_nsub = 0;
718 preg->can_be_null = 0;
719 preg->regs_allocated = REGS_UNALLOCATED;
720
721 /* Initialize the dfa. */
722 dfa = (re_dfa_t *) preg->buffer;
723 if (preg->allocated < sizeof (re_dfa_t))
724 {
725 /* If zero allocated, but buffer is non-null, try to realloc
726 enough space. This loses if buffer's address is bogus, but
727 that is the user's responsibility. If ->buffer is NULL this
728 is a simple allocation. */
729 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
730 if (dfa == NULL)
731 return REG_ESPACE;
732 preg->allocated = sizeof (re_dfa_t);
733 }
734 preg->buffer = (unsigned char *) dfa;
735 preg->used = sizeof (re_dfa_t);
736
737 err = init_dfa (dfa, length);
738 if (BE (err != REG_NOERROR, 0))
739 {
740 re_free (dfa);
741 preg->buffer = NULL;
742 preg->allocated = 0;
743 return err;
744 }
745 #ifdef DEBUG
746 dfa->re_str = re_malloc (char, length + 1);
747 strncpy (dfa->re_str, pattern, length + 1);
748 #endif
749
750 err = re_string_construct (&regexp, pattern, length, preg->translate,
751 syntax & RE_ICASE, dfa->mb_cur_max,
752 dfa->is_utf8);
753 if (BE (err != REG_NOERROR, 0))
754 {
755 re_free (dfa);
756 preg->buffer = NULL;
757 preg->allocated = 0;
758 return err;
759 }
760
761 /* Parse the regular expression, and build a structure tree. */
762 preg->re_nsub = 0;
763 dfa->str_tree = parse (&regexp, preg, syntax, &err);
764 if (BE (dfa->str_tree == NULL, 0))
765 goto re_compile_internal_free_return;
766
767 #ifdef RE_ENABLE_I18N
768 /* If possible, do searching in single byte encoding to speed things up. */
769 if (dfa->is_utf8 && !(syntax & RE_ICASE))
770 optimize_utf8 (dfa);
771 #endif
772
773 /* Analyze the tree and collect information which is necessary to
774 create the dfa. */
775 err = analyze (dfa);
776 if (BE (err != REG_NOERROR, 0))
777 goto re_compile_internal_free_return;
778
779 /* Then create the initial state of the dfa. */
780 err = create_initial_state (dfa);
781
782 /* Release work areas. */
783 free_workarea_compile (preg);
784 re_string_destruct (&regexp);
785
786 if (BE (err != REG_NOERROR, 0))
787 {
788 re_compile_internal_free_return:
789 free_dfa_content (dfa);
790 preg->buffer = NULL;
791 preg->allocated = 0;
792 }
793
794 return err;
795 }
796
797 /* Initialize DFA. We use the length of the regular expression PAT_LEN
798 as the initial length of some arrays. */
799
800 static reg_errcode_t
801 init_dfa (dfa, pat_len)
802 re_dfa_t *dfa;
803 int pat_len;
804 {
805 int table_size;
806
807 memset (dfa, '\0', sizeof (re_dfa_t));
808
809 dfa->nodes_alloc = pat_len + 1;
810 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
811
812 dfa->states_alloc = pat_len + 1;
813
814 /* table_size = 2 ^ ceil(log pat_len) */
815 for (table_size = 1; table_size > 0; table_size <<= 1)
816 if (table_size > pat_len)
817 break;
818
819 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
820 dfa->state_hash_mask = table_size - 1;
821
822 dfa->subexps_alloc = 1;
823 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
824 dfa->word_char = NULL;
825
826 dfa->mb_cur_max = MB_CUR_MAX;
827 #ifdef _LIBC
828 if (dfa->mb_cur_max > 1
829 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
830 dfa->is_utf8 = 1;
831 #endif
832
833 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
834 || dfa->subexps == NULL, 0))
835 {
836 /* We don't bother to free anything which was allocated. Very
837 soon the process will go down anyway. */
838 dfa->subexps = NULL;
839 dfa->state_table = NULL;
840 dfa->nodes = NULL;
841 return REG_ESPACE;
842 }
843 return REG_NOERROR;
844 }
845
846 /* Initialize WORD_CHAR table, which indicate which character is
847 "word". In this case "word" means that it is the word construction
848 character used by some operators like "\<", "\>", etc. */
849
850 static reg_errcode_t
851 init_word_char (dfa)
852 re_dfa_t *dfa;
853 {
854 int i, j, ch;
855 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
856 if (BE (dfa->word_char == NULL, 0))
857 return REG_ESPACE;
858 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
859 for (j = 0; j < UINT_BITS; ++j, ++ch)
860 if (isalnum (ch) || ch == '_')
861 dfa->word_char[i] |= 1 << j;
862 return REG_NOERROR;
863 }
864
865 /* Free the work area which are only used while compiling. */
866
867 static void
868 free_workarea_compile (preg)
869 regex_t *preg;
870 {
871 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
872 free_bin_tree (dfa->str_tree);
873 dfa->str_tree = NULL;
874 re_free (dfa->org_indices);
875 dfa->org_indices = NULL;
876 }
877
878 /* Create initial states for all contexts. */
879
880 static reg_errcode_t
881 create_initial_state (dfa)
882 re_dfa_t *dfa;
883 {
884 int first, i;
885 reg_errcode_t err;
886 re_node_set init_nodes;
887
888 /* Initial states have the epsilon closure of the node which is
889 the first node of the regular expression. */
890 first = dfa->str_tree->first;
891 dfa->init_node = first;
892 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
893 if (BE (err != REG_NOERROR, 0))
894 return err;
895
896 /* The back-references which are in initial states can epsilon transit,
897 since in this case all of the subexpressions can be null.
898 Then we add epsilon closures of the nodes which are the next nodes of
899 the back-references. */
900 if (dfa->nbackref > 0)
901 for (i = 0; i < init_nodes.nelem; ++i)
902 {
903 int node_idx = init_nodes.elems[i];
904 re_token_type_t type = dfa->nodes[node_idx].type;
905
906 int clexp_idx;
907 if (type != OP_BACK_REF)
908 continue;
909 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
910 {
911 re_token_t *clexp_node;
912 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
913 if (clexp_node->type == OP_CLOSE_SUBEXP
914 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
915 break;
916 }
917 if (clexp_idx == init_nodes.nelem)
918 continue;
919
920 if (type == OP_BACK_REF)
921 {
922 int dest_idx = dfa->edests[node_idx].elems[0];
923 if (!re_node_set_contains (&init_nodes, dest_idx))
924 {
925 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
926 i = 0;
927 }
928 }
929 }
930
931 /* It must be the first time to invoke acquire_state. */
932 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
933 /* We don't check ERR here, since the initial state must not be NULL. */
934 if (BE (dfa->init_state == NULL, 0))
935 return err;
936 if (dfa->init_state->has_constraint)
937 {
938 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
939 CONTEXT_WORD);
940 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
941 CONTEXT_NEWLINE);
942 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
943 &init_nodes,
944 CONTEXT_NEWLINE
945 | CONTEXT_BEGBUF);
946 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
947 || dfa->init_state_begbuf == NULL, 0))
948 return err;
949 }
950 else
951 dfa->init_state_word = dfa->init_state_nl
952 = dfa->init_state_begbuf = dfa->init_state;
953
954 re_node_set_free (&init_nodes);
955 return REG_NOERROR;
956 }
957 \f
958 #ifdef RE_ENABLE_I18N
959 /* If it is possible to do searching in single byte encoding instead of UTF-8
960 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
961 DFA nodes where needed. */
962
963 static void
964 optimize_utf8 (dfa)
965 re_dfa_t *dfa;
966 {
967 int node;
968
969 for (node = 0; node < dfa->nodes_len; ++node)
970 switch (dfa->nodes[node].type)
971 {
972 case CHARACTER:
973 /* Chars >= 0x80 are optimizable in some cases (e.g. when not
974 followed by DUP operator, not in bracket etc.).
975 For now punt on them all. */
976 if (dfa->nodes[node].opr.c >= 0x80)
977 return;
978 break;
979 case ANCHOR:
980 switch (dfa->nodes[node].opr.idx)
981 {
982 case LINE_FIRST:
983 case LINE_LAST:
984 case BUF_FIRST:
985 case BUF_LAST:
986 break;
987 default:
988 /* Word anchors etc. cannot be handled. */
989 return;
990 }
991 break;
992 case OP_BACK_REF:
993 case OP_ALT:
994 case END_OF_RE:
995 case BACK_SLASH:
996 case OP_DUP_ASTERISK:
997 case OP_DUP_QUESTION:
998 case OP_DUP_PLUS:
999 case OP_OPEN_SUBEXP:
1000 case OP_CLOSE_SUBEXP:
1001 break;
1002 default:
1003 return;
1004 }
1005
1006 /* The search can be in single byte locale. */
1007 dfa->mb_cur_max = 1;
1008 dfa->is_utf8 = 0;
1009 dfa->has_mb_node = dfa->nbackref > 0;
1010 }
1011 #endif
1012 \f
1013 /* Analyze the structure tree, and calculate "first", "next", "edest",
1014 "eclosure", and "inveclosure". */
1015
1016 static reg_errcode_t
1017 analyze (dfa)
1018 re_dfa_t *dfa;
1019 {
1020 int i;
1021 reg_errcode_t ret;
1022
1023 /* Allocate arrays. */
1024 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1025 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1026 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1027 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1028 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1029 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1030 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1031 return REG_ESPACE;
1032 /* Initialize them. */
1033 for (i = 0; i < dfa->nodes_len; ++i)
1034 {
1035 dfa->nexts[i] = -1;
1036 re_node_set_init_empty (dfa->edests + i);
1037 re_node_set_init_empty (dfa->eclosures + i);
1038 re_node_set_init_empty (dfa->inveclosures + i);
1039 }
1040
1041 ret = analyze_tree (dfa, dfa->str_tree);
1042 if (BE (ret == REG_NOERROR, 1))
1043 {
1044 ret = calc_eclosure (dfa);
1045 if (ret == REG_NOERROR)
1046 calc_inveclosure (dfa);
1047 }
1048 return ret;
1049 }
1050
1051 /* Helper functions for analyze.
1052 This function calculate "first", "next", and "edest" for the subtree
1053 whose root is NODE. */
1054
1055 static reg_errcode_t
1056 analyze_tree (dfa, node)
1057 re_dfa_t *dfa;
1058 bin_tree_t *node;
1059 {
1060 reg_errcode_t ret;
1061 if (node->first == -1)
1062 calc_first (dfa, node);
1063 if (node->next == -1)
1064 calc_next (dfa, node);
1065 if (node->eclosure.nelem == 0)
1066 calc_epsdest (dfa, node);
1067 /* Calculate "first" etc. for the left child. */
1068 if (node->left != NULL)
1069 {
1070 ret = analyze_tree (dfa, node->left);
1071 if (BE (ret != REG_NOERROR, 0))
1072 return ret;
1073 }
1074 /* Calculate "first" etc. for the right child. */
1075 if (node->right != NULL)
1076 {
1077 ret = analyze_tree (dfa, node->right);
1078 if (BE (ret != REG_NOERROR, 0))
1079 return ret;
1080 }
1081 return REG_NOERROR;
1082 }
1083
1084 /* Calculate "first" for the node NODE. */
1085 static void
1086 calc_first (dfa, node)
1087 re_dfa_t *dfa;
1088 bin_tree_t *node;
1089 {
1090 int idx, type;
1091 idx = node->node_idx;
1092 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1093
1094 switch (type)
1095 {
1096 #ifdef DEBUG
1097 case OP_OPEN_BRACKET:
1098 case OP_CLOSE_BRACKET:
1099 case OP_OPEN_DUP_NUM:
1100 case OP_CLOSE_DUP_NUM:
1101 case OP_NON_MATCH_LIST:
1102 case OP_OPEN_COLL_ELEM:
1103 case OP_CLOSE_COLL_ELEM:
1104 case OP_OPEN_EQUIV_CLASS:
1105 case OP_CLOSE_EQUIV_CLASS:
1106 case OP_OPEN_CHAR_CLASS:
1107 case OP_CLOSE_CHAR_CLASS:
1108 /* These must not be appeared here. */
1109 assert (0);
1110 #endif
1111 case END_OF_RE:
1112 case CHARACTER:
1113 case OP_PERIOD:
1114 case OP_DUP_ASTERISK:
1115 case OP_DUP_QUESTION:
1116 #ifdef RE_ENABLE_I18N
1117 case COMPLEX_BRACKET:
1118 #endif /* RE_ENABLE_I18N */
1119 case SIMPLE_BRACKET:
1120 case OP_BACK_REF:
1121 case ANCHOR:
1122 case OP_OPEN_SUBEXP:
1123 case OP_CLOSE_SUBEXP:
1124 node->first = idx;
1125 break;
1126 case OP_DUP_PLUS:
1127 #ifdef DEBUG
1128 assert (node->left != NULL);
1129 #endif
1130 if (node->left->first == -1)
1131 calc_first (dfa, node->left);
1132 node->first = node->left->first;
1133 break;
1134 case OP_ALT:
1135 node->first = idx;
1136 break;
1137 /* else fall through */
1138 default:
1139 #ifdef DEBUG
1140 assert (node->left != NULL);
1141 #endif
1142 if (node->left->first == -1)
1143 calc_first (dfa, node->left);
1144 node->first = node->left->first;
1145 break;
1146 }
1147 }
1148
1149 /* Calculate "next" for the node NODE. */
1150
1151 static void
1152 calc_next (dfa, node)
1153 re_dfa_t *dfa;
1154 bin_tree_t *node;
1155 {
1156 int idx, type;
1157 bin_tree_t *parent = node->parent;
1158 if (parent == NULL)
1159 {
1160 node->next = -1;
1161 idx = node->node_idx;
1162 if (node->type == 0)
1163 dfa->nexts[idx] = node->next;
1164 return;
1165 }
1166
1167 idx = parent->node_idx;
1168 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1169
1170 switch (type)
1171 {
1172 case OP_DUP_ASTERISK:
1173 case OP_DUP_PLUS:
1174 node->next = idx;
1175 break;
1176 case CONCAT:
1177 if (parent->left == node)
1178 {
1179 if (parent->right->first == -1)
1180 calc_first (dfa, parent->right);
1181 node->next = parent->right->first;
1182 break;
1183 }
1184 /* else fall through */
1185 default:
1186 if (parent->next == -1)
1187 calc_next (dfa, parent);
1188 node->next = parent->next;
1189 break;
1190 }
1191 idx = node->node_idx;
1192 if (node->type == 0)
1193 dfa->nexts[idx] = node->next;
1194 }
1195
1196 /* Calculate "edest" for the node NODE. */
1197
1198 static void
1199 calc_epsdest (dfa, node)
1200 re_dfa_t *dfa;
1201 bin_tree_t *node;
1202 {
1203 int idx;
1204 idx = node->node_idx;
1205 if (node->type == 0)
1206 {
1207 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1208 || dfa->nodes[idx].type == OP_DUP_PLUS
1209 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1210 {
1211 if (node->left->first == -1)
1212 calc_first (dfa, node->left);
1213 if (node->next == -1)
1214 calc_next (dfa, node);
1215 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1216 node->next);
1217 }
1218 else if (dfa->nodes[idx].type == OP_ALT)
1219 {
1220 int left, right;
1221 if (node->left != NULL)
1222 {
1223 if (node->left->first == -1)
1224 calc_first (dfa, node->left);
1225 left = node->left->first;
1226 }
1227 else
1228 {
1229 if (node->next == -1)
1230 calc_next (dfa, node);
1231 left = node->next;
1232 }
1233 if (node->right != NULL)
1234 {
1235 if (node->right->first == -1)
1236 calc_first (dfa, node->right);
1237 right = node->right->first;
1238 }
1239 else
1240 {
1241 if (node->next == -1)
1242 calc_next (dfa, node);
1243 right = node->next;
1244 }
1245 re_node_set_init_2 (dfa->edests + idx, left, right);
1246 }
1247 else if (dfa->nodes[idx].type == ANCHOR
1248 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1249 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1250 || dfa->nodes[idx].type == OP_BACK_REF)
1251 re_node_set_init_1 (dfa->edests + idx, node->next);
1252 else
1253 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1254 }
1255 }
1256
1257 /* Duplicate the epsilon closure of the node ROOT_NODE.
1258 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1259 to their own constraint. */
1260
1261 static reg_errcode_t
1262 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1263 init_constraint)
1264 re_dfa_t *dfa;
1265 int top_org_node, top_clone_node, root_node;
1266 unsigned int init_constraint;
1267 {
1268 reg_errcode_t err;
1269 int org_node, clone_node, ret;
1270 unsigned int constraint = init_constraint;
1271 for (org_node = top_org_node, clone_node = top_clone_node;;)
1272 {
1273 int org_dest, clone_dest;
1274 if (dfa->nodes[org_node].type == OP_BACK_REF)
1275 {
1276 /* If the back reference epsilon-transit, its destination must
1277 also have the constraint. Then duplicate the epsilon closure
1278 of the destination of the back reference, and store it in
1279 edests of the back reference. */
1280 org_dest = dfa->nexts[org_node];
1281 re_node_set_empty (dfa->edests + clone_node);
1282 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1283 if (BE (err != REG_NOERROR, 0))
1284 return err;
1285 dfa->nexts[clone_node] = dfa->nexts[org_node];
1286 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1287 if (BE (ret < 0, 0))
1288 return REG_ESPACE;
1289 }
1290 else if (dfa->edests[org_node].nelem == 0)
1291 {
1292 /* In case of the node can't epsilon-transit, don't duplicate the
1293 destination and store the original destination as the
1294 destination of the node. */
1295 dfa->nexts[clone_node] = dfa->nexts[org_node];
1296 break;
1297 }
1298 else if (dfa->edests[org_node].nelem == 1)
1299 {
1300 /* In case of the node can epsilon-transit, and it has only one
1301 destination. */
1302 org_dest = dfa->edests[org_node].elems[0];
1303 re_node_set_empty (dfa->edests + clone_node);
1304 if (dfa->nodes[org_node].type == ANCHOR)
1305 {
1306 /* In case of the node has another constraint, append it. */
1307 if (org_node == root_node && clone_node != org_node)
1308 {
1309 /* ...but if the node is root_node itself, it means the
1310 epsilon closure have a loop, then tie it to the
1311 destination of the root_node. */
1312 ret = re_node_set_insert (dfa->edests + clone_node,
1313 org_dest);
1314 if (BE (ret < 0, 0))
1315 return REG_ESPACE;
1316 break;
1317 }
1318 constraint |= dfa->nodes[org_node].opr.ctx_type;
1319 }
1320 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1321 if (BE (err != REG_NOERROR, 0))
1322 return err;
1323 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1324 if (BE (ret < 0, 0))
1325 return REG_ESPACE;
1326 }
1327 else /* dfa->edests[org_node].nelem == 2 */
1328 {
1329 /* In case of the node can epsilon-transit, and it has two
1330 destinations. E.g. '|', '*', '+', '?'. */
1331 org_dest = dfa->edests[org_node].elems[0];
1332 re_node_set_empty (dfa->edests + clone_node);
1333 /* Search for a duplicated node which satisfies the constraint. */
1334 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1335 if (clone_dest == -1)
1336 {
1337 /* There are no such a duplicated node, create a new one. */
1338 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1339 if (BE (err != REG_NOERROR, 0))
1340 return err;
1341 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1342 if (BE (ret < 0, 0))
1343 return REG_ESPACE;
1344 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1345 root_node, constraint);
1346 if (BE (err != REG_NOERROR, 0))
1347 return err;
1348 }
1349 else
1350 {
1351 /* There are a duplicated node which satisfy the constraint,
1352 use it to avoid infinite loop. */
1353 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1354 if (BE (ret < 0, 0))
1355 return REG_ESPACE;
1356 }
1357
1358 org_dest = dfa->edests[org_node].elems[1];
1359 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1360 if (BE (err != REG_NOERROR, 0))
1361 return err;
1362 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1363 if (BE (ret < 0, 0))
1364 return REG_ESPACE;
1365 }
1366 org_node = org_dest;
1367 clone_node = clone_dest;
1368 }
1369 return REG_NOERROR;
1370 }
1371
1372 /* Search for a node which is duplicated from the node ORG_NODE, and
1373 satisfies the constraint CONSTRAINT. */
1374
1375 static int
1376 search_duplicated_node (dfa, org_node, constraint)
1377 re_dfa_t *dfa;
1378 int org_node;
1379 unsigned int constraint;
1380 {
1381 int idx;
1382 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1383 {
1384 if (org_node == dfa->org_indices[idx]
1385 && constraint == dfa->nodes[idx].constraint)
1386 return idx; /* Found. */
1387 }
1388 return -1; /* Not found. */
1389 }
1390
1391 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1392 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1393 otherwise return the error code. */
1394
1395 static reg_errcode_t
1396 duplicate_node (new_idx, dfa, org_idx, constraint)
1397 re_dfa_t *dfa;
1398 int *new_idx, org_idx;
1399 unsigned int constraint;
1400 {
1401 re_token_t dup;
1402 int dup_idx;
1403
1404 dup = dfa->nodes[org_idx];
1405 dup_idx = re_dfa_add_node (dfa, dup, 1);
1406 if (BE (dup_idx == -1, 0))
1407 return REG_ESPACE;
1408 dfa->nodes[dup_idx].constraint = constraint;
1409 if (dfa->nodes[org_idx].type == ANCHOR)
1410 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1411 dfa->nodes[dup_idx].duplicated = 1;
1412 re_node_set_init_empty (dfa->edests + dup_idx);
1413 re_node_set_init_empty (dfa->eclosures + dup_idx);
1414 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1415
1416 /* Store the index of the original node. */
1417 dfa->org_indices[dup_idx] = org_idx;
1418 *new_idx = dup_idx;
1419 return REG_NOERROR;
1420 }
1421
1422 static void
1423 calc_inveclosure (dfa)
1424 re_dfa_t *dfa;
1425 {
1426 int src, idx, dest;
1427 for (src = 0; src < dfa->nodes_len; ++src)
1428 {
1429 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1430 {
1431 dest = dfa->eclosures[src].elems[idx];
1432 re_node_set_insert (dfa->inveclosures + dest, src);
1433 }
1434 }
1435 }
1436
1437 /* Calculate "eclosure" for all the node in DFA. */
1438
1439 static reg_errcode_t
1440 calc_eclosure (dfa)
1441 re_dfa_t *dfa;
1442 {
1443 int node_idx, incomplete;
1444 #ifdef DEBUG
1445 assert (dfa->nodes_len > 0);
1446 #endif
1447 incomplete = 0;
1448 /* For each nodes, calculate epsilon closure. */
1449 for (node_idx = 0; ; ++node_idx)
1450 {
1451 reg_errcode_t err;
1452 re_node_set eclosure_elem;
1453 if (node_idx == dfa->nodes_len)
1454 {
1455 if (!incomplete)
1456 break;
1457 incomplete = 0;
1458 node_idx = 0;
1459 }
1460
1461 #ifdef DEBUG
1462 assert (dfa->eclosures[node_idx].nelem != -1);
1463 #endif
1464 /* If we have already calculated, skip it. */
1465 if (dfa->eclosures[node_idx].nelem != 0)
1466 continue;
1467 /* Calculate epsilon closure of `node_idx'. */
1468 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1469 if (BE (err != REG_NOERROR, 0))
1470 return err;
1471
1472 if (dfa->eclosures[node_idx].nelem == 0)
1473 {
1474 incomplete = 1;
1475 re_node_set_free (&eclosure_elem);
1476 }
1477 }
1478 return REG_NOERROR;
1479 }
1480
1481 /* Calculate epsilon closure of NODE. */
1482
1483 static reg_errcode_t
1484 calc_eclosure_iter (new_set, dfa, node, root)
1485 re_node_set *new_set;
1486 re_dfa_t *dfa;
1487 int node, root;
1488 {
1489 reg_errcode_t err;
1490 unsigned int constraint;
1491 int i, incomplete;
1492 re_node_set eclosure;
1493 incomplete = 0;
1494 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1495 if (BE (err != REG_NOERROR, 0))
1496 return err;
1497
1498 /* This indicates that we are calculating this node now.
1499 We reference this value to avoid infinite loop. */
1500 dfa->eclosures[node].nelem = -1;
1501
1502 constraint = ((dfa->nodes[node].type == ANCHOR)
1503 ? dfa->nodes[node].opr.ctx_type : 0);
1504 /* If the current node has constraints, duplicate all nodes.
1505 Since they must inherit the constraints. */
1506 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1507 {
1508 int org_node, cur_node;
1509 org_node = cur_node = node;
1510 err = duplicate_node_closure (dfa, node, node, node, constraint);
1511 if (BE (err != REG_NOERROR, 0))
1512 return err;
1513 }
1514
1515 /* Expand each epsilon destination nodes. */
1516 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1517 for (i = 0; i < dfa->edests[node].nelem; ++i)
1518 {
1519 re_node_set eclosure_elem;
1520 int edest = dfa->edests[node].elems[i];
1521 /* If calculating the epsilon closure of `edest' is in progress,
1522 return intermediate result. */
1523 if (dfa->eclosures[edest].nelem == -1)
1524 {
1525 incomplete = 1;
1526 continue;
1527 }
1528 /* If we haven't calculated the epsilon closure of `edest' yet,
1529 calculate now. Otherwise use calculated epsilon closure. */
1530 if (dfa->eclosures[edest].nelem == 0)
1531 {
1532 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1533 if (BE (err != REG_NOERROR, 0))
1534 return err;
1535 }
1536 else
1537 eclosure_elem = dfa->eclosures[edest];
1538 /* Merge the epsilon closure of `edest'. */
1539 re_node_set_merge (&eclosure, &eclosure_elem);
1540 /* If the epsilon closure of `edest' is incomplete,
1541 the epsilon closure of this node is also incomplete. */
1542 if (dfa->eclosures[edest].nelem == 0)
1543 {
1544 incomplete = 1;
1545 re_node_set_free (&eclosure_elem);
1546 }
1547 }
1548
1549 /* Epsilon closures include itself. */
1550 re_node_set_insert (&eclosure, node);
1551 if (incomplete && !root)
1552 dfa->eclosures[node].nelem = 0;
1553 else
1554 dfa->eclosures[node] = eclosure;
1555 *new_set = eclosure;
1556 return REG_NOERROR;
1557 }
1558 \f
1559 /* Functions for token which are used in the parser. */
1560
1561 /* Fetch a token from INPUT.
1562 We must not use this function inside bracket expressions. */
1563
1564 static re_token_t
1565 fetch_token (input, syntax)
1566 re_string_t *input;
1567 reg_syntax_t syntax;
1568 {
1569 re_token_t token;
1570 int consumed_byte;
1571 consumed_byte = peek_token (&token, input, syntax);
1572 re_string_skip_bytes (input, consumed_byte);
1573 return token;
1574 }
1575
1576 /* Peek a token from INPUT, and return the length of the token.
1577 We must not use this function inside bracket expressions. */
1578
1579 static int
1580 peek_token (token, input, syntax)
1581 re_token_t *token;
1582 re_string_t *input;
1583 reg_syntax_t syntax;
1584 {
1585 unsigned char c;
1586
1587 if (re_string_eoi (input))
1588 {
1589 token->type = END_OF_RE;
1590 return 0;
1591 }
1592
1593 c = re_string_peek_byte (input, 0);
1594 token->opr.c = c;
1595
1596 #ifdef RE_ENABLE_I18N
1597 token->mb_partial = 0;
1598 if (input->mb_cur_max > 1 &&
1599 !re_string_first_byte (input, re_string_cur_idx (input)))
1600 {
1601 token->type = CHARACTER;
1602 token->mb_partial = 1;
1603 return 1;
1604 }
1605 #endif
1606 if (c == '\\')
1607 {
1608 unsigned char c2;
1609 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1610 {
1611 token->type = BACK_SLASH;
1612 return 1;
1613 }
1614
1615 c2 = re_string_peek_byte_case (input, 1);
1616 token->opr.c = c2;
1617 token->type = CHARACTER;
1618 switch (c2)
1619 {
1620 case '|':
1621 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1622 token->type = OP_ALT;
1623 break;
1624 case '1': case '2': case '3': case '4': case '5':
1625 case '6': case '7': case '8': case '9':
1626 if (!(syntax & RE_NO_BK_REFS))
1627 {
1628 token->type = OP_BACK_REF;
1629 token->opr.idx = c2 - '0';
1630 }
1631 break;
1632 case '<':
1633 if (!(syntax & RE_NO_GNU_OPS))
1634 {
1635 token->type = ANCHOR;
1636 token->opr.idx = WORD_FIRST;
1637 }
1638 break;
1639 case '>':
1640 if (!(syntax & RE_NO_GNU_OPS))
1641 {
1642 token->type = ANCHOR;
1643 token->opr.idx = WORD_LAST;
1644 }
1645 break;
1646 case 'b':
1647 if (!(syntax & RE_NO_GNU_OPS))
1648 {
1649 token->type = ANCHOR;
1650 token->opr.idx = WORD_DELIM;
1651 }
1652 break;
1653 case 'B':
1654 if (!(syntax & RE_NO_GNU_OPS))
1655 {
1656 token->type = ANCHOR;
1657 token->opr.idx = INSIDE_WORD;
1658 }
1659 break;
1660 case 'w':
1661 if (!(syntax & RE_NO_GNU_OPS))
1662 token->type = OP_WORD;
1663 break;
1664 case 'W':
1665 if (!(syntax & RE_NO_GNU_OPS))
1666 token->type = OP_NOTWORD;
1667 break;
1668 case 's':
1669 if (!(syntax & RE_NO_GNU_OPS))
1670 token->type = OP_SPACE;
1671 break;
1672 case 'S':
1673 if (!(syntax & RE_NO_GNU_OPS))
1674 token->type = OP_NOTSPACE;
1675 break;
1676 case '`':
1677 if (!(syntax & RE_NO_GNU_OPS))
1678 {
1679 token->type = ANCHOR;
1680 token->opr.idx = BUF_FIRST;
1681 }
1682 break;
1683 case '\'':
1684 if (!(syntax & RE_NO_GNU_OPS))
1685 {
1686 token->type = ANCHOR;
1687 token->opr.idx = BUF_LAST;
1688 }
1689 break;
1690 case '(':
1691 if (!(syntax & RE_NO_BK_PARENS))
1692 token->type = OP_OPEN_SUBEXP;
1693 break;
1694 case ')':
1695 if (!(syntax & RE_NO_BK_PARENS))
1696 token->type = OP_CLOSE_SUBEXP;
1697 break;
1698 case '+':
1699 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1700 token->type = OP_DUP_PLUS;
1701 break;
1702 case '?':
1703 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1704 token->type = OP_DUP_QUESTION;
1705 break;
1706 case '{':
1707 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1708 token->type = OP_OPEN_DUP_NUM;
1709 break;
1710 case '}':
1711 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1712 token->type = OP_CLOSE_DUP_NUM;
1713 break;
1714 default:
1715 break;
1716 }
1717 return 2;
1718 }
1719
1720 token->type = CHARACTER;
1721 switch (c)
1722 {
1723 case '\n':
1724 if (syntax & RE_NEWLINE_ALT)
1725 token->type = OP_ALT;
1726 break;
1727 case '|':
1728 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1729 token->type = OP_ALT;
1730 break;
1731 case '*':
1732 token->type = OP_DUP_ASTERISK;
1733 break;
1734 case '+':
1735 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1736 token->type = OP_DUP_PLUS;
1737 break;
1738 case '?':
1739 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1740 token->type = OP_DUP_QUESTION;
1741 break;
1742 case '{':
1743 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1744 token->type = OP_OPEN_DUP_NUM;
1745 break;
1746 case '}':
1747 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1748 token->type = OP_CLOSE_DUP_NUM;
1749 break;
1750 case '(':
1751 if (syntax & RE_NO_BK_PARENS)
1752 token->type = OP_OPEN_SUBEXP;
1753 break;
1754 case ')':
1755 if (syntax & RE_NO_BK_PARENS)
1756 token->type = OP_CLOSE_SUBEXP;
1757 break;
1758 case '[':
1759 token->type = OP_OPEN_BRACKET;
1760 break;
1761 case '.':
1762 token->type = OP_PERIOD;
1763 break;
1764 case '^':
1765 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1766 re_string_cur_idx (input) != 0)
1767 {
1768 char prev = re_string_peek_byte (input, -1);
1769 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1770 break;
1771 }
1772 token->type = ANCHOR;
1773 token->opr.idx = LINE_FIRST;
1774 break;
1775 case '$':
1776 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1777 re_string_cur_idx (input) + 1 != re_string_length (input))
1778 {
1779 re_token_t next;
1780 re_string_skip_bytes (input, 1);
1781 peek_token (&next, input, syntax);
1782 re_string_skip_bytes (input, -1);
1783 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1784 break;
1785 }
1786 token->type = ANCHOR;
1787 token->opr.idx = LINE_LAST;
1788 break;
1789 default:
1790 break;
1791 }
1792 return 1;
1793 }
1794
1795 /* Peek a token from INPUT, and return the length of the token.
1796 We must not use this function out of bracket expressions. */
1797
1798 static int
1799 peek_token_bracket (token, input, syntax)
1800 re_token_t *token;
1801 re_string_t *input;
1802 reg_syntax_t syntax;
1803 {
1804 unsigned char c;
1805 if (re_string_eoi (input))
1806 {
1807 token->type = END_OF_RE;
1808 return 0;
1809 }
1810 c = re_string_peek_byte (input, 0);
1811 token->opr.c = c;
1812
1813 #ifdef RE_ENABLE_I18N
1814 if (input->mb_cur_max > 1 &&
1815 !re_string_first_byte (input, re_string_cur_idx (input)))
1816 {
1817 token->type = CHARACTER;
1818 return 1;
1819 }
1820 #endif /* RE_ENABLE_I18N */
1821
1822 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1823 {
1824 /* In this case, '\' escape a character. */
1825 unsigned char c2;
1826 re_string_skip_bytes (input, 1);
1827 c2 = re_string_peek_byte (input, 0);
1828 token->opr.c = c2;
1829 token->type = CHARACTER;
1830 return 1;
1831 }
1832 if (c == '[') /* '[' is a special char in a bracket exps. */
1833 {
1834 unsigned char c2;
1835 int token_len;
1836 c2 = re_string_peek_byte (input, 1);
1837 token->opr.c = c2;
1838 token_len = 2;
1839 switch (c2)
1840 {
1841 case '.':
1842 token->type = OP_OPEN_COLL_ELEM;
1843 break;
1844 case '=':
1845 token->type = OP_OPEN_EQUIV_CLASS;
1846 break;
1847 case ':':
1848 if (syntax & RE_CHAR_CLASSES)
1849 {
1850 token->type = OP_OPEN_CHAR_CLASS;
1851 break;
1852 }
1853 /* else fall through. */
1854 default:
1855 token->type = CHARACTER;
1856 token->opr.c = c;
1857 token_len = 1;
1858 break;
1859 }
1860 return token_len;
1861 }
1862 switch (c)
1863 {
1864 case '-':
1865 token->type = OP_CHARSET_RANGE;
1866 break;
1867 case ']':
1868 token->type = OP_CLOSE_BRACKET;
1869 break;
1870 case '^':
1871 token->type = OP_NON_MATCH_LIST;
1872 break;
1873 default:
1874 token->type = CHARACTER;
1875 }
1876 return 1;
1877 }
1878 \f
1879 /* Functions for parser. */
1880
1881 /* Entry point of the parser.
1882 Parse the regular expression REGEXP and return the structure tree.
1883 If an error is occured, ERR is set by error code, and return NULL.
1884 This function build the following tree, from regular expression <reg_exp>:
1885 CAT
1886 / \
1887 / \
1888 <reg_exp> EOR
1889
1890 CAT means concatenation.
1891 EOR means end of regular expression. */
1892
1893 static bin_tree_t *
1894 parse (regexp, preg, syntax, err)
1895 re_string_t *regexp;
1896 regex_t *preg;
1897 reg_syntax_t syntax;
1898 reg_errcode_t *err;
1899 {
1900 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1901 bin_tree_t *tree, *eor, *root;
1902 re_token_t current_token;
1903 int new_idx;
1904 current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1905 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1906 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1907 return NULL;
1908 new_idx = re_dfa_add_node (dfa, current_token, 0);
1909 eor = create_tree (NULL, NULL, 0, new_idx);
1910 if (tree != NULL)
1911 root = create_tree (tree, eor, CONCAT, 0);
1912 else
1913 root = eor;
1914 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1915 {
1916 *err = REG_ESPACE;
1917 return NULL;
1918 }
1919 return root;
1920 }
1921
1922 /* This function build the following tree, from regular expression
1923 <branch1>|<branch2>:
1924 ALT
1925 / \
1926 / \
1927 <branch1> <branch2>
1928
1929 ALT means alternative, which represents the operator `|'. */
1930
1931 static bin_tree_t *
1932 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1933 re_string_t *regexp;
1934 regex_t *preg;
1935 re_token_t *token;
1936 reg_syntax_t syntax;
1937 int nest;
1938 reg_errcode_t *err;
1939 {
1940 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1941 bin_tree_t *tree, *branch = NULL;
1942 int new_idx;
1943 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1944 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1945 return NULL;
1946
1947 while (token->type == OP_ALT)
1948 {
1949 re_token_t alt_token = *token;
1950 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1951 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1952 if (token->type != OP_ALT && token->type != END_OF_RE
1953 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1954 {
1955 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1956 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1957 {
1958 free_bin_tree (tree);
1959 return NULL;
1960 }
1961 }
1962 else
1963 branch = NULL;
1964 tree = create_tree (tree, branch, 0, new_idx);
1965 if (BE (new_idx == -1 || tree == NULL, 0))
1966 {
1967 *err = REG_ESPACE;
1968 return NULL;
1969 }
1970 dfa->has_plural_match = 1;
1971 }
1972 return tree;
1973 }
1974
1975 /* This function build the following tree, from regular expression
1976 <exp1><exp2>:
1977 CAT
1978 / \
1979 / \
1980 <exp1> <exp2>
1981
1982 CAT means concatenation. */
1983
1984 static bin_tree_t *
1985 parse_branch (regexp, preg, token, syntax, nest, err)
1986 re_string_t *regexp;
1987 regex_t *preg;
1988 re_token_t *token;
1989 reg_syntax_t syntax;
1990 int nest;
1991 reg_errcode_t *err;
1992 {
1993 bin_tree_t *tree, *exp;
1994 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1995 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1996 return NULL;
1997
1998 while (token->type != OP_ALT && token->type != END_OF_RE
1999 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2000 {
2001 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2002 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2003 {
2004 free_bin_tree (tree);
2005 return NULL;
2006 }
2007 if (tree != NULL && exp != NULL)
2008 {
2009 tree = create_tree (tree, exp, CONCAT, 0);
2010 if (tree == NULL)
2011 {
2012 *err = REG_ESPACE;
2013 return NULL;
2014 }
2015 }
2016 else if (tree == NULL)
2017 tree = exp;
2018 /* Otherwise exp == NULL, we don't need to create new tree. */
2019 }
2020 return tree;
2021 }
2022
2023 /* This function build the following tree, from regular expression a*:
2024 *
2025 |
2026 a
2027 */
2028
2029 static bin_tree_t *
2030 parse_expression (regexp, preg, token, syntax, nest, err)
2031 re_string_t *regexp;
2032 regex_t *preg;
2033 re_token_t *token;
2034 reg_syntax_t syntax;
2035 int nest;
2036 reg_errcode_t *err;
2037 {
2038 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2039 bin_tree_t *tree;
2040 int new_idx;
2041 switch (token->type)
2042 {
2043 case CHARACTER:
2044 new_idx = re_dfa_add_node (dfa, *token, 0);
2045 tree = create_tree (NULL, NULL, 0, new_idx);
2046 if (BE (new_idx == -1 || tree == NULL, 0))
2047 {
2048 *err = REG_ESPACE;
2049 return NULL;
2050 }
2051 #ifdef RE_ENABLE_I18N
2052 if (dfa->mb_cur_max > 1)
2053 {
2054 while (!re_string_eoi (regexp)
2055 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2056 {
2057 bin_tree_t *mbc_remain;
2058 *token = fetch_token (regexp, syntax);
2059 new_idx = re_dfa_add_node (dfa, *token, 0);
2060 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
2061 tree = create_tree (tree, mbc_remain, CONCAT, 0);
2062 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
2063 {
2064 *err = REG_ESPACE;
2065 return NULL;
2066 }
2067 }
2068 }
2069 #endif
2070 break;
2071 case OP_OPEN_SUBEXP:
2072 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2073 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2074 return NULL;
2075 break;
2076 case OP_OPEN_BRACKET:
2077 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2078 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2079 return NULL;
2080 break;
2081 case OP_BACK_REF:
2082 if (BE (preg->re_nsub < token->opr.idx
2083 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2084 {
2085 *err = REG_ESUBREG;
2086 return NULL;
2087 }
2088 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2089 new_idx = re_dfa_add_node (dfa, *token, 0);
2090 tree = create_tree (NULL, NULL, 0, new_idx);
2091 if (BE (new_idx == -1 || tree == NULL, 0))
2092 {
2093 *err = REG_ESPACE;
2094 return NULL;
2095 }
2096 ++dfa->nbackref;
2097 dfa->has_mb_node = 1;
2098 break;
2099 case OP_OPEN_DUP_NUM:
2100 if (syntax & RE_CONTEXT_INVALID_DUP)
2101 {
2102 *err = REG_BADRPT;
2103 return NULL;
2104 }
2105 /* FALLTHROUGH */
2106 case OP_DUP_ASTERISK:
2107 case OP_DUP_PLUS:
2108 case OP_DUP_QUESTION:
2109 if (syntax & RE_CONTEXT_INVALID_OPS)
2110 {
2111 *err = REG_BADRPT;
2112 return NULL;
2113 }
2114 else if (syntax & RE_CONTEXT_INDEP_OPS)
2115 {
2116 *token = fetch_token (regexp, syntax);
2117 return parse_expression (regexp, preg, token, syntax, nest, err);
2118 }
2119 /* else fall through */
2120 case OP_CLOSE_SUBEXP:
2121 if ((token->type == OP_CLOSE_SUBEXP) &&
2122 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2123 {
2124 *err = REG_ERPAREN;
2125 return NULL;
2126 }
2127 /* else fall through */
2128 case OP_CLOSE_DUP_NUM:
2129 /* We treat it as a normal character. */
2130
2131 /* Then we can these characters as normal characters. */
2132 token->type = CHARACTER;
2133 new_idx = re_dfa_add_node (dfa, *token, 0);
2134 tree = create_tree (NULL, NULL, 0, new_idx);
2135 if (BE (new_idx == -1 || tree == NULL, 0))
2136 {
2137 *err = REG_ESPACE;
2138 return NULL;
2139 }
2140 break;
2141 case ANCHOR:
2142 if (dfa->word_char == NULL)
2143 {
2144 *err = init_word_char (dfa);
2145 if (BE (*err != REG_NOERROR, 0))
2146 return NULL;
2147 }
2148 if (token->opr.ctx_type == WORD_DELIM)
2149 {
2150 bin_tree_t *tree_first, *tree_last;
2151 int idx_first, idx_last;
2152 token->opr.ctx_type = WORD_FIRST;
2153 idx_first = re_dfa_add_node (dfa, *token, 0);
2154 tree_first = create_tree (NULL, NULL, 0, idx_first);
2155 token->opr.ctx_type = WORD_LAST;
2156 idx_last = re_dfa_add_node (dfa, *token, 0);
2157 tree_last = create_tree (NULL, NULL, 0, idx_last);
2158 token->type = OP_ALT;
2159 new_idx = re_dfa_add_node (dfa, *token, 0);
2160 tree = create_tree (tree_first, tree_last, 0, new_idx);
2161 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2162 || tree_first == NULL || tree_last == NULL
2163 || tree == NULL, 0))
2164 {
2165 *err = REG_ESPACE;
2166 return NULL;
2167 }
2168 }
2169 else
2170 {
2171 new_idx = re_dfa_add_node (dfa, *token, 0);
2172 tree = create_tree (NULL, NULL, 0, new_idx);
2173 if (BE (new_idx == -1 || tree == NULL, 0))
2174 {
2175 *err = REG_ESPACE;
2176 return NULL;
2177 }
2178 }
2179 /* We must return here, since ANCHORs can't be followed
2180 by repetition operators.
2181 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2182 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2183 *token = fetch_token (regexp, syntax);
2184 return tree;
2185 case OP_PERIOD:
2186 new_idx = re_dfa_add_node (dfa, *token, 0);
2187 tree = create_tree (NULL, NULL, 0, new_idx);
2188 if (BE (new_idx == -1 || tree == NULL, 0))
2189 {
2190 *err = REG_ESPACE;
2191 return NULL;
2192 }
2193 if (dfa->mb_cur_max > 1)
2194 dfa->has_mb_node = 1;
2195 break;
2196 case OP_WORD:
2197 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
2198 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2199 return NULL;
2200 break;
2201 case OP_NOTWORD:
2202 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2203 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2204 return NULL;
2205 break;
2206 case OP_SPACE:
2207 tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2208 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2209 return NULL;
2210 break;
2211 case OP_NOTSPACE:
2212 tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
2213 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2214 return NULL;
2215 break;
2216 case OP_ALT:
2217 case END_OF_RE:
2218 return NULL;
2219 case BACK_SLASH:
2220 *err = REG_EESCAPE;
2221 return NULL;
2222 default:
2223 /* Must not happen? */
2224 #ifdef DEBUG
2225 assert (0);
2226 #endif
2227 return NULL;
2228 }
2229 *token = fetch_token (regexp, syntax);
2230
2231 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2232 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2233 {
2234 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2235 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2236 return NULL;
2237 /* In BRE consecutive duplications are not allowed. */
2238 if ((syntax & RE_CONTEXT_INVALID_DUP)
2239 && (token->type == OP_DUP_ASTERISK
2240 || token->type == OP_OPEN_DUP_NUM))
2241 {
2242 *err = REG_BADRPT;
2243 return NULL;
2244 }
2245 dfa->has_plural_match = 1;
2246 }
2247
2248 return tree;
2249 }
2250
2251 /* This function build the following tree, from regular expression
2252 (<reg_exp>):
2253 SUBEXP
2254 |
2255 <reg_exp>
2256 */
2257
2258 static bin_tree_t *
2259 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2260 re_string_t *regexp;
2261 regex_t *preg;
2262 re_token_t *token;
2263 reg_syntax_t syntax;
2264 int nest;
2265 reg_errcode_t *err;
2266 {
2267 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2268 bin_tree_t *tree, *left_par, *right_par;
2269 size_t cur_nsub;
2270 int new_idx;
2271 cur_nsub = preg->re_nsub++;
2272 if (dfa->subexps_alloc < preg->re_nsub)
2273 {
2274 re_subexp_t *new_array;
2275 dfa->subexps_alloc *= 2;
2276 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2277 if (BE (new_array == NULL, 0))
2278 {
2279 dfa->subexps_alloc /= 2;
2280 *err = REG_ESPACE;
2281 return NULL;
2282 }
2283 dfa->subexps = new_array;
2284 }
2285 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2286 dfa->subexps[cur_nsub].end = -1;
2287
2288 new_idx = re_dfa_add_node (dfa, *token, 0);
2289 left_par = create_tree (NULL, NULL, 0, new_idx);
2290 if (BE (new_idx == -1 || left_par == NULL, 0))
2291 {
2292 *err = REG_ESPACE;
2293 return NULL;
2294 }
2295 dfa->nodes[new_idx].opr.idx = cur_nsub;
2296 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
2297
2298 /* The subexpression may be a null string. */
2299 if (token->type == OP_CLOSE_SUBEXP)
2300 tree = NULL;
2301 else
2302 {
2303 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2304 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2305 return NULL;
2306 }
2307 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2308 {
2309 free_bin_tree (tree);
2310 *err = REG_EPAREN;
2311 return NULL;
2312 }
2313 new_idx = re_dfa_add_node (dfa, *token, 0);
2314 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2315 right_par = create_tree (NULL, NULL, 0, new_idx);
2316 tree = ((tree == NULL) ? right_par
2317 : create_tree (tree, right_par, CONCAT, 0));
2318 tree = create_tree (left_par, tree, CONCAT, 0);
2319 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2320 {
2321 *err = REG_ESPACE;
2322 return NULL;
2323 }
2324 dfa->nodes[new_idx].opr.idx = cur_nsub;
2325
2326 return tree;
2327 }
2328
2329 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2330
2331 static bin_tree_t *
2332 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2333 bin_tree_t *dup_elem;
2334 re_string_t *regexp;
2335 re_dfa_t *dfa;
2336 re_token_t *token;
2337 reg_syntax_t syntax;
2338 reg_errcode_t *err;
2339 {
2340 re_token_t dup_token;
2341 bin_tree_t *tree = dup_elem, *work_tree;
2342 int new_idx, start_idx = re_string_cur_idx (regexp);
2343 re_token_t start_token = *token;
2344 if (token->type == OP_OPEN_DUP_NUM)
2345 {
2346 int i;
2347 int end = 0;
2348 int start = fetch_number (regexp, token, syntax);
2349 bin_tree_t *elem;
2350 if (start == -1)
2351 {
2352 if (token->type == CHARACTER && token->opr.c == ',')
2353 start = 0; /* We treat "{,m}" as "{0,m}". */
2354 else
2355 {
2356 *err = REG_BADBR; /* <re>{} is invalid. */
2357 return NULL;
2358 }
2359 }
2360 if (BE (start != -2, 1))
2361 {
2362 /* We treat "{n}" as "{n,n}". */
2363 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2364 : ((token->type == CHARACTER && token->opr.c == ',')
2365 ? fetch_number (regexp, token, syntax) : -2));
2366 }
2367 if (BE (start == -2 || end == -2, 0))
2368 {
2369 /* Invalid sequence. */
2370 if (token->type == OP_CLOSE_DUP_NUM)
2371 goto parse_dup_op_invalid_interval;
2372 else
2373 goto parse_dup_op_ebrace;
2374 }
2375 if (BE (start == 0 && end == 0, 0))
2376 {
2377 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2378 *token = fetch_token (regexp, syntax);
2379 free_bin_tree (dup_elem);
2380 return NULL;
2381 }
2382
2383 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2384 elem = tree;
2385 for (i = 0; i < start; ++i)
2386 if (i != 0)
2387 {
2388 work_tree = duplicate_tree (elem, dfa);
2389 tree = create_tree (tree, work_tree, CONCAT, 0);
2390 if (BE (work_tree == NULL || tree == NULL, 0))
2391 goto parse_dup_op_espace;
2392 }
2393
2394 if (end == -1)
2395 {
2396 /* We treat "<re>{0,}" as "<re>*". */
2397 dup_token.type = OP_DUP_ASTERISK;
2398 if (start > 0)
2399 {
2400 elem = duplicate_tree (elem, dfa);
2401 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2402 work_tree = create_tree (elem, NULL, 0, new_idx);
2403 tree = create_tree (tree, work_tree, CONCAT, 0);
2404 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2405 || tree == NULL, 0))
2406 goto parse_dup_op_espace;
2407 }
2408 else
2409 {
2410 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2411 tree = create_tree (elem, NULL, 0, new_idx);
2412 if (BE (new_idx == -1 || tree == NULL, 0))
2413 goto parse_dup_op_espace;
2414 }
2415 }
2416 else if (BE (start > end, 0))
2417 {
2418 /* First number greater than first. */
2419 *err = REG_BADBR;
2420 return NULL;
2421 }
2422 else if (end - start > 0)
2423 {
2424 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2425 dup_token.type = OP_DUP_QUESTION;
2426 if (start > 0)
2427 {
2428 elem = duplicate_tree (elem, dfa);
2429 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2430 elem = create_tree (elem, NULL, 0, new_idx);
2431 tree = create_tree (tree, elem, CONCAT, 0);
2432 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2433 goto parse_dup_op_espace;
2434 }
2435 else
2436 {
2437 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2438 tree = elem = create_tree (elem, NULL, 0, new_idx);
2439 if (BE (new_idx == -1 || tree == NULL, 0))
2440 goto parse_dup_op_espace;
2441 }
2442 for (i = 1; i < end - start; ++i)
2443 {
2444 work_tree = duplicate_tree (elem, dfa);
2445 tree = create_tree (tree, work_tree, CONCAT, 0);
2446 if (BE (work_tree == NULL || tree == NULL, 0))
2447 {
2448 *err = REG_ESPACE;
2449 return NULL;
2450 }
2451 }
2452 }
2453 }
2454 else
2455 {
2456 new_idx = re_dfa_add_node (dfa, *token, 0);
2457 tree = create_tree (tree, NULL, 0, new_idx);
2458 if (BE (new_idx == -1 || tree == NULL, 0))
2459 {
2460 *err = REG_ESPACE;
2461 return NULL;
2462 }
2463 }
2464 *token = fetch_token (regexp, syntax);
2465 return tree;
2466
2467 parse_dup_op_espace:
2468 free_bin_tree (tree);
2469 *err = REG_ESPACE;
2470 return NULL;
2471
2472 parse_dup_op_ebrace:
2473 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2474 {
2475 *err = REG_EBRACE;
2476 return NULL;
2477 }
2478 goto parse_dup_op_rollback;
2479 parse_dup_op_invalid_interval:
2480 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481 {
2482 *err = REG_BADBR;
2483 return NULL;
2484 }
2485 parse_dup_op_rollback:
2486 re_string_set_index (regexp, start_idx);
2487 *token = start_token;
2488 token->type = CHARACTER;
2489 return dup_elem;
2490 }
2491
2492 /* Size of the names for collating symbol/equivalence_class/character_class.
2493 I'm not sure, but maybe enough. */
2494 #define BRACKET_NAME_BUF_SIZE 32
2495
2496 #ifndef _LIBC
2497 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2498 Build the range expression which starts from START_ELEM, and ends
2499 at END_ELEM. The result are written to MBCSET and SBCSET.
2500 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2501 mbcset->range_ends, is a pointer argument sinse we may
2502 update it. */
2503
2504 static reg_errcode_t
2505 # ifdef RE_ENABLE_I18N
2506 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2507 re_charset_t *mbcset;
2508 int *range_alloc;
2509 # else /* not RE_ENABLE_I18N */
2510 build_range_exp (sbcset, start_elem, end_elem)
2511 # endif /* not RE_ENABLE_I18N */
2512 re_bitset_ptr_t sbcset;
2513 bracket_elem_t *start_elem, *end_elem;
2514 {
2515 unsigned int start_ch, end_ch;
2516 /* Equivalence Classes and Character Classes can't be a range start/end. */
2517 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2518 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2519 0))
2520 return REG_ERANGE;
2521
2522 /* We can handle no multi character collating elements without libc
2523 support. */
2524 if (BE ((start_elem->type == COLL_SYM
2525 && strlen ((char *) start_elem->opr.name) > 1)
2526 || (end_elem->type == COLL_SYM
2527 && strlen ((char *) end_elem->opr.name) > 1), 0))
2528 return REG_ECOLLATE;
2529
2530 # ifdef RE_ENABLE_I18N
2531 {
2532 wchar_t wc, start_wc, end_wc;
2533 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2534
2535 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2536 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2537 : 0));
2538 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2539 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2540 : 0));
2541 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2542 ? __btowc (start_ch) : start_elem->opr.wch);
2543 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2544 ? __btowc (end_ch) : end_elem->opr.wch);
2545 cmp_buf[0] = start_wc;
2546 cmp_buf[4] = end_wc;
2547 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2548 return REG_ERANGE;
2549
2550 /* Check the space of the arrays. */
2551 if (*range_alloc == mbcset->nranges)
2552 {
2553 /* There are not enough space, need realloc. */
2554 wchar_t *new_array_start, *new_array_end;
2555 int new_nranges;
2556
2557 /* +1 in case of mbcset->nranges is 0. */
2558 new_nranges = 2 * mbcset->nranges + 1;
2559 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2560 are NULL if *range_alloc == 0. */
2561 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2562 new_nranges);
2563 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2564 new_nranges);
2565
2566 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2567 return REG_ESPACE;
2568
2569 mbcset->range_starts = new_array_start;
2570 mbcset->range_ends = new_array_end;
2571 *range_alloc = new_nranges;
2572 }
2573
2574 mbcset->range_starts[mbcset->nranges] = start_wc;
2575 mbcset->range_ends[mbcset->nranges++] = end_wc;
2576
2577 /* Build the table for single byte characters. */
2578 for (wc = 0; wc <= SBC_MAX; ++wc)
2579 {
2580 cmp_buf[2] = wc;
2581 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2582 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2583 bitset_set (sbcset, wc);
2584 }
2585 }
2586 # else /* not RE_ENABLE_I18N */
2587 {
2588 unsigned int ch;
2589 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2590 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2591 : 0));
2592 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2593 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2594 : 0));
2595 if (start_ch > end_ch)
2596 return REG_ERANGE;
2597 /* Build the table for single byte characters. */
2598 for (ch = 0; ch <= SBC_MAX; ++ch)
2599 if (start_ch <= ch && ch <= end_ch)
2600 bitset_set (sbcset, ch);
2601 }
2602 # endif /* not RE_ENABLE_I18N */
2603 return REG_NOERROR;
2604 }
2605 #endif /* not _LIBC */
2606
2607 #ifndef _LIBC
2608 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2609 Build the collating element which is represented by NAME.
2610 The result are written to MBCSET and SBCSET.
2611 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2612 pointer argument since we may update it. */
2613
2614 static reg_errcode_t
2615 # ifdef RE_ENABLE_I18N
2616 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2617 re_charset_t *mbcset;
2618 int *coll_sym_alloc;
2619 # else /* not RE_ENABLE_I18N */
2620 build_collating_symbol (sbcset, name)
2621 # endif /* not RE_ENABLE_I18N */
2622 re_bitset_ptr_t sbcset;
2623 const unsigned char *name;
2624 {
2625 size_t name_len = strlen ((const char *) name);
2626 if (BE (name_len != 1, 0))
2627 return REG_ECOLLATE;
2628 else
2629 {
2630 bitset_set (sbcset, name[0]);
2631 return REG_NOERROR;
2632 }
2633 }
2634 #endif /* not _LIBC */
2635
2636 /* This function parse bracket expression like "[abc]", "[a-c]",
2637 "[[.a-a.]]" etc. */
2638
2639 static bin_tree_t *
2640 parse_bracket_exp (regexp, dfa, token, syntax, err)
2641 re_string_t *regexp;
2642 re_dfa_t *dfa;
2643 re_token_t *token;
2644 reg_syntax_t syntax;
2645 reg_errcode_t *err;
2646 {
2647 #ifdef _LIBC
2648 const unsigned char *collseqmb;
2649 const char *collseqwc;
2650 uint32_t nrules;
2651 int32_t table_size;
2652 const int32_t *symb_table;
2653 const unsigned char *extra;
2654
2655 /* Local function for parse_bracket_exp used in _LIBC environement.
2656 Seek the collating symbol entry correspondings to NAME.
2657 Return the index of the symbol in the SYMB_TABLE. */
2658
2659 static inline int32_t
2660 __attribute ((always_inline))
2661 seek_collating_symbol_entry (name, name_len)
2662 const unsigned char *name;
2663 size_t name_len;
2664 {
2665 int32_t hash = elem_hash ((const char *) name, name_len);
2666 int32_t elem = hash % table_size;
2667 int32_t second = hash % (table_size - 2);
2668 while (symb_table[2 * elem] != 0)
2669 {
2670 /* First compare the hashing value. */
2671 if (symb_table[2 * elem] == hash
2672 /* Compare the length of the name. */
2673 && name_len == extra[symb_table[2 * elem + 1]]
2674 /* Compare the name. */
2675 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2676 name_len) == 0)
2677 {
2678 /* Yep, this is the entry. */
2679 break;
2680 }
2681
2682 /* Next entry. */
2683 elem += second;
2684 }
2685 return elem;
2686 }
2687
2688 /* Local function for parse_bracket_exp used in _LIBC environement.
2689 Look up the collation sequence value of BR_ELEM.
2690 Return the value if succeeded, UINT_MAX otherwise. */
2691
2692 static inline unsigned int
2693 __attribute ((always_inline))
2694 lookup_collation_sequence_value (br_elem)
2695 bracket_elem_t *br_elem;
2696 {
2697 if (br_elem->type == SB_CHAR)
2698 {
2699 /*
2700 if (MB_CUR_MAX == 1)
2701 */
2702 if (nrules == 0)
2703 return collseqmb[br_elem->opr.ch];
2704 else
2705 {
2706 wint_t wc = __btowc (br_elem->opr.ch);
2707 return __collseq_table_lookup (collseqwc, wc);
2708 }
2709 }
2710 else if (br_elem->type == MB_CHAR)
2711 {
2712 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2713 }
2714 else if (br_elem->type == COLL_SYM)
2715 {
2716 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2717 if (nrules != 0)
2718 {
2719 int32_t elem, idx;
2720 elem = seek_collating_symbol_entry (br_elem->opr.name,
2721 sym_name_len);
2722 if (symb_table[2 * elem] != 0)
2723 {
2724 /* We found the entry. */
2725 idx = symb_table[2 * elem + 1];
2726 /* Skip the name of collating element name. */
2727 idx += 1 + extra[idx];
2728 /* Skip the byte sequence of the collating element. */
2729 idx += 1 + extra[idx];
2730 /* Adjust for the alignment. */
2731 idx = (idx + 3) & ~3;
2732 /* Skip the multibyte collation sequence value. */
2733 idx += sizeof (unsigned int);
2734 /* Skip the wide char sequence of the collating element. */
2735 idx += sizeof (unsigned int) *
2736 (1 + *(unsigned int *) (extra + idx));
2737 /* Return the collation sequence value. */
2738 return *(unsigned int *) (extra + idx);
2739 }
2740 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2741 {
2742 /* No valid character. Match it as a single byte
2743 character. */
2744 return collseqmb[br_elem->opr.name[0]];
2745 }
2746 }
2747 else if (sym_name_len == 1)
2748 return collseqmb[br_elem->opr.name[0]];
2749 }
2750 return UINT_MAX;
2751 }
2752
2753 /* Local function for parse_bracket_exp used in _LIBC environement.
2754 Build the range expression which starts from START_ELEM, and ends
2755 at END_ELEM. The result are written to MBCSET and SBCSET.
2756 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2757 mbcset->range_ends, is a pointer argument sinse we may
2758 update it. */
2759
2760 static inline reg_errcode_t
2761 __attribute ((always_inline))
2762 # ifdef RE_ENABLE_I18N
2763 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2764 re_charset_t *mbcset;
2765 int *range_alloc;
2766 # else /* not RE_ENABLE_I18N */
2767 build_range_exp (sbcset, start_elem, end_elem)
2768 # endif /* not RE_ENABLE_I18N */
2769 re_bitset_ptr_t sbcset;
2770 bracket_elem_t *start_elem, *end_elem;
2771 {
2772 unsigned int ch;
2773 uint32_t start_collseq;
2774 uint32_t end_collseq;
2775
2776 # ifdef RE_ENABLE_I18N
2777 /* Check the space of the arrays. */
2778 if (*range_alloc == mbcset->nranges)
2779 {
2780 /* There are not enough space, need realloc. */
2781 uint32_t *new_array_start;
2782 uint32_t *new_array_end;
2783 int new_nranges;
2784
2785 /* +1 in case of mbcset->nranges is 0. */
2786 new_nranges = 2 * mbcset->nranges + 1;
2787 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2788 are NULL if *range_alloc == 0. */
2789 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2790 new_nranges);
2791 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2792 new_nranges);
2793
2794 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2795 return REG_ESPACE;
2796
2797 mbcset->range_starts = new_array_start;
2798 mbcset->range_ends = new_array_end;
2799 *range_alloc = new_nranges;
2800 }
2801 # endif /* RE_ENABLE_I18N */
2802
2803 /* Equivalence Classes and Character Classes can't be a range
2804 start/end. */
2805 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2806 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2807 0))
2808 return REG_ERANGE;
2809
2810 start_collseq = lookup_collation_sequence_value (start_elem);
2811 end_collseq = lookup_collation_sequence_value (end_elem);
2812 /* Check start/end collation sequence values. */
2813 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2814 return REG_ECOLLATE;
2815 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2816 return REG_ERANGE;
2817
2818 # ifdef RE_ENABLE_I18N
2819 /* Got valid collation sequence values, add them as a new entry. */
2820 mbcset->range_starts[mbcset->nranges] = start_collseq;
2821 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2822 # endif /* RE_ENABLE_I18N */
2823
2824 /* Build the table for single byte characters. */
2825 for (ch = 0; ch <= SBC_MAX; ch++)
2826 {
2827 uint32_t ch_collseq;
2828 /*
2829 if (MB_CUR_MAX == 1)
2830 */
2831 if (nrules == 0)
2832 ch_collseq = collseqmb[ch];
2833 else
2834 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2835 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2836 bitset_set (sbcset, ch);
2837 }
2838 return REG_NOERROR;
2839 }
2840
2841 /* Local function for parse_bracket_exp used in _LIBC environement.
2842 Build the collating element which is represented by NAME.
2843 The result are written to MBCSET and SBCSET.
2844 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2845 pointer argument sinse we may update it. */
2846
2847 static inline reg_errcode_t
2848 __attribute ((always_inline))
2849 # ifdef RE_ENABLE_I18N
2850 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2851 re_charset_t *mbcset;
2852 int *coll_sym_alloc;
2853 # else /* not RE_ENABLE_I18N */
2854 build_collating_symbol (sbcset, name)
2855 # endif /* not RE_ENABLE_I18N */
2856 re_bitset_ptr_t sbcset;
2857 const unsigned char *name;
2858 {
2859 int32_t elem, idx;
2860 size_t name_len = strlen ((const char *) name);
2861 if (nrules != 0)
2862 {
2863 elem = seek_collating_symbol_entry (name, name_len);
2864 if (symb_table[2 * elem] != 0)
2865 {
2866 /* We found the entry. */
2867 idx = symb_table[2 * elem + 1];
2868 /* Skip the name of collating element name. */
2869 idx += 1 + extra[idx];
2870 }
2871 else if (symb_table[2 * elem] == 0 && name_len == 1)
2872 {
2873 /* No valid character, treat it as a normal
2874 character. */
2875 bitset_set (sbcset, name[0]);
2876 return REG_NOERROR;
2877 }
2878 else
2879 return REG_ECOLLATE;
2880
2881 # ifdef RE_ENABLE_I18N
2882 /* Got valid collation sequence, add it as a new entry. */
2883 /* Check the space of the arrays. */
2884 if (*coll_sym_alloc == mbcset->ncoll_syms)
2885 {
2886 /* Not enough, realloc it. */
2887 /* +1 in case of mbcset->ncoll_syms is 0. */
2888 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2889 /* Use realloc since mbcset->coll_syms is NULL
2890 if *alloc == 0. */
2891 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2892 *coll_sym_alloc);
2893 if (BE (mbcset->coll_syms == NULL, 0))
2894 return REG_ESPACE;
2895 }
2896 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2897 # endif /* RE_ENABLE_I18N */
2898 return REG_NOERROR;
2899 }
2900 else
2901 {
2902 if (BE (name_len != 1, 0))
2903 return REG_ECOLLATE;
2904 else
2905 {
2906 bitset_set (sbcset, name[0]);
2907 return REG_NOERROR;
2908 }
2909 }
2910 }
2911 #endif
2912
2913 re_token_t br_token;
2914 re_bitset_ptr_t sbcset;
2915 #ifdef RE_ENABLE_I18N
2916 re_charset_t *mbcset;
2917 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2918 int equiv_class_alloc = 0, char_class_alloc = 0;
2919 #else /* not RE_ENABLE_I18N */
2920 int non_match = 0;
2921 #endif /* not RE_ENABLE_I18N */
2922 bin_tree_t *work_tree;
2923 int token_len, new_idx;
2924 #ifdef _LIBC
2925 collseqmb = (const unsigned char *)
2926 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2927 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2928 if (nrules)
2929 {
2930 /*
2931 if (MB_CUR_MAX > 1)
2932 */
2933 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2934 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2935 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2936 _NL_COLLATE_SYMB_TABLEMB);
2937 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2938 _NL_COLLATE_SYMB_EXTRAMB);
2939 }
2940 #endif
2941 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2942 #ifdef RE_ENABLE_I18N
2943 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2944 #endif /* RE_ENABLE_I18N */
2945 #ifdef RE_ENABLE_I18N
2946 if (BE (sbcset == NULL || mbcset == NULL, 0))
2947 #else
2948 if (BE (sbcset == NULL, 0))
2949 #endif /* RE_ENABLE_I18N */
2950 {
2951 *err = REG_ESPACE;
2952 return NULL;
2953 }
2954
2955 token_len = peek_token_bracket (token, regexp, syntax);
2956 if (BE (token->type == END_OF_RE, 0))
2957 {
2958 *err = REG_BADPAT;
2959 goto parse_bracket_exp_free_return;
2960 }
2961 if (token->type == OP_NON_MATCH_LIST)
2962 {
2963 #ifdef RE_ENABLE_I18N
2964 int i;
2965 mbcset->non_match = 1;
2966 #else /* not RE_ENABLE_I18N */
2967 non_match = 1;
2968 #endif /* not RE_ENABLE_I18N */
2969 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2970 bitset_set (sbcset, '\0');
2971 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2972 token_len = peek_token_bracket (token, regexp, syntax);
2973 if (BE (token->type == END_OF_RE, 0))
2974 {
2975 *err = REG_BADPAT;
2976 goto parse_bracket_exp_free_return;
2977 }
2978 #ifdef RE_ENABLE_I18N
2979 if (dfa->mb_cur_max > 1)
2980 for (i = 0; i < SBC_MAX; ++i)
2981 if (__btowc (i) == WEOF)
2982 bitset_set (sbcset, i);
2983 #endif /* RE_ENABLE_I18N */
2984 }
2985
2986 /* We treat the first ']' as a normal character. */
2987 if (token->type == OP_CLOSE_BRACKET)
2988 token->type = CHARACTER;
2989
2990 int first_round = 1;
2991 while (1)
2992 {
2993 bracket_elem_t start_elem, end_elem;
2994 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2995 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2996 reg_errcode_t ret;
2997 int token_len2 = 0, is_range_exp = 0;
2998 re_token_t token2;
2999
3000 start_elem.opr.name = start_name_buf;
3001 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3002 syntax, first_round);
3003 if (BE (ret != REG_NOERROR, 0))
3004 {
3005 *err = ret;
3006 goto parse_bracket_exp_free_return;
3007 }
3008 first_round = 0;
3009
3010 /* Get information about the next token. We need it in any case. */
3011 token_len = peek_token_bracket (token, regexp, syntax);
3012
3013 /* Do not check for ranges if we know they are not allowed. */
3014 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3015 {
3016 if (BE (token->type == END_OF_RE, 0))
3017 {
3018 *err = REG_EBRACK;
3019 goto parse_bracket_exp_free_return;
3020 }
3021 if (token->type == OP_CHARSET_RANGE)
3022 {
3023 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3024 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3025 if (BE (token2.type == END_OF_RE, 0))
3026 {
3027 *err = REG_EBRACK;
3028 goto parse_bracket_exp_free_return;
3029 }
3030 if (token2.type == OP_CLOSE_BRACKET)
3031 {
3032 /* We treat the last '-' as a normal character. */
3033 re_string_skip_bytes (regexp, -token_len);
3034 token->type = CHARACTER;
3035 }
3036 else
3037 is_range_exp = 1;
3038 }
3039 }
3040
3041 if (is_range_exp == 1)
3042 {
3043 end_elem.opr.name = end_name_buf;
3044 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3045 dfa, syntax, 1);
3046 if (BE (ret != REG_NOERROR, 0))
3047 {
3048 *err = ret;
3049 goto parse_bracket_exp_free_return;
3050 }
3051
3052 token_len = peek_token_bracket (token, regexp, syntax);
3053
3054 *err = build_range_exp (sbcset,
3055 #ifdef RE_ENABLE_I18N
3056 mbcset, &range_alloc,
3057 #endif /* RE_ENABLE_I18N */
3058 &start_elem, &end_elem);
3059 if (BE (*err != REG_NOERROR, 0))
3060 goto parse_bracket_exp_free_return;
3061 }
3062 else
3063 {
3064 switch (start_elem.type)
3065 {
3066 case SB_CHAR:
3067 bitset_set (sbcset, start_elem.opr.ch);
3068 break;
3069 #ifdef RE_ENABLE_I18N
3070 case MB_CHAR:
3071 /* Check whether the array has enough space. */
3072 if (mbchar_alloc == mbcset->nmbchars)
3073 {
3074 /* Not enough, realloc it. */
3075 /* +1 in case of mbcset->nmbchars is 0. */
3076 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3077 /* Use realloc since array is NULL if *alloc == 0. */
3078 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
3079 mbchar_alloc);
3080 if (BE (mbcset->mbchars == NULL, 0))
3081 goto parse_bracket_exp_espace;
3082 }
3083 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3084 break;
3085 #endif /* RE_ENABLE_I18N */
3086 case EQUIV_CLASS:
3087 *err = build_equiv_class (sbcset,
3088 #ifdef RE_ENABLE_I18N
3089 mbcset, &equiv_class_alloc,
3090 #endif /* RE_ENABLE_I18N */
3091 start_elem.opr.name);
3092 if (BE (*err != REG_NOERROR, 0))
3093 goto parse_bracket_exp_free_return;
3094 break;
3095 case COLL_SYM:
3096 *err = build_collating_symbol (sbcset,
3097 #ifdef RE_ENABLE_I18N
3098 mbcset, &coll_sym_alloc,
3099 #endif /* RE_ENABLE_I18N */
3100 start_elem.opr.name);
3101 if (BE (*err != REG_NOERROR, 0))
3102 goto parse_bracket_exp_free_return;
3103 break;
3104 case CHAR_CLASS:
3105 *err = build_charclass (regexp->trans, sbcset,
3106 #ifdef RE_ENABLE_I18N
3107 mbcset, &char_class_alloc,
3108 #endif /* RE_ENABLE_I18N */
3109 start_elem.opr.name, syntax);
3110 if (BE (*err != REG_NOERROR, 0))
3111 goto parse_bracket_exp_free_return;
3112 break;
3113 default:
3114 assert (0);
3115 break;
3116 }
3117 }
3118 if (BE (token->type == END_OF_RE, 0))
3119 {
3120 *err = REG_EBRACK;
3121 goto parse_bracket_exp_free_return;
3122 }
3123 if (token->type == OP_CLOSE_BRACKET)
3124 break;
3125 }
3126
3127 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3128
3129 /* If it is non-matching list. */
3130 #ifdef RE_ENABLE_I18N
3131 if (mbcset->non_match)
3132 #else /* not RE_ENABLE_I18N */
3133 if (non_match)
3134 #endif /* not RE_ENABLE_I18N */
3135 bitset_not (sbcset);
3136
3137 /* Build a tree for simple bracket. */
3138 br_token.type = SIMPLE_BRACKET;
3139 br_token.opr.sbcset = sbcset;
3140 new_idx = re_dfa_add_node (dfa, br_token, 0);
3141 work_tree = create_tree (NULL, NULL, 0, new_idx);
3142 if (BE (new_idx == -1 || work_tree == NULL, 0))
3143 goto parse_bracket_exp_espace;
3144
3145 #ifdef RE_ENABLE_I18N
3146 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3147 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3148 || mbcset->non_match)))
3149 {
3150 re_token_t alt_token;
3151 bin_tree_t *mbc_tree;
3152 /* Build a tree for complex bracket. */
3153 br_token.type = COMPLEX_BRACKET;
3154 br_token.opr.mbcset = mbcset;
3155 dfa->has_mb_node = 1;
3156 new_idx = re_dfa_add_node (dfa, br_token, 0);
3157 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3158 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3159 goto parse_bracket_exp_espace;
3160 /* Then join them by ALT node. */
3161 dfa->has_plural_match = 1;
3162 alt_token.type = OP_ALT;
3163 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3164 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3165 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3166 return work_tree;
3167 }
3168 else
3169 {
3170 free_charset (mbcset);
3171 return work_tree;
3172 }
3173 #else /* not RE_ENABLE_I18N */
3174 return work_tree;
3175 #endif /* not RE_ENABLE_I18N */
3176
3177 parse_bracket_exp_espace:
3178 *err = REG_ESPACE;
3179 parse_bracket_exp_free_return:
3180 re_free (sbcset);
3181 #ifdef RE_ENABLE_I18N
3182 free_charset (mbcset);
3183 #endif /* RE_ENABLE_I18N */
3184 return NULL;
3185 }
3186
3187 /* Parse an element in the bracket expression. */
3188
3189 static reg_errcode_t
3190 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3191 accept_hyphen)
3192 bracket_elem_t *elem;
3193 re_string_t *regexp;
3194 re_token_t *token;
3195 int token_len;
3196 re_dfa_t *dfa;
3197 reg_syntax_t syntax;
3198 int accept_hyphen;
3199 {
3200 #ifdef RE_ENABLE_I18N
3201 int cur_char_size;
3202 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3203 if (cur_char_size > 1)
3204 {
3205 elem->type = MB_CHAR;
3206 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3207 re_string_skip_bytes (regexp, cur_char_size);
3208 return REG_NOERROR;
3209 }
3210 #endif /* RE_ENABLE_I18N */
3211 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3212 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3213 || token->type == OP_OPEN_EQUIV_CLASS)
3214 return parse_bracket_symbol (elem, regexp, token);
3215 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3216 {
3217 /* A '-' must only appear as anything but a range indicator before
3218 the closing bracket. Everything else is an error. */
3219 re_token_t token2;
3220 (void) peek_token_bracket (&token2, regexp, syntax);
3221 if (token2.type != OP_CLOSE_BRACKET)
3222 /* The actual error value is not standardized since this whole
3223 case is undefined. But ERANGE makes good sense. */
3224 return REG_ERANGE;
3225 }
3226 elem->type = SB_CHAR;
3227 elem->opr.ch = token->opr.c;
3228 return REG_NOERROR;
3229 }
3230
3231 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3232 such as [:<character_class>:], [.<collating_element>.], and
3233 [=<equivalent_class>=]. */
3234
3235 static reg_errcode_t
3236 parse_bracket_symbol (elem, regexp, token)
3237 bracket_elem_t *elem;
3238 re_string_t *regexp;
3239 re_token_t *token;
3240 {
3241 unsigned char ch, delim = token->opr.c;
3242 int i = 0;
3243 for (;; ++i)
3244 {
3245 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3246 return REG_EBRACK;
3247 if (token->type == OP_OPEN_CHAR_CLASS)
3248 ch = re_string_fetch_byte_case (regexp);
3249 else
3250 ch = re_string_fetch_byte (regexp);
3251 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3252 break;
3253 elem->opr.name[i] = ch;
3254 }
3255 re_string_skip_bytes (regexp, 1);
3256 elem->opr.name[i] = '\0';
3257 switch (token->type)
3258 {
3259 case OP_OPEN_COLL_ELEM:
3260 elem->type = COLL_SYM;
3261 break;
3262 case OP_OPEN_EQUIV_CLASS:
3263 elem->type = EQUIV_CLASS;
3264 break;
3265 case OP_OPEN_CHAR_CLASS:
3266 elem->type = CHAR_CLASS;
3267 break;
3268 default:
3269 break;
3270 }
3271 return REG_NOERROR;
3272 }
3273
3274 /* Helper function for parse_bracket_exp.
3275 Build the equivalence class which is represented by NAME.
3276 The result are written to MBCSET and SBCSET.
3277 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3278 is a pointer argument sinse we may update it. */
3279
3280 static reg_errcode_t
3281 #ifdef RE_ENABLE_I18N
3282 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3283 re_charset_t *mbcset;
3284 int *equiv_class_alloc;
3285 #else /* not RE_ENABLE_I18N */
3286 build_equiv_class (sbcset, name)
3287 #endif /* not RE_ENABLE_I18N */
3288 re_bitset_ptr_t sbcset;
3289 const unsigned char *name;
3290 {
3291 #if defined _LIBC && defined RE_ENABLE_I18N
3292 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3293 if (nrules != 0)
3294 {
3295 const int32_t *table, *indirect;
3296 const unsigned char *weights, *extra, *cp;
3297 unsigned char char_buf[2];
3298 int32_t idx1, idx2;
3299 unsigned int ch;
3300 size_t len;
3301 /* This #include defines a local function! */
3302 # include <locale/weight.h>
3303 /* Calculate the index for equivalence class. */
3304 cp = name;
3305 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3306 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3307 _NL_COLLATE_WEIGHTMB);
3308 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3309 _NL_COLLATE_EXTRAMB);
3310 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3311 _NL_COLLATE_INDIRECTMB);
3312 idx1 = findidx (&cp);
3313 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3314 /* This isn't a valid character. */
3315 return REG_ECOLLATE;
3316
3317 /* Build single byte matcing table for this equivalence class. */
3318 char_buf[1] = (unsigned char) '\0';
3319 len = weights[idx1];
3320 for (ch = 0; ch < SBC_MAX; ++ch)
3321 {
3322 char_buf[0] = ch;
3323 cp = char_buf;
3324 idx2 = findidx (&cp);
3325 /*
3326 idx2 = table[ch];
3327 */
3328 if (idx2 == 0)
3329 /* This isn't a valid character. */
3330 continue;
3331 if (len == weights[idx2])
3332 {
3333 int cnt = 0;
3334 while (cnt <= len &&
3335 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3336 ++cnt;
3337
3338 if (cnt > len)
3339 bitset_set (sbcset, ch);
3340 }
3341 }
3342 /* Check whether the array has enough space. */
3343 if (*equiv_class_alloc == mbcset->nequiv_classes)
3344 {
3345 /* Not enough, realloc it. */
3346 /* +1 in case of mbcset->nequiv_classes is 0. */
3347 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3348 /* Use realloc since the array is NULL if *alloc == 0. */
3349 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3350 *equiv_class_alloc);
3351 if (BE (mbcset->equiv_classes == NULL, 0))
3352 return REG_ESPACE;
3353 }
3354 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3355 }
3356 else
3357 #endif /* _LIBC && RE_ENABLE_I18N */
3358 {
3359 if (BE (strlen ((const char *) name) != 1, 0))
3360 return REG_ECOLLATE;
3361 bitset_set (sbcset, *name);
3362 }
3363 return REG_NOERROR;
3364 }
3365
3366 /* Helper function for parse_bracket_exp.
3367 Build the character class which is represented by NAME.
3368 The result are written to MBCSET and SBCSET.
3369 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3370 is a pointer argument sinse we may update it. */
3371
3372 static reg_errcode_t
3373 #ifdef RE_ENABLE_I18N
3374 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3375 re_charset_t *mbcset;
3376 int *char_class_alloc;
3377 #else /* not RE_ENABLE_I18N */
3378 build_charclass (trans, sbcset, class_name, syntax)
3379 #endif /* not RE_ENABLE_I18N */
3380 RE_TRANSLATE_TYPE trans;
3381 re_bitset_ptr_t sbcset;
3382 const unsigned char *class_name;
3383 reg_syntax_t syntax;
3384 {
3385 int i;
3386 const char *name = (const char *) class_name;
3387
3388 /* In case of REG_ICASE "upper" and "lower" match the both of
3389 upper and lower cases. */
3390 if ((syntax & RE_ICASE)
3391 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3392 name = "alpha";
3393
3394 #ifdef RE_ENABLE_I18N
3395 /* Check the space of the arrays. */
3396 if (*char_class_alloc == mbcset->nchar_classes)
3397 {
3398 /* Not enough, realloc it. */
3399 /* +1 in case of mbcset->nchar_classes is 0. */
3400 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3401 /* Use realloc since array is NULL if *alloc == 0. */
3402 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3403 *char_class_alloc);
3404 if (BE (mbcset->char_classes == NULL, 0))
3405 return REG_ESPACE;
3406 }
3407 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3408 #endif /* RE_ENABLE_I18N */
3409
3410 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3411 for (i = 0; i < SBC_MAX; ++i) \
3412 { \
3413 if (ctype_func (i)) \
3414 { \
3415 int ch = trans ? trans[i] : i; \
3416 bitset_set (sbcset, ch); \
3417 } \
3418 }
3419
3420 if (strcmp (name, "alnum") == 0)
3421 BUILD_CHARCLASS_LOOP (isalnum)
3422 else if (strcmp (name, "cntrl") == 0)
3423 BUILD_CHARCLASS_LOOP (iscntrl)
3424 else if (strcmp (name, "lower") == 0)
3425 BUILD_CHARCLASS_LOOP (islower)
3426 else if (strcmp (name, "space") == 0)
3427 BUILD_CHARCLASS_LOOP (isspace)
3428 else if (strcmp (name, "alpha") == 0)
3429 BUILD_CHARCLASS_LOOP (isalpha)
3430 else if (strcmp (name, "digit") == 0)
3431 BUILD_CHARCLASS_LOOP (isdigit)
3432 else if (strcmp (name, "print") == 0)
3433 BUILD_CHARCLASS_LOOP (isprint)
3434 else if (strcmp (name, "upper") == 0)
3435 BUILD_CHARCLASS_LOOP (isupper)
3436 else if (strcmp (name, "blank") == 0)
3437 BUILD_CHARCLASS_LOOP (isblank)
3438 else if (strcmp (name, "graph") == 0)
3439 BUILD_CHARCLASS_LOOP (isgraph)
3440 else if (strcmp (name, "punct") == 0)
3441 BUILD_CHARCLASS_LOOP (ispunct)
3442 else if (strcmp (name, "xdigit") == 0)
3443 BUILD_CHARCLASS_LOOP (isxdigit)
3444 else
3445 return REG_ECTYPE;
3446
3447 return REG_NOERROR;
3448 }
3449
3450 static bin_tree_t *
3451 build_charclass_op (dfa, trans, class_name, extra, not, err)
3452 re_dfa_t *dfa;
3453 RE_TRANSLATE_TYPE trans;
3454 const unsigned char *class_name;
3455 const unsigned char *extra;
3456 int not;
3457 reg_errcode_t *err;
3458 {
3459 re_bitset_ptr_t sbcset;
3460 #ifdef RE_ENABLE_I18N
3461 re_charset_t *mbcset;
3462 int alloc = 0;
3463 #else /* not RE_ENABLE_I18N */
3464 int non_match = 0;
3465 #endif /* not RE_ENABLE_I18N */
3466 reg_errcode_t ret;
3467 re_token_t br_token;
3468 bin_tree_t *tree;
3469 int new_idx;
3470
3471 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3472 #ifdef RE_ENABLE_I18N
3473 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3474 #endif /* RE_ENABLE_I18N */
3475
3476 #ifdef RE_ENABLE_I18N
3477 if (BE (sbcset == NULL || mbcset == NULL, 0))
3478 #else /* not RE_ENABLE_I18N */
3479 if (BE (sbcset == NULL, 0))
3480 #endif /* not RE_ENABLE_I18N */
3481 {
3482 *err = REG_ESPACE;
3483 return NULL;
3484 }
3485
3486 if (not)
3487 {
3488 #ifdef RE_ENABLE_I18N
3489 int i;
3490 /*
3491 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3492 bitset_set(cset->sbcset, '\0');
3493 */
3494 mbcset->non_match = 1;
3495 if (dfa->mb_cur_max > 1)
3496 for (i = 0; i < SBC_MAX; ++i)
3497 if (__btowc (i) == WEOF)
3498 bitset_set (sbcset, i);
3499 #else /* not RE_ENABLE_I18N */
3500 non_match = 1;
3501 #endif /* not RE_ENABLE_I18N */
3502 }
3503
3504 /* We don't care the syntax in this case. */
3505 ret = build_charclass (trans, sbcset,
3506 #ifdef RE_ENABLE_I18N
3507 mbcset, &alloc,
3508 #endif /* RE_ENABLE_I18N */
3509 class_name, 0);
3510
3511 if (BE (ret != REG_NOERROR, 0))
3512 {
3513 re_free (sbcset);
3514 #ifdef RE_ENABLE_I18N
3515 free_charset (mbcset);
3516 #endif /* RE_ENABLE_I18N */
3517 *err = ret;
3518 return NULL;
3519 }
3520 /* \w match '_' also. */
3521 for (; *extra; extra++)
3522 bitset_set (sbcset, *extra);
3523
3524 /* If it is non-matching list. */
3525 #ifdef RE_ENABLE_I18N
3526 if (mbcset->non_match)
3527 #else /* not RE_ENABLE_I18N */
3528 if (non_match)
3529 #endif /* not RE_ENABLE_I18N */
3530 bitset_not (sbcset);
3531
3532 /* Build a tree for simple bracket. */
3533 br_token.type = SIMPLE_BRACKET;
3534 br_token.opr.sbcset = sbcset;
3535 new_idx = re_dfa_add_node (dfa, br_token, 0);
3536 tree = create_tree (NULL, NULL, 0, new_idx);
3537 if (BE (new_idx == -1 || tree == NULL, 0))
3538 goto build_word_op_espace;
3539
3540 #ifdef RE_ENABLE_I18N
3541 if (dfa->mb_cur_max > 1)
3542 {
3543 re_token_t alt_token;
3544 bin_tree_t *mbc_tree;
3545 /* Build a tree for complex bracket. */
3546 br_token.type = COMPLEX_BRACKET;
3547 br_token.opr.mbcset = mbcset;
3548 dfa->has_mb_node = 1;
3549 new_idx = re_dfa_add_node (dfa, br_token, 0);
3550 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3551 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3552 goto build_word_op_espace;
3553 /* Then join them by ALT node. */
3554 alt_token.type = OP_ALT;
3555 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3556 tree = create_tree (tree, mbc_tree, 0, new_idx);
3557 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3558 return tree;
3559 }
3560 else
3561 {
3562 free_charset (mbcset);
3563 return tree;
3564 }
3565 #else /* not RE_ENABLE_I18N */
3566 return tree;
3567 #endif /* not RE_ENABLE_I18N */
3568
3569 build_word_op_espace:
3570 re_free (sbcset);
3571 #ifdef RE_ENABLE_I18N
3572 free_charset (mbcset);
3573 #endif /* RE_ENABLE_I18N */
3574 *err = REG_ESPACE;
3575 return NULL;
3576 }
3577
3578 /* This is intended for the expressions like "a{1,3}".
3579 Fetch a number from `input', and return the number.
3580 Return -1, if the number field is empty like "{,1}".
3581 Return -2, If an error is occured. */
3582
3583 static int
3584 fetch_number (input, token, syntax)
3585 re_string_t *input;
3586 re_token_t *token;
3587 reg_syntax_t syntax;
3588 {
3589 int num = -1;
3590 unsigned char c;
3591 while (1)
3592 {
3593 *token = fetch_token (input, syntax);
3594 c = token->opr.c;
3595 if (BE (token->type == END_OF_RE, 0))
3596 return -2;
3597 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3598 break;
3599 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3600 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3601 num = (num > RE_DUP_MAX) ? -2 : num;
3602 }
3603 return num;
3604 }
3605 \f
3606 #ifdef RE_ENABLE_I18N
3607 static void
3608 free_charset (re_charset_t *cset)
3609 {
3610 re_free (cset->mbchars);
3611 # ifdef _LIBC
3612 re_free (cset->coll_syms);
3613 re_free (cset->equiv_classes);
3614 re_free (cset->range_starts);
3615 re_free (cset->range_ends);
3616 # endif
3617 re_free (cset->char_classes);
3618 re_free (cset);
3619 }
3620 #endif /* RE_ENABLE_I18N */
3621 \f
3622 /* Functions for binary tree operation. */
3623
3624 /* Create a node of tree.
3625 Note: This function automatically free left and right if malloc fails. */
3626
3627 static bin_tree_t *
3628 create_tree (left, right, type, index)
3629 bin_tree_t *left;
3630 bin_tree_t *right;
3631 re_token_type_t type;
3632 int index;
3633 {
3634 bin_tree_t *tree;
3635 tree = re_malloc (bin_tree_t, 1);
3636 if (BE (tree == NULL, 0))
3637 {
3638 free_bin_tree (left);
3639 free_bin_tree (right);
3640 return NULL;
3641 }
3642 tree->parent = NULL;
3643 tree->left = left;
3644 tree->right = right;
3645 tree->type = type;
3646 tree->node_idx = index;
3647 tree->first = -1;
3648 tree->next = -1;
3649 re_node_set_init_empty (&tree->eclosure);
3650
3651 if (left != NULL)
3652 left->parent = tree;
3653 if (right != NULL)
3654 right->parent = tree;
3655 return tree;
3656 }
3657
3658 /* Free the sub tree pointed by TREE. */
3659
3660 static void
3661 free_bin_tree (tree)
3662 bin_tree_t *tree;
3663 {
3664 if (tree == NULL)
3665 return;
3666 /*re_node_set_free (&tree->eclosure);*/
3667 free_bin_tree (tree->left);
3668 free_bin_tree (tree->right);
3669 re_free (tree);
3670 }
3671
3672 /* Duplicate the node SRC, and return new node. */
3673
3674 static bin_tree_t *
3675 duplicate_tree (src, dfa)
3676 const bin_tree_t *src;
3677 re_dfa_t *dfa;
3678 {
3679 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3680 int new_node_idx;
3681 /* Since node indies must be according to Post-order of the tree,
3682 we must duplicate the left at first. */
3683 if (src->left != NULL)
3684 {
3685 left = duplicate_tree (src->left, dfa);
3686 if (left == NULL)
3687 return NULL;
3688 }
3689
3690 /* Secondaly, duplicate the right. */
3691 if (src->right != NULL)
3692 {
3693 right = duplicate_tree (src->right, dfa);
3694 if (right == NULL)
3695 {
3696 free_bin_tree (left);
3697 return NULL;
3698 }
3699 }
3700
3701 /* At last, duplicate itself. */
3702 if (src->type == NON_TYPE)
3703 {
3704 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3705 dfa->nodes[new_node_idx].duplicated = 1;
3706 if (BE (new_node_idx == -1, 0))
3707 {
3708 free_bin_tree (left);
3709 free_bin_tree (right);
3710 return NULL;
3711 }
3712 }
3713 else
3714 new_node_idx = src->type;
3715
3716 new_tree = create_tree (left, right, src->type, new_node_idx);
3717 if (BE (new_tree == NULL, 0))
3718 {
3719 free_bin_tree (left);
3720 free_bin_tree (right);
3721 }
3722 return new_tree;
3723 }