]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
Update.
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (re_dfa_t *dfa);
37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
42 int top_clone_node, int root_node,
43 unsigned int constraint);
44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
45 unsigned int constraint);
46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
50 int node, int root);
51 static void calc_inveclosure (re_dfa_t *dfa);
52 static int fetch_number (re_string_t *input, re_token_t *token,
53 reg_syntax_t syntax);
54 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
55 static int peek_token (re_token_t *token, re_string_t *input,
56 reg_syntax_t syntax);
57 static int peek_token_bracket (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax);
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 int accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifndef _LIBC
89 # ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
91 re_charset_t *mbcset, int *range_alloc,
92 bracket_elem_t *start_elem,
93 bracket_elem_t *end_elem);
94 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
95 re_charset_t *mbcset,
96 int *coll_sym_alloc,
97 const unsigned char *name);
98 # else /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
100 bracket_elem_t *start_elem,
101 bracket_elem_t *end_elem);
102 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
103 const unsigned char *name);
104 # endif /* not RE_ENABLE_I18N */
105 #endif /* not _LIBC */
106 #ifdef RE_ENABLE_I18N
107 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
108 re_charset_t *mbcset,
109 int *equiv_class_alloc,
110 const unsigned char *name);
111 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
112 re_bitset_ptr_t sbcset,
113 re_charset_t *mbcset,
114 int *char_class_alloc,
115 const unsigned char *class_name,
116 reg_syntax_t syntax);
117 #else /* not RE_ENABLE_I18N */
118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119 const unsigned char *name);
120 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
121 re_bitset_ptr_t sbcset,
122 const unsigned char *class_name,
123 reg_syntax_t syntax);
124 #endif /* not RE_ENABLE_I18N */
125 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
126 const unsigned char *class_name,
127 const unsigned char *extra, int not,
128 reg_errcode_t *err);
129 static void free_bin_tree (bin_tree_t *tree);
130 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
131 re_token_type_t type, int index);
132 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
133 \f
134 /* This table gives an error message for each of the error codes listed
135 in regex.h. Obviously the order here has to be same as there.
136 POSIX doesn't require that we do anything for REG_NOERROR,
137 but why not be nice? */
138
139 const char __re_error_msgid[] attribute_hidden =
140 {
141 #define REG_NOERROR_IDX 0
142 gettext_noop ("Success") /* REG_NOERROR */
143 "\0"
144 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
145 gettext_noop ("No match") /* REG_NOMATCH */
146 "\0"
147 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
148 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
149 "\0"
150 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
151 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
152 "\0"
153 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
154 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
155 "\0"
156 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
157 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
158 "\0"
159 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
160 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
161 "\0"
162 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
163 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
164 "\0"
165 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
166 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
167 "\0"
168 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
169 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
170 "\0"
171 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
172 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
173 "\0"
174 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
175 gettext_noop ("Invalid range end") /* REG_ERANGE */
176 "\0"
177 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
178 gettext_noop ("Memory exhausted") /* REG_ESPACE */
179 "\0"
180 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
181 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
182 "\0"
183 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
184 gettext_noop ("Premature end of regular expression") /* REG_EEND */
185 "\0"
186 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
187 gettext_noop ("Regular expression too big") /* REG_ESIZE */
188 "\0"
189 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
190 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
191 };
192
193 const size_t __re_error_msgid_idx[] attribute_hidden =
194 {
195 REG_NOERROR_IDX,
196 REG_NOMATCH_IDX,
197 REG_BADPAT_IDX,
198 REG_ECOLLATE_IDX,
199 REG_ECTYPE_IDX,
200 REG_EESCAPE_IDX,
201 REG_ESUBREG_IDX,
202 REG_EBRACK_IDX,
203 REG_EPAREN_IDX,
204 REG_EBRACE_IDX,
205 REG_BADBR_IDX,
206 REG_ERANGE_IDX,
207 REG_ESPACE_IDX,
208 REG_BADRPT_IDX,
209 REG_EEND_IDX,
210 REG_ESIZE_IDX,
211 REG_ERPAREN_IDX
212 };
213 \f
214 /* Entry points for GNU code. */
215
216 /* re_compile_pattern is the GNU regular expression compiler: it
217 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
218 Returns 0 if the pattern was valid, otherwise an error string.
219
220 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
221 are set in BUFP on entry. */
222
223 const char *
224 re_compile_pattern (pattern, length, bufp)
225 const char *pattern;
226 size_t length;
227 struct re_pattern_buffer *bufp;
228 {
229 reg_errcode_t ret;
230
231 /* And GNU code determines whether or not to get register information
232 by passing null for the REGS argument to re_match, etc., not by
233 setting no_sub. */
234 bufp->no_sub = 0;
235
236 /* Match anchors at newline. */
237 bufp->newline_anchor = 1;
238
239 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
240
241 if (!ret)
242 return NULL;
243 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
244 }
245 #ifdef _LIBC
246 weak_alias (__re_compile_pattern, re_compile_pattern)
247 #endif
248
249 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
250 also be assigned to arbitrarily: each pattern buffer stores its own
251 syntax, so it can be changed between regex compilations. */
252 /* This has no initializer because initialized variables in Emacs
253 become read-only after dumping. */
254 reg_syntax_t re_syntax_options;
255
256
257 /* Specify the precise syntax of regexps for compilation. This provides
258 for compatibility for various utilities which historically have
259 different, incompatible syntaxes.
260
261 The argument SYNTAX is a bit mask comprised of the various bits
262 defined in regex.h. We return the old syntax. */
263
264 reg_syntax_t
265 re_set_syntax (syntax)
266 reg_syntax_t syntax;
267 {
268 reg_syntax_t ret = re_syntax_options;
269
270 re_syntax_options = syntax;
271 return ret;
272 }
273 #ifdef _LIBC
274 weak_alias (__re_set_syntax, re_set_syntax)
275 #endif
276
277 int
278 re_compile_fastmap (bufp)
279 struct re_pattern_buffer *bufp;
280 {
281 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
282 char *fastmap = bufp->fastmap;
283
284 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
285 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
286 if (dfa->init_state != dfa->init_state_word)
287 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
288 if (dfa->init_state != dfa->init_state_nl)
289 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
290 if (dfa->init_state != dfa->init_state_begbuf)
291 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
292 bufp->fastmap_accurate = 1;
293 return 0;
294 }
295 #ifdef _LIBC
296 weak_alias (__re_compile_fastmap, re_compile_fastmap)
297 #endif
298
299 static inline void
300 __attribute ((always_inline))
301 re_set_fastmap (char *fastmap, int icase, int ch)
302 {
303 fastmap[ch] = 1;
304 if (icase)
305 fastmap[tolower (ch)] = 1;
306 }
307
308 /* Helper function for re_compile_fastmap.
309 Compile fastmap for the initial_state INIT_STATE. */
310
311 static void
312 re_compile_fastmap_iter (bufp, init_state, fastmap)
313 regex_t *bufp;
314 const re_dfastate_t *init_state;
315 char *fastmap;
316 {
317 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
318 int node_cnt;
319 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
320 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
321 {
322 int node = init_state->nodes.elems[node_cnt];
323 re_token_type_t type = dfa->nodes[node].type;
324
325 if (type == CHARACTER)
326 {
327 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
328 #ifdef RE_ENABLE_I18N
329 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
330 {
331 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
332 wchar_t wc;
333 mbstate_t state;
334
335 p = buf;
336 *p++ = dfa->nodes[node].opr.c;
337 while (++node < dfa->nodes_len
338 && dfa->nodes[node].type == CHARACTER
339 && dfa->nodes[node].mb_partial)
340 *p++ = dfa->nodes[node].opr.c;
341 memset (&state, 0, sizeof (state));
342 if (mbrtowc (&wc, (const char *) buf, p - buf,
343 &state) == p - buf
344 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
345 re_set_fastmap (fastmap, 0, buf[0]);
346 }
347 #endif
348 }
349 else if (type == SIMPLE_BRACKET)
350 {
351 int i, j, ch;
352 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
353 for (j = 0; j < UINT_BITS; ++j, ++ch)
354 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
355 re_set_fastmap (fastmap, icase, ch);
356 }
357 #ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
359 {
360 int i;
361 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
362 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
363 || cset->nranges || cset->nchar_classes)
364 {
365 # ifdef _LIBC
366 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
367 {
368 /* In this case we want to catch the bytes which are
369 the first byte of any collation elements.
370 e.g. In da_DK, we want to catch 'a' since "aa"
371 is a valid collation element, and don't catch
372 'b' since 'b' is the only collation element
373 which starts from 'b'. */
374 int j, ch;
375 const int32_t *table = (const int32_t *)
376 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
378 for (j = 0; j < UINT_BITS; ++j, ++ch)
379 if (table[ch] < 0)
380 re_set_fastmap (fastmap, icase, ch);
381 }
382 # else
383 if (dfa->mb_cur_max > 1)
384 for (i = 0; i < SBC_MAX; ++i)
385 if (__btowc (i) == WEOF)
386 re_set_fastmap (fastmap, icase, i);
387 # endif /* not _LIBC */
388 }
389 for (i = 0; i < cset->nmbchars; ++i)
390 {
391 char buf[256];
392 mbstate_t state;
393 memset (&state, '\0', sizeof (state));
394 __wcrtomb (buf, cset->mbchars[i], &state);
395 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
396 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
397 {
398 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
399 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
400 }
401 }
402 }
403 #endif /* RE_ENABLE_I18N */
404 else if (type == END_OF_RE || type == OP_PERIOD
405 || type == OP_UTF8_PERIOD)
406 {
407 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
408 if (type == END_OF_RE)
409 bufp->can_be_null = 1;
410 return;
411 }
412 }
413 }
414 \f
415 /* Entry point for POSIX code. */
416 /* regcomp takes a regular expression as a string and compiles it.
417
418 PREG is a regex_t *. We do not expect any fields to be initialized,
419 since POSIX says we shouldn't. Thus, we set
420
421 `buffer' to the compiled pattern;
422 `used' to the length of the compiled pattern;
423 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
424 REG_EXTENDED bit in CFLAGS is set; otherwise, to
425 RE_SYNTAX_POSIX_BASIC;
426 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
427 `fastmap' to an allocated space for the fastmap;
428 `fastmap_accurate' to zero;
429 `re_nsub' to the number of subexpressions in PATTERN.
430
431 PATTERN is the address of the pattern string.
432
433 CFLAGS is a series of bits which affect compilation.
434
435 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
436 use POSIX basic syntax.
437
438 If REG_NEWLINE is set, then . and [^...] don't match newline.
439 Also, regexec will try a match beginning after every newline.
440
441 If REG_ICASE is set, then we considers upper- and lowercase
442 versions of letters to be equivalent when matching.
443
444 If REG_NOSUB is set, then when PREG is passed to regexec, that
445 routine will report only success or failure, and nothing about the
446 registers.
447
448 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
449 the return codes and their meanings.) */
450
451 int
452 regcomp (preg, pattern, cflags)
453 regex_t *__restrict preg;
454 const char *__restrict pattern;
455 int cflags;
456 {
457 reg_errcode_t ret;
458 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
459 : RE_SYNTAX_POSIX_BASIC);
460
461 preg->buffer = NULL;
462 preg->allocated = 0;
463 preg->used = 0;
464
465 /* Try to allocate space for the fastmap. */
466 preg->fastmap = re_malloc (char, SBC_MAX);
467 if (BE (preg->fastmap == NULL, 0))
468 return REG_ESPACE;
469
470 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
471
472 /* If REG_NEWLINE is set, newlines are treated differently. */
473 if (cflags & REG_NEWLINE)
474 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
475 syntax &= ~RE_DOT_NEWLINE;
476 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
477 /* It also changes the matching behavior. */
478 preg->newline_anchor = 1;
479 }
480 else
481 preg->newline_anchor = 0;
482 preg->no_sub = !!(cflags & REG_NOSUB);
483 preg->translate = NULL;
484
485 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
486
487 /* POSIX doesn't distinguish between an unmatched open-group and an
488 unmatched close-group: both are REG_EPAREN. */
489 if (ret == REG_ERPAREN)
490 ret = REG_EPAREN;
491
492 /* We have already checked preg->fastmap != NULL. */
493 if (BE (ret == REG_NOERROR, 1))
494 /* Compute the fastmap now, since regexec cannot modify the pattern
495 buffer. This function nevers fails in this implementation. */
496 (void) re_compile_fastmap (preg);
497 else
498 {
499 /* Some error occurred while compiling the expression. */
500 re_free (preg->fastmap);
501 preg->fastmap = NULL;
502 }
503
504 return (int) ret;
505 }
506 #ifdef _LIBC
507 weak_alias (__regcomp, regcomp)
508 #endif
509
510 /* Returns a message corresponding to an error code, ERRCODE, returned
511 from either regcomp or regexec. We don't use PREG here. */
512
513 size_t
514 regerror (errcode, preg, errbuf, errbuf_size)
515 int errcode;
516 const regex_t *preg;
517 char *errbuf;
518 size_t errbuf_size;
519 {
520 const char *msg;
521 size_t msg_size;
522
523 if (BE (errcode < 0
524 || errcode >= (int) (sizeof (__re_error_msgid_idx)
525 / sizeof (__re_error_msgid_idx[0])), 0))
526 /* Only error codes returned by the rest of the code should be passed
527 to this routine. If we are given anything else, or if other regex
528 code generates an invalid error code, then the program has a bug.
529 Dump core so we can fix it. */
530 abort ();
531
532 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
533
534 msg_size = strlen (msg) + 1; /* Includes the null. */
535
536 if (BE (errbuf_size != 0, 1))
537 {
538 if (BE (msg_size > errbuf_size, 0))
539 {
540 #if defined HAVE_MEMPCPY || defined _LIBC
541 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
542 #else
543 memcpy (errbuf, msg, errbuf_size - 1);
544 errbuf[errbuf_size - 1] = 0;
545 #endif
546 }
547 else
548 memcpy (errbuf, msg, msg_size);
549 }
550
551 return msg_size;
552 }
553 #ifdef _LIBC
554 weak_alias (__regerror, regerror)
555 #endif
556
557
558 static void
559 free_dfa_content (re_dfa_t *dfa)
560 {
561 int i, j;
562
563 re_free (dfa->subexps);
564
565 for (i = 0; i < dfa->nodes_len; ++i)
566 {
567 re_token_t *node = dfa->nodes + i;
568 #ifdef RE_ENABLE_I18N
569 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
570 free_charset (node->opr.mbcset);
571 else
572 #endif /* RE_ENABLE_I18N */
573 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
574 re_free (node->opr.sbcset);
575 }
576 re_free (dfa->nexts);
577 for (i = 0; i < dfa->nodes_len; ++i)
578 {
579 if (dfa->eclosures != NULL)
580 re_node_set_free (dfa->eclosures + i);
581 if (dfa->inveclosures != NULL)
582 re_node_set_free (dfa->inveclosures + i);
583 if (dfa->edests != NULL)
584 re_node_set_free (dfa->edests + i);
585 }
586 re_free (dfa->edests);
587 re_free (dfa->eclosures);
588 re_free (dfa->inveclosures);
589 re_free (dfa->nodes);
590
591 for (i = 0; i <= dfa->state_hash_mask; ++i)
592 {
593 struct re_state_table_entry *entry = dfa->state_table + i;
594 for (j = 0; j < entry->num; ++j)
595 {
596 re_dfastate_t *state = entry->array[j];
597 free_state (state);
598 }
599 re_free (entry->array);
600 }
601 re_free (dfa->state_table);
602
603 if (dfa->word_char != NULL)
604 re_free (dfa->word_char);
605 #ifdef DEBUG
606 re_free (dfa->re_str);
607 #endif
608
609 re_free (dfa);
610 }
611
612
613 /* Free dynamically allocated space used by PREG. */
614
615 void
616 regfree (preg)
617 regex_t *preg;
618 {
619 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
620 if (BE (dfa != NULL, 1))
621 free_dfa_content (dfa);
622
623 re_free (preg->fastmap);
624 }
625 #ifdef _LIBC
626 weak_alias (__regfree, regfree)
627 #endif
628 \f
629 /* Entry points compatible with 4.2 BSD regex library. We don't define
630 them unless specifically requested. */
631
632 #if defined _REGEX_RE_COMP || defined _LIBC
633
634 /* BSD has one and only one pattern buffer. */
635 static struct re_pattern_buffer re_comp_buf;
636
637 char *
638 # ifdef _LIBC
639 /* Make these definitions weak in libc, so POSIX programs can redefine
640 these names if they don't use our functions, and still use
641 regcomp/regexec above without link errors. */
642 weak_function
643 # endif
644 re_comp (s)
645 const char *s;
646 {
647 reg_errcode_t ret;
648 char *fastmap;
649
650 if (!s)
651 {
652 if (!re_comp_buf.buffer)
653 return gettext ("No previous regular expression");
654 return 0;
655 }
656
657 if (re_comp_buf.buffer)
658 {
659 fastmap = re_comp_buf.fastmap;
660 re_comp_buf.fastmap = NULL;
661 __regfree (&re_comp_buf);
662 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
663 re_comp_buf.fastmap = fastmap;
664 }
665
666 if (re_comp_buf.fastmap == NULL)
667 {
668 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
669 if (re_comp_buf.fastmap == NULL)
670 return (char *) gettext (__re_error_msgid
671 + __re_error_msgid_idx[(int) REG_ESPACE]);
672 }
673
674 /* Since `re_exec' always passes NULL for the `regs' argument, we
675 don't need to initialize the pattern buffer fields which affect it. */
676
677 /* Match anchors at newlines. */
678 re_comp_buf.newline_anchor = 1;
679
680 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
681
682 if (!ret)
683 return NULL;
684
685 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
686 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
687 }
688
689 #ifdef _LIBC
690 libc_freeres_fn (free_mem)
691 {
692 __regfree (&re_comp_buf);
693 }
694 #endif
695
696 #endif /* _REGEX_RE_COMP */
697 \f
698 /* Internal entry point.
699 Compile the regular expression PATTERN, whose length is LENGTH.
700 SYNTAX indicate regular expression's syntax. */
701
702 static reg_errcode_t
703 re_compile_internal (preg, pattern, length, syntax)
704 regex_t *preg;
705 const char * pattern;
706 int length;
707 reg_syntax_t syntax;
708 {
709 reg_errcode_t err = REG_NOERROR;
710 re_dfa_t *dfa;
711 re_string_t regexp;
712
713 /* Initialize the pattern buffer. */
714 preg->fastmap_accurate = 0;
715 preg->syntax = syntax;
716 preg->not_bol = preg->not_eol = 0;
717 preg->used = 0;
718 preg->re_nsub = 0;
719 preg->can_be_null = 0;
720 preg->regs_allocated = REGS_UNALLOCATED;
721
722 /* Initialize the dfa. */
723 dfa = (re_dfa_t *) preg->buffer;
724 if (preg->allocated < sizeof (re_dfa_t))
725 {
726 /* If zero allocated, but buffer is non-null, try to realloc
727 enough space. This loses if buffer's address is bogus, but
728 that is the user's responsibility. If ->buffer is NULL this
729 is a simple allocation. */
730 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
731 if (dfa == NULL)
732 return REG_ESPACE;
733 preg->allocated = sizeof (re_dfa_t);
734 }
735 preg->buffer = (unsigned char *) dfa;
736 preg->used = sizeof (re_dfa_t);
737
738 err = init_dfa (dfa, length);
739 if (BE (err != REG_NOERROR, 0))
740 {
741 re_free (dfa);
742 preg->buffer = NULL;
743 preg->allocated = 0;
744 return err;
745 }
746 #ifdef DEBUG
747 dfa->re_str = re_malloc (char, length + 1);
748 strncpy (dfa->re_str, pattern, length + 1);
749 #endif
750
751 err = re_string_construct (&regexp, pattern, length, preg->translate,
752 syntax & RE_ICASE, dfa);
753 if (BE (err != REG_NOERROR, 0))
754 {
755 re_free (dfa);
756 preg->buffer = NULL;
757 preg->allocated = 0;
758 return err;
759 }
760
761 /* Parse the regular expression, and build a structure tree. */
762 preg->re_nsub = 0;
763 dfa->str_tree = parse (&regexp, preg, syntax, &err);
764 if (BE (dfa->str_tree == NULL, 0))
765 goto re_compile_internal_free_return;
766
767 #ifdef RE_ENABLE_I18N
768 /* If possible, do searching in single byte encoding to speed things up. */
769 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
770 optimize_utf8 (dfa);
771 #endif
772
773 /* Analyze the tree and collect information which is necessary to
774 create the dfa. */
775 err = analyze (dfa);
776 if (BE (err != REG_NOERROR, 0))
777 goto re_compile_internal_free_return;
778
779 /* Then create the initial state of the dfa. */
780 err = create_initial_state (dfa);
781
782 /* Release work areas. */
783 free_workarea_compile (preg);
784 re_string_destruct (&regexp);
785
786 if (BE (err != REG_NOERROR, 0))
787 {
788 re_compile_internal_free_return:
789 free_dfa_content (dfa);
790 preg->buffer = NULL;
791 preg->allocated = 0;
792 }
793
794 return err;
795 }
796
797 /* Initialize DFA. We use the length of the regular expression PAT_LEN
798 as the initial length of some arrays. */
799
800 static reg_errcode_t
801 init_dfa (dfa, pat_len)
802 re_dfa_t *dfa;
803 int pat_len;
804 {
805 int table_size;
806
807 memset (dfa, '\0', sizeof (re_dfa_t));
808
809 dfa->nodes_alloc = pat_len + 1;
810 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
811
812 dfa->states_alloc = pat_len + 1;
813
814 /* table_size = 2 ^ ceil(log pat_len) */
815 for (table_size = 1; table_size > 0; table_size <<= 1)
816 if (table_size > pat_len)
817 break;
818
819 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
820 dfa->state_hash_mask = table_size - 1;
821
822 dfa->subexps_alloc = 1;
823 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
824 dfa->word_char = NULL;
825
826 dfa->mb_cur_max = MB_CUR_MAX;
827 #ifdef _LIBC
828 if (dfa->mb_cur_max > 1
829 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
830 dfa->is_utf8 = 1;
831 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
832 != 0);
833 #endif
834
835 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
836 || dfa->subexps == NULL, 0))
837 {
838 /* We don't bother to free anything which was allocated. Very
839 soon the process will go down anyway. */
840 dfa->subexps = NULL;
841 dfa->state_table = NULL;
842 dfa->nodes = NULL;
843 return REG_ESPACE;
844 }
845 return REG_NOERROR;
846 }
847
848 /* Initialize WORD_CHAR table, which indicate which character is
849 "word". In this case "word" means that it is the word construction
850 character used by some operators like "\<", "\>", etc. */
851
852 static reg_errcode_t
853 init_word_char (dfa)
854 re_dfa_t *dfa;
855 {
856 int i, j, ch;
857 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
858 if (BE (dfa->word_char == NULL, 0))
859 return REG_ESPACE;
860 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
861 for (j = 0; j < UINT_BITS; ++j, ++ch)
862 if (isalnum (ch) || ch == '_')
863 dfa->word_char[i] |= 1 << j;
864 return REG_NOERROR;
865 }
866
867 /* Free the work area which are only used while compiling. */
868
869 static void
870 free_workarea_compile (preg)
871 regex_t *preg;
872 {
873 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
874 free_bin_tree (dfa->str_tree);
875 dfa->str_tree = NULL;
876 re_free (dfa->org_indices);
877 dfa->org_indices = NULL;
878 }
879
880 /* Create initial states for all contexts. */
881
882 static reg_errcode_t
883 create_initial_state (dfa)
884 re_dfa_t *dfa;
885 {
886 int first, i;
887 reg_errcode_t err;
888 re_node_set init_nodes;
889
890 /* Initial states have the epsilon closure of the node which is
891 the first node of the regular expression. */
892 first = dfa->str_tree->first;
893 dfa->init_node = first;
894 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
895 if (BE (err != REG_NOERROR, 0))
896 return err;
897
898 /* The back-references which are in initial states can epsilon transit,
899 since in this case all of the subexpressions can be null.
900 Then we add epsilon closures of the nodes which are the next nodes of
901 the back-references. */
902 if (dfa->nbackref > 0)
903 for (i = 0; i < init_nodes.nelem; ++i)
904 {
905 int node_idx = init_nodes.elems[i];
906 re_token_type_t type = dfa->nodes[node_idx].type;
907
908 int clexp_idx;
909 if (type != OP_BACK_REF)
910 continue;
911 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
912 {
913 re_token_t *clexp_node;
914 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
915 if (clexp_node->type == OP_CLOSE_SUBEXP
916 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
917 break;
918 }
919 if (clexp_idx == init_nodes.nelem)
920 continue;
921
922 if (type == OP_BACK_REF)
923 {
924 int dest_idx = dfa->edests[node_idx].elems[0];
925 if (!re_node_set_contains (&init_nodes, dest_idx))
926 {
927 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
928 i = 0;
929 }
930 }
931 }
932
933 /* It must be the first time to invoke acquire_state. */
934 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
935 /* We don't check ERR here, since the initial state must not be NULL. */
936 if (BE (dfa->init_state == NULL, 0))
937 return err;
938 if (dfa->init_state->has_constraint)
939 {
940 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
941 CONTEXT_WORD);
942 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
943 CONTEXT_NEWLINE);
944 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
945 &init_nodes,
946 CONTEXT_NEWLINE
947 | CONTEXT_BEGBUF);
948 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
949 || dfa->init_state_begbuf == NULL, 0))
950 return err;
951 }
952 else
953 dfa->init_state_word = dfa->init_state_nl
954 = dfa->init_state_begbuf = dfa->init_state;
955
956 re_node_set_free (&init_nodes);
957 return REG_NOERROR;
958 }
959 \f
960 #ifdef RE_ENABLE_I18N
961 /* If it is possible to do searching in single byte encoding instead of UTF-8
962 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
963 DFA nodes where needed. */
964
965 static void
966 optimize_utf8 (dfa)
967 re_dfa_t *dfa;
968 {
969 int node, i, mb_chars = 0, has_period = 0;
970
971 for (node = 0; node < dfa->nodes_len; ++node)
972 switch (dfa->nodes[node].type)
973 {
974 case CHARACTER:
975 if (dfa->nodes[node].opr.c >= 0x80)
976 mb_chars = 1;
977 break;
978 case ANCHOR:
979 switch (dfa->nodes[node].opr.idx)
980 {
981 case LINE_FIRST:
982 case LINE_LAST:
983 case BUF_FIRST:
984 case BUF_LAST:
985 break;
986 default:
987 /* Word anchors etc. cannot be handled. */
988 return;
989 }
990 break;
991 case OP_PERIOD:
992 has_period = 1;
993 break;
994 case OP_BACK_REF:
995 case OP_ALT:
996 case END_OF_RE:
997 case OP_DUP_ASTERISK:
998 case OP_DUP_QUESTION:
999 case OP_DUP_PLUS:
1000 case OP_OPEN_SUBEXP:
1001 case OP_CLOSE_SUBEXP:
1002 break;
1003 case SIMPLE_BRACKET:
1004 /* Just double check. */
1005 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1006 if (dfa->nodes[node].opr.sbcset[i])
1007 return;
1008 break;
1009 default:
1010 return;
1011 }
1012
1013 if (mb_chars || has_period)
1014 for (node = 0; node < dfa->nodes_len; ++node)
1015 {
1016 if (dfa->nodes[node].type == CHARACTER
1017 && dfa->nodes[node].opr.c >= 0x80)
1018 dfa->nodes[node].mb_partial = 0;
1019 else if (dfa->nodes[node].type == OP_PERIOD)
1020 dfa->nodes[node].type = OP_UTF8_PERIOD;
1021 }
1022
1023 /* The search can be in single byte locale. */
1024 dfa->mb_cur_max = 1;
1025 dfa->is_utf8 = 0;
1026 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1027 }
1028 #endif
1029 \f
1030 /* Analyze the structure tree, and calculate "first", "next", "edest",
1031 "eclosure", and "inveclosure". */
1032
1033 static reg_errcode_t
1034 analyze (dfa)
1035 re_dfa_t *dfa;
1036 {
1037 int i;
1038 reg_errcode_t ret;
1039
1040 /* Allocate arrays. */
1041 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1042 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1043 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1044 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1045 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1046 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1047 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1048 return REG_ESPACE;
1049 /* Initialize them. */
1050 for (i = 0; i < dfa->nodes_len; ++i)
1051 {
1052 dfa->nexts[i] = -1;
1053 re_node_set_init_empty (dfa->edests + i);
1054 re_node_set_init_empty (dfa->eclosures + i);
1055 re_node_set_init_empty (dfa->inveclosures + i);
1056 }
1057
1058 ret = analyze_tree (dfa, dfa->str_tree);
1059 if (BE (ret == REG_NOERROR, 1))
1060 {
1061 ret = calc_eclosure (dfa);
1062 if (ret == REG_NOERROR)
1063 calc_inveclosure (dfa);
1064 }
1065 return ret;
1066 }
1067
1068 /* Helper functions for analyze.
1069 This function calculate "first", "next", and "edest" for the subtree
1070 whose root is NODE. */
1071
1072 static reg_errcode_t
1073 analyze_tree (dfa, node)
1074 re_dfa_t *dfa;
1075 bin_tree_t *node;
1076 {
1077 reg_errcode_t ret;
1078 if (node->first == -1)
1079 calc_first (dfa, node);
1080 if (node->next == -1)
1081 calc_next (dfa, node);
1082 if (node->eclosure.nelem == 0)
1083 calc_epsdest (dfa, node);
1084 /* Calculate "first" etc. for the left child. */
1085 if (node->left != NULL)
1086 {
1087 ret = analyze_tree (dfa, node->left);
1088 if (BE (ret != REG_NOERROR, 0))
1089 return ret;
1090 }
1091 /* Calculate "first" etc. for the right child. */
1092 if (node->right != NULL)
1093 {
1094 ret = analyze_tree (dfa, node->right);
1095 if (BE (ret != REG_NOERROR, 0))
1096 return ret;
1097 }
1098 return REG_NOERROR;
1099 }
1100
1101 /* Calculate "first" for the node NODE. */
1102 static void
1103 calc_first (dfa, node)
1104 re_dfa_t *dfa;
1105 bin_tree_t *node;
1106 {
1107 int idx, type;
1108 idx = node->node_idx;
1109 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1110
1111 switch (type)
1112 {
1113 #ifdef DEBUG
1114 case OP_OPEN_BRACKET:
1115 case OP_CLOSE_BRACKET:
1116 case OP_OPEN_DUP_NUM:
1117 case OP_CLOSE_DUP_NUM:
1118 case OP_NON_MATCH_LIST:
1119 case OP_OPEN_COLL_ELEM:
1120 case OP_CLOSE_COLL_ELEM:
1121 case OP_OPEN_EQUIV_CLASS:
1122 case OP_CLOSE_EQUIV_CLASS:
1123 case OP_OPEN_CHAR_CLASS:
1124 case OP_CLOSE_CHAR_CLASS:
1125 /* These must not be appeared here. */
1126 assert (0);
1127 #endif
1128 case END_OF_RE:
1129 case CHARACTER:
1130 case OP_PERIOD:
1131 case OP_DUP_ASTERISK:
1132 case OP_DUP_QUESTION:
1133 #ifdef RE_ENABLE_I18N
1134 case OP_UTF8_PERIOD:
1135 case COMPLEX_BRACKET:
1136 #endif /* RE_ENABLE_I18N */
1137 case SIMPLE_BRACKET:
1138 case OP_BACK_REF:
1139 case ANCHOR:
1140 case OP_OPEN_SUBEXP:
1141 case OP_CLOSE_SUBEXP:
1142 node->first = idx;
1143 break;
1144 case OP_DUP_PLUS:
1145 #ifdef DEBUG
1146 assert (node->left != NULL);
1147 #endif
1148 if (node->left->first == -1)
1149 calc_first (dfa, node->left);
1150 node->first = node->left->first;
1151 break;
1152 case OP_ALT:
1153 node->first = idx;
1154 break;
1155 /* else fall through */
1156 default:
1157 #ifdef DEBUG
1158 assert (node->left != NULL);
1159 #endif
1160 if (node->left->first == -1)
1161 calc_first (dfa, node->left);
1162 node->first = node->left->first;
1163 break;
1164 }
1165 }
1166
1167 /* Calculate "next" for the node NODE. */
1168
1169 static void
1170 calc_next (dfa, node)
1171 re_dfa_t *dfa;
1172 bin_tree_t *node;
1173 {
1174 int idx, type;
1175 bin_tree_t *parent = node->parent;
1176 if (parent == NULL)
1177 {
1178 node->next = -1;
1179 idx = node->node_idx;
1180 if (node->type == 0)
1181 dfa->nexts[idx] = node->next;
1182 return;
1183 }
1184
1185 idx = parent->node_idx;
1186 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1187
1188 switch (type)
1189 {
1190 case OP_DUP_ASTERISK:
1191 case OP_DUP_PLUS:
1192 node->next = idx;
1193 break;
1194 case CONCAT:
1195 if (parent->left == node)
1196 {
1197 if (parent->right->first == -1)
1198 calc_first (dfa, parent->right);
1199 node->next = parent->right->first;
1200 break;
1201 }
1202 /* else fall through */
1203 default:
1204 if (parent->next == -1)
1205 calc_next (dfa, parent);
1206 node->next = parent->next;
1207 break;
1208 }
1209 idx = node->node_idx;
1210 if (node->type == 0)
1211 dfa->nexts[idx] = node->next;
1212 }
1213
1214 /* Calculate "edest" for the node NODE. */
1215
1216 static void
1217 calc_epsdest (dfa, node)
1218 re_dfa_t *dfa;
1219 bin_tree_t *node;
1220 {
1221 int idx;
1222 idx = node->node_idx;
1223 if (node->type == 0)
1224 {
1225 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1226 || dfa->nodes[idx].type == OP_DUP_PLUS
1227 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1228 {
1229 if (node->left->first == -1)
1230 calc_first (dfa, node->left);
1231 if (node->next == -1)
1232 calc_next (dfa, node);
1233 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1234 node->next);
1235 }
1236 else if (dfa->nodes[idx].type == OP_ALT)
1237 {
1238 int left, right;
1239 if (node->left != NULL)
1240 {
1241 if (node->left->first == -1)
1242 calc_first (dfa, node->left);
1243 left = node->left->first;
1244 }
1245 else
1246 {
1247 if (node->next == -1)
1248 calc_next (dfa, node);
1249 left = node->next;
1250 }
1251 if (node->right != NULL)
1252 {
1253 if (node->right->first == -1)
1254 calc_first (dfa, node->right);
1255 right = node->right->first;
1256 }
1257 else
1258 {
1259 if (node->next == -1)
1260 calc_next (dfa, node);
1261 right = node->next;
1262 }
1263 re_node_set_init_2 (dfa->edests + idx, left, right);
1264 }
1265 else if (dfa->nodes[idx].type == ANCHOR
1266 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1267 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1268 || dfa->nodes[idx].type == OP_BACK_REF)
1269 re_node_set_init_1 (dfa->edests + idx, node->next);
1270 else
1271 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1272 }
1273 }
1274
1275 /* Duplicate the epsilon closure of the node ROOT_NODE.
1276 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1277 to their own constraint. */
1278
1279 static reg_errcode_t
1280 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1281 init_constraint)
1282 re_dfa_t *dfa;
1283 int top_org_node, top_clone_node, root_node;
1284 unsigned int init_constraint;
1285 {
1286 reg_errcode_t err;
1287 int org_node, clone_node, ret;
1288 unsigned int constraint = init_constraint;
1289 for (org_node = top_org_node, clone_node = top_clone_node;;)
1290 {
1291 int org_dest, clone_dest;
1292 if (dfa->nodes[org_node].type == OP_BACK_REF)
1293 {
1294 /* If the back reference epsilon-transit, its destination must
1295 also have the constraint. Then duplicate the epsilon closure
1296 of the destination of the back reference, and store it in
1297 edests of the back reference. */
1298 org_dest = dfa->nexts[org_node];
1299 re_node_set_empty (dfa->edests + clone_node);
1300 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1301 if (BE (err != REG_NOERROR, 0))
1302 return err;
1303 dfa->nexts[clone_node] = dfa->nexts[org_node];
1304 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1305 if (BE (ret < 0, 0))
1306 return REG_ESPACE;
1307 }
1308 else if (dfa->edests[org_node].nelem == 0)
1309 {
1310 /* In case of the node can't epsilon-transit, don't duplicate the
1311 destination and store the original destination as the
1312 destination of the node. */
1313 dfa->nexts[clone_node] = dfa->nexts[org_node];
1314 break;
1315 }
1316 else if (dfa->edests[org_node].nelem == 1)
1317 {
1318 /* In case of the node can epsilon-transit, and it has only one
1319 destination. */
1320 org_dest = dfa->edests[org_node].elems[0];
1321 re_node_set_empty (dfa->edests + clone_node);
1322 if (dfa->nodes[org_node].type == ANCHOR)
1323 {
1324 /* In case of the node has another constraint, append it. */
1325 if (org_node == root_node && clone_node != org_node)
1326 {
1327 /* ...but if the node is root_node itself, it means the
1328 epsilon closure have a loop, then tie it to the
1329 destination of the root_node. */
1330 ret = re_node_set_insert (dfa->edests + clone_node,
1331 org_dest);
1332 if (BE (ret < 0, 0))
1333 return REG_ESPACE;
1334 break;
1335 }
1336 constraint |= dfa->nodes[org_node].opr.ctx_type;
1337 }
1338 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1339 if (BE (err != REG_NOERROR, 0))
1340 return err;
1341 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1342 if (BE (ret < 0, 0))
1343 return REG_ESPACE;
1344 }
1345 else /* dfa->edests[org_node].nelem == 2 */
1346 {
1347 /* In case of the node can epsilon-transit, and it has two
1348 destinations. E.g. '|', '*', '+', '?'. */
1349 org_dest = dfa->edests[org_node].elems[0];
1350 re_node_set_empty (dfa->edests + clone_node);
1351 /* Search for a duplicated node which satisfies the constraint. */
1352 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1353 if (clone_dest == -1)
1354 {
1355 /* There are no such a duplicated node, create a new one. */
1356 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1357 if (BE (err != REG_NOERROR, 0))
1358 return err;
1359 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1360 if (BE (ret < 0, 0))
1361 return REG_ESPACE;
1362 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1363 root_node, constraint);
1364 if (BE (err != REG_NOERROR, 0))
1365 return err;
1366 }
1367 else
1368 {
1369 /* There are a duplicated node which satisfy the constraint,
1370 use it to avoid infinite loop. */
1371 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1372 if (BE (ret < 0, 0))
1373 return REG_ESPACE;
1374 }
1375
1376 org_dest = dfa->edests[org_node].elems[1];
1377 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1378 if (BE (err != REG_NOERROR, 0))
1379 return err;
1380 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1381 if (BE (ret < 0, 0))
1382 return REG_ESPACE;
1383 }
1384 org_node = org_dest;
1385 clone_node = clone_dest;
1386 }
1387 return REG_NOERROR;
1388 }
1389
1390 /* Search for a node which is duplicated from the node ORG_NODE, and
1391 satisfies the constraint CONSTRAINT. */
1392
1393 static int
1394 search_duplicated_node (dfa, org_node, constraint)
1395 re_dfa_t *dfa;
1396 int org_node;
1397 unsigned int constraint;
1398 {
1399 int idx;
1400 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1401 {
1402 if (org_node == dfa->org_indices[idx]
1403 && constraint == dfa->nodes[idx].constraint)
1404 return idx; /* Found. */
1405 }
1406 return -1; /* Not found. */
1407 }
1408
1409 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1410 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1411 otherwise return the error code. */
1412
1413 static reg_errcode_t
1414 duplicate_node (new_idx, dfa, org_idx, constraint)
1415 re_dfa_t *dfa;
1416 int *new_idx, org_idx;
1417 unsigned int constraint;
1418 {
1419 re_token_t dup;
1420 int dup_idx;
1421
1422 dup = dfa->nodes[org_idx];
1423 dup_idx = re_dfa_add_node (dfa, dup, 1);
1424 if (BE (dup_idx == -1, 0))
1425 return REG_ESPACE;
1426 dfa->nodes[dup_idx].constraint = constraint;
1427 if (dfa->nodes[org_idx].type == ANCHOR)
1428 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1429 dfa->nodes[dup_idx].duplicated = 1;
1430 re_node_set_init_empty (dfa->edests + dup_idx);
1431 re_node_set_init_empty (dfa->eclosures + dup_idx);
1432 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1433
1434 /* Store the index of the original node. */
1435 dfa->org_indices[dup_idx] = org_idx;
1436 *new_idx = dup_idx;
1437 return REG_NOERROR;
1438 }
1439
1440 static void
1441 calc_inveclosure (dfa)
1442 re_dfa_t *dfa;
1443 {
1444 int src, idx, dest;
1445 for (src = 0; src < dfa->nodes_len; ++src)
1446 {
1447 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1448 {
1449 dest = dfa->eclosures[src].elems[idx];
1450 re_node_set_insert (dfa->inveclosures + dest, src);
1451 }
1452 }
1453 }
1454
1455 /* Calculate "eclosure" for all the node in DFA. */
1456
1457 static reg_errcode_t
1458 calc_eclosure (dfa)
1459 re_dfa_t *dfa;
1460 {
1461 int node_idx, incomplete;
1462 #ifdef DEBUG
1463 assert (dfa->nodes_len > 0);
1464 #endif
1465 incomplete = 0;
1466 /* For each nodes, calculate epsilon closure. */
1467 for (node_idx = 0; ; ++node_idx)
1468 {
1469 reg_errcode_t err;
1470 re_node_set eclosure_elem;
1471 if (node_idx == dfa->nodes_len)
1472 {
1473 if (!incomplete)
1474 break;
1475 incomplete = 0;
1476 node_idx = 0;
1477 }
1478
1479 #ifdef DEBUG
1480 assert (dfa->eclosures[node_idx].nelem != -1);
1481 #endif
1482 /* If we have already calculated, skip it. */
1483 if (dfa->eclosures[node_idx].nelem != 0)
1484 continue;
1485 /* Calculate epsilon closure of `node_idx'. */
1486 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1487 if (BE (err != REG_NOERROR, 0))
1488 return err;
1489
1490 if (dfa->eclosures[node_idx].nelem == 0)
1491 {
1492 incomplete = 1;
1493 re_node_set_free (&eclosure_elem);
1494 }
1495 }
1496 return REG_NOERROR;
1497 }
1498
1499 /* Calculate epsilon closure of NODE. */
1500
1501 static reg_errcode_t
1502 calc_eclosure_iter (new_set, dfa, node, root)
1503 re_node_set *new_set;
1504 re_dfa_t *dfa;
1505 int node, root;
1506 {
1507 reg_errcode_t err;
1508 unsigned int constraint;
1509 int i, incomplete;
1510 re_node_set eclosure;
1511 incomplete = 0;
1512 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1513 if (BE (err != REG_NOERROR, 0))
1514 return err;
1515
1516 /* This indicates that we are calculating this node now.
1517 We reference this value to avoid infinite loop. */
1518 dfa->eclosures[node].nelem = -1;
1519
1520 constraint = ((dfa->nodes[node].type == ANCHOR)
1521 ? dfa->nodes[node].opr.ctx_type : 0);
1522 /* If the current node has constraints, duplicate all nodes.
1523 Since they must inherit the constraints. */
1524 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1525 {
1526 int org_node, cur_node;
1527 org_node = cur_node = node;
1528 err = duplicate_node_closure (dfa, node, node, node, constraint);
1529 if (BE (err != REG_NOERROR, 0))
1530 return err;
1531 }
1532
1533 /* Expand each epsilon destination nodes. */
1534 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1535 for (i = 0; i < dfa->edests[node].nelem; ++i)
1536 {
1537 re_node_set eclosure_elem;
1538 int edest = dfa->edests[node].elems[i];
1539 /* If calculating the epsilon closure of `edest' is in progress,
1540 return intermediate result. */
1541 if (dfa->eclosures[edest].nelem == -1)
1542 {
1543 incomplete = 1;
1544 continue;
1545 }
1546 /* If we haven't calculated the epsilon closure of `edest' yet,
1547 calculate now. Otherwise use calculated epsilon closure. */
1548 if (dfa->eclosures[edest].nelem == 0)
1549 {
1550 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1551 if (BE (err != REG_NOERROR, 0))
1552 return err;
1553 }
1554 else
1555 eclosure_elem = dfa->eclosures[edest];
1556 /* Merge the epsilon closure of `edest'. */
1557 re_node_set_merge (&eclosure, &eclosure_elem);
1558 /* If the epsilon closure of `edest' is incomplete,
1559 the epsilon closure of this node is also incomplete. */
1560 if (dfa->eclosures[edest].nelem == 0)
1561 {
1562 incomplete = 1;
1563 re_node_set_free (&eclosure_elem);
1564 }
1565 }
1566
1567 /* Epsilon closures include itself. */
1568 re_node_set_insert (&eclosure, node);
1569 if (incomplete && !root)
1570 dfa->eclosures[node].nelem = 0;
1571 else
1572 dfa->eclosures[node] = eclosure;
1573 *new_set = eclosure;
1574 return REG_NOERROR;
1575 }
1576 \f
1577 /* Functions for token which are used in the parser. */
1578
1579 /* Fetch a token from INPUT.
1580 We must not use this function inside bracket expressions. */
1581
1582 static re_token_t
1583 fetch_token (input, syntax)
1584 re_string_t *input;
1585 reg_syntax_t syntax;
1586 {
1587 re_token_t token;
1588 int consumed_byte;
1589 consumed_byte = peek_token (&token, input, syntax);
1590 re_string_skip_bytes (input, consumed_byte);
1591 return token;
1592 }
1593
1594 /* Peek a token from INPUT, and return the length of the token.
1595 We must not use this function inside bracket expressions. */
1596
1597 static int
1598 peek_token (token, input, syntax)
1599 re_token_t *token;
1600 re_string_t *input;
1601 reg_syntax_t syntax;
1602 {
1603 unsigned char c;
1604
1605 if (re_string_eoi (input))
1606 {
1607 token->type = END_OF_RE;
1608 return 0;
1609 }
1610
1611 c = re_string_peek_byte (input, 0);
1612 token->opr.c = c;
1613
1614 #ifdef RE_ENABLE_I18N
1615 token->mb_partial = 0;
1616 if (input->mb_cur_max > 1 &&
1617 !re_string_first_byte (input, re_string_cur_idx (input)))
1618 {
1619 token->type = CHARACTER;
1620 token->mb_partial = 1;
1621 return 1;
1622 }
1623 #endif
1624 if (c == '\\')
1625 {
1626 unsigned char c2;
1627 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1628 {
1629 token->type = BACK_SLASH;
1630 return 1;
1631 }
1632
1633 c2 = re_string_peek_byte_case (input, 1);
1634 token->opr.c = c2;
1635 token->type = CHARACTER;
1636 switch (c2)
1637 {
1638 case '|':
1639 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1640 token->type = OP_ALT;
1641 break;
1642 case '1': case '2': case '3': case '4': case '5':
1643 case '6': case '7': case '8': case '9':
1644 if (!(syntax & RE_NO_BK_REFS))
1645 {
1646 token->type = OP_BACK_REF;
1647 token->opr.idx = c2 - '0';
1648 }
1649 break;
1650 case '<':
1651 if (!(syntax & RE_NO_GNU_OPS))
1652 {
1653 token->type = ANCHOR;
1654 token->opr.idx = WORD_FIRST;
1655 }
1656 break;
1657 case '>':
1658 if (!(syntax & RE_NO_GNU_OPS))
1659 {
1660 token->type = ANCHOR;
1661 token->opr.idx = WORD_LAST;
1662 }
1663 break;
1664 case 'b':
1665 if (!(syntax & RE_NO_GNU_OPS))
1666 {
1667 token->type = ANCHOR;
1668 token->opr.idx = WORD_DELIM;
1669 }
1670 break;
1671 case 'B':
1672 if (!(syntax & RE_NO_GNU_OPS))
1673 {
1674 token->type = ANCHOR;
1675 token->opr.idx = INSIDE_WORD;
1676 }
1677 break;
1678 case 'w':
1679 if (!(syntax & RE_NO_GNU_OPS))
1680 token->type = OP_WORD;
1681 break;
1682 case 'W':
1683 if (!(syntax & RE_NO_GNU_OPS))
1684 token->type = OP_NOTWORD;
1685 break;
1686 case 's':
1687 if (!(syntax & RE_NO_GNU_OPS))
1688 token->type = OP_SPACE;
1689 break;
1690 case 'S':
1691 if (!(syntax & RE_NO_GNU_OPS))
1692 token->type = OP_NOTSPACE;
1693 break;
1694 case '`':
1695 if (!(syntax & RE_NO_GNU_OPS))
1696 {
1697 token->type = ANCHOR;
1698 token->opr.idx = BUF_FIRST;
1699 }
1700 break;
1701 case '\'':
1702 if (!(syntax & RE_NO_GNU_OPS))
1703 {
1704 token->type = ANCHOR;
1705 token->opr.idx = BUF_LAST;
1706 }
1707 break;
1708 case '(':
1709 if (!(syntax & RE_NO_BK_PARENS))
1710 token->type = OP_OPEN_SUBEXP;
1711 break;
1712 case ')':
1713 if (!(syntax & RE_NO_BK_PARENS))
1714 token->type = OP_CLOSE_SUBEXP;
1715 break;
1716 case '+':
1717 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1718 token->type = OP_DUP_PLUS;
1719 break;
1720 case '?':
1721 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1722 token->type = OP_DUP_QUESTION;
1723 break;
1724 case '{':
1725 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1726 token->type = OP_OPEN_DUP_NUM;
1727 break;
1728 case '}':
1729 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1730 token->type = OP_CLOSE_DUP_NUM;
1731 break;
1732 default:
1733 break;
1734 }
1735 return 2;
1736 }
1737
1738 token->type = CHARACTER;
1739 switch (c)
1740 {
1741 case '\n':
1742 if (syntax & RE_NEWLINE_ALT)
1743 token->type = OP_ALT;
1744 break;
1745 case '|':
1746 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1747 token->type = OP_ALT;
1748 break;
1749 case '*':
1750 token->type = OP_DUP_ASTERISK;
1751 break;
1752 case '+':
1753 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1754 token->type = OP_DUP_PLUS;
1755 break;
1756 case '?':
1757 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1758 token->type = OP_DUP_QUESTION;
1759 break;
1760 case '{':
1761 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1762 token->type = OP_OPEN_DUP_NUM;
1763 break;
1764 case '}':
1765 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1766 token->type = OP_CLOSE_DUP_NUM;
1767 break;
1768 case '(':
1769 if (syntax & RE_NO_BK_PARENS)
1770 token->type = OP_OPEN_SUBEXP;
1771 break;
1772 case ')':
1773 if (syntax & RE_NO_BK_PARENS)
1774 token->type = OP_CLOSE_SUBEXP;
1775 break;
1776 case '[':
1777 token->type = OP_OPEN_BRACKET;
1778 break;
1779 case '.':
1780 token->type = OP_PERIOD;
1781 break;
1782 case '^':
1783 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1784 re_string_cur_idx (input) != 0)
1785 {
1786 char prev = re_string_peek_byte (input, -1);
1787 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1788 break;
1789 }
1790 token->type = ANCHOR;
1791 token->opr.idx = LINE_FIRST;
1792 break;
1793 case '$':
1794 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1795 re_string_cur_idx (input) + 1 != re_string_length (input))
1796 {
1797 re_token_t next;
1798 re_string_skip_bytes (input, 1);
1799 peek_token (&next, input, syntax);
1800 re_string_skip_bytes (input, -1);
1801 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1802 break;
1803 }
1804 token->type = ANCHOR;
1805 token->opr.idx = LINE_LAST;
1806 break;
1807 default:
1808 break;
1809 }
1810 return 1;
1811 }
1812
1813 /* Peek a token from INPUT, and return the length of the token.
1814 We must not use this function out of bracket expressions. */
1815
1816 static int
1817 peek_token_bracket (token, input, syntax)
1818 re_token_t *token;
1819 re_string_t *input;
1820 reg_syntax_t syntax;
1821 {
1822 unsigned char c;
1823 if (re_string_eoi (input))
1824 {
1825 token->type = END_OF_RE;
1826 return 0;
1827 }
1828 c = re_string_peek_byte (input, 0);
1829 token->opr.c = c;
1830
1831 #ifdef RE_ENABLE_I18N
1832 if (input->mb_cur_max > 1 &&
1833 !re_string_first_byte (input, re_string_cur_idx (input)))
1834 {
1835 token->type = CHARACTER;
1836 return 1;
1837 }
1838 #endif /* RE_ENABLE_I18N */
1839
1840 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1841 {
1842 /* In this case, '\' escape a character. */
1843 unsigned char c2;
1844 re_string_skip_bytes (input, 1);
1845 c2 = re_string_peek_byte (input, 0);
1846 token->opr.c = c2;
1847 token->type = CHARACTER;
1848 return 1;
1849 }
1850 if (c == '[') /* '[' is a special char in a bracket exps. */
1851 {
1852 unsigned char c2;
1853 int token_len;
1854 c2 = re_string_peek_byte (input, 1);
1855 token->opr.c = c2;
1856 token_len = 2;
1857 switch (c2)
1858 {
1859 case '.':
1860 token->type = OP_OPEN_COLL_ELEM;
1861 break;
1862 case '=':
1863 token->type = OP_OPEN_EQUIV_CLASS;
1864 break;
1865 case ':':
1866 if (syntax & RE_CHAR_CLASSES)
1867 {
1868 token->type = OP_OPEN_CHAR_CLASS;
1869 break;
1870 }
1871 /* else fall through. */
1872 default:
1873 token->type = CHARACTER;
1874 token->opr.c = c;
1875 token_len = 1;
1876 break;
1877 }
1878 return token_len;
1879 }
1880 switch (c)
1881 {
1882 case '-':
1883 token->type = OP_CHARSET_RANGE;
1884 break;
1885 case ']':
1886 token->type = OP_CLOSE_BRACKET;
1887 break;
1888 case '^':
1889 token->type = OP_NON_MATCH_LIST;
1890 break;
1891 default:
1892 token->type = CHARACTER;
1893 }
1894 return 1;
1895 }
1896 \f
1897 /* Functions for parser. */
1898
1899 /* Entry point of the parser.
1900 Parse the regular expression REGEXP and return the structure tree.
1901 If an error is occured, ERR is set by error code, and return NULL.
1902 This function build the following tree, from regular expression <reg_exp>:
1903 CAT
1904 / \
1905 / \
1906 <reg_exp> EOR
1907
1908 CAT means concatenation.
1909 EOR means end of regular expression. */
1910
1911 static bin_tree_t *
1912 parse (regexp, preg, syntax, err)
1913 re_string_t *regexp;
1914 regex_t *preg;
1915 reg_syntax_t syntax;
1916 reg_errcode_t *err;
1917 {
1918 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1919 bin_tree_t *tree, *eor, *root;
1920 re_token_t current_token;
1921 int new_idx;
1922 current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1923 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1924 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1925 return NULL;
1926 new_idx = re_dfa_add_node (dfa, current_token, 0);
1927 eor = create_tree (NULL, NULL, 0, new_idx);
1928 if (tree != NULL)
1929 root = create_tree (tree, eor, CONCAT, 0);
1930 else
1931 root = eor;
1932 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1933 {
1934 *err = REG_ESPACE;
1935 return NULL;
1936 }
1937 return root;
1938 }
1939
1940 /* This function build the following tree, from regular expression
1941 <branch1>|<branch2>:
1942 ALT
1943 / \
1944 / \
1945 <branch1> <branch2>
1946
1947 ALT means alternative, which represents the operator `|'. */
1948
1949 static bin_tree_t *
1950 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1951 re_string_t *regexp;
1952 regex_t *preg;
1953 re_token_t *token;
1954 reg_syntax_t syntax;
1955 int nest;
1956 reg_errcode_t *err;
1957 {
1958 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1959 bin_tree_t *tree, *branch = NULL;
1960 int new_idx;
1961 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1962 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1963 return NULL;
1964
1965 while (token->type == OP_ALT)
1966 {
1967 re_token_t alt_token = *token;
1968 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1969 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1970 if (token->type != OP_ALT && token->type != END_OF_RE
1971 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1972 {
1973 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1974 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1975 {
1976 free_bin_tree (tree);
1977 return NULL;
1978 }
1979 }
1980 else
1981 branch = NULL;
1982 tree = create_tree (tree, branch, 0, new_idx);
1983 if (BE (new_idx == -1 || tree == NULL, 0))
1984 {
1985 *err = REG_ESPACE;
1986 return NULL;
1987 }
1988 dfa->has_plural_match = 1;
1989 }
1990 return tree;
1991 }
1992
1993 /* This function build the following tree, from regular expression
1994 <exp1><exp2>:
1995 CAT
1996 / \
1997 / \
1998 <exp1> <exp2>
1999
2000 CAT means concatenation. */
2001
2002 static bin_tree_t *
2003 parse_branch (regexp, preg, token, syntax, nest, err)
2004 re_string_t *regexp;
2005 regex_t *preg;
2006 re_token_t *token;
2007 reg_syntax_t syntax;
2008 int nest;
2009 reg_errcode_t *err;
2010 {
2011 bin_tree_t *tree, *exp;
2012 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2013 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2014 return NULL;
2015
2016 while (token->type != OP_ALT && token->type != END_OF_RE
2017 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2018 {
2019 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2020 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2021 {
2022 free_bin_tree (tree);
2023 return NULL;
2024 }
2025 if (tree != NULL && exp != NULL)
2026 {
2027 tree = create_tree (tree, exp, CONCAT, 0);
2028 if (tree == NULL)
2029 {
2030 *err = REG_ESPACE;
2031 return NULL;
2032 }
2033 }
2034 else if (tree == NULL)
2035 tree = exp;
2036 /* Otherwise exp == NULL, we don't need to create new tree. */
2037 }
2038 return tree;
2039 }
2040
2041 /* This function build the following tree, from regular expression a*:
2042 *
2043 |
2044 a
2045 */
2046
2047 static bin_tree_t *
2048 parse_expression (regexp, preg, token, syntax, nest, err)
2049 re_string_t *regexp;
2050 regex_t *preg;
2051 re_token_t *token;
2052 reg_syntax_t syntax;
2053 int nest;
2054 reg_errcode_t *err;
2055 {
2056 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2057 bin_tree_t *tree;
2058 int new_idx;
2059 switch (token->type)
2060 {
2061 case CHARACTER:
2062 new_idx = re_dfa_add_node (dfa, *token, 0);
2063 tree = create_tree (NULL, NULL, 0, new_idx);
2064 if (BE (new_idx == -1 || tree == NULL, 0))
2065 {
2066 *err = REG_ESPACE;
2067 return NULL;
2068 }
2069 #ifdef RE_ENABLE_I18N
2070 if (dfa->mb_cur_max > 1)
2071 {
2072 while (!re_string_eoi (regexp)
2073 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2074 {
2075 bin_tree_t *mbc_remain;
2076 *token = fetch_token (regexp, syntax);
2077 new_idx = re_dfa_add_node (dfa, *token, 0);
2078 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
2079 tree = create_tree (tree, mbc_remain, CONCAT, 0);
2080 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
2081 {
2082 *err = REG_ESPACE;
2083 return NULL;
2084 }
2085 }
2086 }
2087 #endif
2088 break;
2089 case OP_OPEN_SUBEXP:
2090 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2091 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2092 return NULL;
2093 break;
2094 case OP_OPEN_BRACKET:
2095 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2096 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2097 return NULL;
2098 break;
2099 case OP_BACK_REF:
2100 if (BE (preg->re_nsub < token->opr.idx
2101 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2102 {
2103 *err = REG_ESUBREG;
2104 return NULL;
2105 }
2106 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2107 new_idx = re_dfa_add_node (dfa, *token, 0);
2108 tree = create_tree (NULL, NULL, 0, new_idx);
2109 if (BE (new_idx == -1 || tree == NULL, 0))
2110 {
2111 *err = REG_ESPACE;
2112 return NULL;
2113 }
2114 ++dfa->nbackref;
2115 dfa->has_mb_node = 1;
2116 break;
2117 case OP_OPEN_DUP_NUM:
2118 if (syntax & RE_CONTEXT_INVALID_DUP)
2119 {
2120 *err = REG_BADRPT;
2121 return NULL;
2122 }
2123 /* FALLTHROUGH */
2124 case OP_DUP_ASTERISK:
2125 case OP_DUP_PLUS:
2126 case OP_DUP_QUESTION:
2127 if (syntax & RE_CONTEXT_INVALID_OPS)
2128 {
2129 *err = REG_BADRPT;
2130 return NULL;
2131 }
2132 else if (syntax & RE_CONTEXT_INDEP_OPS)
2133 {
2134 *token = fetch_token (regexp, syntax);
2135 return parse_expression (regexp, preg, token, syntax, nest, err);
2136 }
2137 /* else fall through */
2138 case OP_CLOSE_SUBEXP:
2139 if ((token->type == OP_CLOSE_SUBEXP) &&
2140 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2141 {
2142 *err = REG_ERPAREN;
2143 return NULL;
2144 }
2145 /* else fall through */
2146 case OP_CLOSE_DUP_NUM:
2147 /* We treat it as a normal character. */
2148
2149 /* Then we can these characters as normal characters. */
2150 token->type = CHARACTER;
2151 new_idx = re_dfa_add_node (dfa, *token, 0);
2152 tree = create_tree (NULL, NULL, 0, new_idx);
2153 if (BE (new_idx == -1 || tree == NULL, 0))
2154 {
2155 *err = REG_ESPACE;
2156 return NULL;
2157 }
2158 break;
2159 case ANCHOR:
2160 if (dfa->word_char == NULL)
2161 {
2162 *err = init_word_char (dfa);
2163 if (BE (*err != REG_NOERROR, 0))
2164 return NULL;
2165 }
2166 if (token->opr.ctx_type == WORD_DELIM)
2167 {
2168 bin_tree_t *tree_first, *tree_last;
2169 int idx_first, idx_last;
2170 token->opr.ctx_type = WORD_FIRST;
2171 idx_first = re_dfa_add_node (dfa, *token, 0);
2172 tree_first = create_tree (NULL, NULL, 0, idx_first);
2173 token->opr.ctx_type = WORD_LAST;
2174 idx_last = re_dfa_add_node (dfa, *token, 0);
2175 tree_last = create_tree (NULL, NULL, 0, idx_last);
2176 token->type = OP_ALT;
2177 new_idx = re_dfa_add_node (dfa, *token, 0);
2178 tree = create_tree (tree_first, tree_last, 0, new_idx);
2179 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2180 || tree_first == NULL || tree_last == NULL
2181 || tree == NULL, 0))
2182 {
2183 *err = REG_ESPACE;
2184 return NULL;
2185 }
2186 }
2187 else
2188 {
2189 new_idx = re_dfa_add_node (dfa, *token, 0);
2190 tree = create_tree (NULL, NULL, 0, new_idx);
2191 if (BE (new_idx == -1 || tree == NULL, 0))
2192 {
2193 *err = REG_ESPACE;
2194 return NULL;
2195 }
2196 }
2197 /* We must return here, since ANCHORs can't be followed
2198 by repetition operators.
2199 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2200 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2201 *token = fetch_token (regexp, syntax);
2202 return tree;
2203 case OP_PERIOD:
2204 new_idx = re_dfa_add_node (dfa, *token, 0);
2205 tree = create_tree (NULL, NULL, 0, new_idx);
2206 if (BE (new_idx == -1 || tree == NULL, 0))
2207 {
2208 *err = REG_ESPACE;
2209 return NULL;
2210 }
2211 if (dfa->mb_cur_max > 1)
2212 dfa->has_mb_node = 1;
2213 break;
2214 case OP_WORD:
2215 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
2216 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2217 return NULL;
2218 break;
2219 case OP_NOTWORD:
2220 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2221 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2222 return NULL;
2223 break;
2224 case OP_SPACE:
2225 tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2226 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2227 return NULL;
2228 break;
2229 case OP_NOTSPACE:
2230 tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
2231 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2232 return NULL;
2233 break;
2234 case OP_ALT:
2235 case END_OF_RE:
2236 return NULL;
2237 case BACK_SLASH:
2238 *err = REG_EESCAPE;
2239 return NULL;
2240 default:
2241 /* Must not happen? */
2242 #ifdef DEBUG
2243 assert (0);
2244 #endif
2245 return NULL;
2246 }
2247 *token = fetch_token (regexp, syntax);
2248
2249 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2250 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2251 {
2252 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2253 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2254 return NULL;
2255 /* In BRE consecutive duplications are not allowed. */
2256 if ((syntax & RE_CONTEXT_INVALID_DUP)
2257 && (token->type == OP_DUP_ASTERISK
2258 || token->type == OP_OPEN_DUP_NUM))
2259 {
2260 *err = REG_BADRPT;
2261 return NULL;
2262 }
2263 dfa->has_plural_match = 1;
2264 }
2265
2266 return tree;
2267 }
2268
2269 /* This function build the following tree, from regular expression
2270 (<reg_exp>):
2271 SUBEXP
2272 |
2273 <reg_exp>
2274 */
2275
2276 static bin_tree_t *
2277 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2278 re_string_t *regexp;
2279 regex_t *preg;
2280 re_token_t *token;
2281 reg_syntax_t syntax;
2282 int nest;
2283 reg_errcode_t *err;
2284 {
2285 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2286 bin_tree_t *tree, *left_par, *right_par;
2287 size_t cur_nsub;
2288 int new_idx;
2289 cur_nsub = preg->re_nsub++;
2290 if (dfa->subexps_alloc < preg->re_nsub)
2291 {
2292 re_subexp_t *new_array;
2293 dfa->subexps_alloc *= 2;
2294 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2295 if (BE (new_array == NULL, 0))
2296 {
2297 dfa->subexps_alloc /= 2;
2298 *err = REG_ESPACE;
2299 return NULL;
2300 }
2301 dfa->subexps = new_array;
2302 }
2303 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2304 dfa->subexps[cur_nsub].end = -1;
2305
2306 new_idx = re_dfa_add_node (dfa, *token, 0);
2307 left_par = create_tree (NULL, NULL, 0, new_idx);
2308 if (BE (new_idx == -1 || left_par == NULL, 0))
2309 {
2310 *err = REG_ESPACE;
2311 return NULL;
2312 }
2313 dfa->nodes[new_idx].opr.idx = cur_nsub;
2314 *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
2315
2316 /* The subexpression may be a null string. */
2317 if (token->type == OP_CLOSE_SUBEXP)
2318 tree = NULL;
2319 else
2320 {
2321 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2322 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2323 return NULL;
2324 }
2325 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2326 {
2327 free_bin_tree (tree);
2328 *err = REG_EPAREN;
2329 return NULL;
2330 }
2331 new_idx = re_dfa_add_node (dfa, *token, 0);
2332 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2333 right_par = create_tree (NULL, NULL, 0, new_idx);
2334 tree = ((tree == NULL) ? right_par
2335 : create_tree (tree, right_par, CONCAT, 0));
2336 tree = create_tree (left_par, tree, CONCAT, 0);
2337 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2338 {
2339 *err = REG_ESPACE;
2340 return NULL;
2341 }
2342 dfa->nodes[new_idx].opr.idx = cur_nsub;
2343
2344 return tree;
2345 }
2346
2347 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2348
2349 static bin_tree_t *
2350 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2351 bin_tree_t *dup_elem;
2352 re_string_t *regexp;
2353 re_dfa_t *dfa;
2354 re_token_t *token;
2355 reg_syntax_t syntax;
2356 reg_errcode_t *err;
2357 {
2358 re_token_t dup_token;
2359 bin_tree_t *tree = dup_elem, *work_tree;
2360 int new_idx, start_idx = re_string_cur_idx (regexp);
2361 re_token_t start_token = *token;
2362 if (token->type == OP_OPEN_DUP_NUM)
2363 {
2364 int i;
2365 int end = 0;
2366 int start = fetch_number (regexp, token, syntax);
2367 bin_tree_t *elem;
2368 if (start == -1)
2369 {
2370 if (token->type == CHARACTER && token->opr.c == ',')
2371 start = 0; /* We treat "{,m}" as "{0,m}". */
2372 else
2373 {
2374 *err = REG_BADBR; /* <re>{} is invalid. */
2375 return NULL;
2376 }
2377 }
2378 if (BE (start != -2, 1))
2379 {
2380 /* We treat "{n}" as "{n,n}". */
2381 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2382 : ((token->type == CHARACTER && token->opr.c == ',')
2383 ? fetch_number (regexp, token, syntax) : -2));
2384 }
2385 if (BE (start == -2 || end == -2, 0))
2386 {
2387 /* Invalid sequence. */
2388 if (token->type == OP_CLOSE_DUP_NUM)
2389 goto parse_dup_op_invalid_interval;
2390 else
2391 goto parse_dup_op_ebrace;
2392 }
2393 if (BE (start == 0 && end == 0, 0))
2394 {
2395 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2396 *token = fetch_token (regexp, syntax);
2397 free_bin_tree (dup_elem);
2398 return NULL;
2399 }
2400
2401 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2402 elem = tree;
2403 for (i = 0; i < start; ++i)
2404 if (i != 0)
2405 {
2406 work_tree = duplicate_tree (elem, dfa);
2407 tree = create_tree (tree, work_tree, CONCAT, 0);
2408 if (BE (work_tree == NULL || tree == NULL, 0))
2409 goto parse_dup_op_espace;
2410 }
2411
2412 if (end == -1)
2413 {
2414 /* We treat "<re>{0,}" as "<re>*". */
2415 dup_token.type = OP_DUP_ASTERISK;
2416 if (start > 0)
2417 {
2418 elem = duplicate_tree (elem, dfa);
2419 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2420 work_tree = create_tree (elem, NULL, 0, new_idx);
2421 tree = create_tree (tree, work_tree, CONCAT, 0);
2422 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2423 || tree == NULL, 0))
2424 goto parse_dup_op_espace;
2425 }
2426 else
2427 {
2428 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2429 tree = create_tree (elem, NULL, 0, new_idx);
2430 if (BE (new_idx == -1 || tree == NULL, 0))
2431 goto parse_dup_op_espace;
2432 }
2433 }
2434 else if (BE (start > end, 0))
2435 {
2436 /* First number greater than first. */
2437 *err = REG_BADBR;
2438 return NULL;
2439 }
2440 else if (end - start > 0)
2441 {
2442 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2443 dup_token.type = OP_DUP_QUESTION;
2444 if (start > 0)
2445 {
2446 elem = duplicate_tree (elem, dfa);
2447 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2448 elem = create_tree (elem, NULL, 0, new_idx);
2449 tree = create_tree (tree, elem, CONCAT, 0);
2450 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2451 goto parse_dup_op_espace;
2452 }
2453 else
2454 {
2455 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2456 tree = elem = create_tree (elem, NULL, 0, new_idx);
2457 if (BE (new_idx == -1 || tree == NULL, 0))
2458 goto parse_dup_op_espace;
2459 }
2460 for (i = 1; i < end - start; ++i)
2461 {
2462 work_tree = duplicate_tree (elem, dfa);
2463 tree = create_tree (tree, work_tree, CONCAT, 0);
2464 if (BE (work_tree == NULL || tree == NULL, 0))
2465 {
2466 *err = REG_ESPACE;
2467 return NULL;
2468 }
2469 }
2470 }
2471 }
2472 else
2473 {
2474 new_idx = re_dfa_add_node (dfa, *token, 0);
2475 tree = create_tree (tree, NULL, 0, new_idx);
2476 if (BE (new_idx == -1 || tree == NULL, 0))
2477 {
2478 *err = REG_ESPACE;
2479 return NULL;
2480 }
2481 }
2482 *token = fetch_token (regexp, syntax);
2483 return tree;
2484
2485 parse_dup_op_espace:
2486 free_bin_tree (tree);
2487 *err = REG_ESPACE;
2488 return NULL;
2489
2490 parse_dup_op_ebrace:
2491 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2492 {
2493 *err = REG_EBRACE;
2494 return NULL;
2495 }
2496 goto parse_dup_op_rollback;
2497 parse_dup_op_invalid_interval:
2498 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2499 {
2500 *err = REG_BADBR;
2501 return NULL;
2502 }
2503 parse_dup_op_rollback:
2504 re_string_set_index (regexp, start_idx);
2505 *token = start_token;
2506 token->type = CHARACTER;
2507 return dup_elem;
2508 }
2509
2510 /* Size of the names for collating symbol/equivalence_class/character_class.
2511 I'm not sure, but maybe enough. */
2512 #define BRACKET_NAME_BUF_SIZE 32
2513
2514 #ifndef _LIBC
2515 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2516 Build the range expression which starts from START_ELEM, and ends
2517 at END_ELEM. The result are written to MBCSET and SBCSET.
2518 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2519 mbcset->range_ends, is a pointer argument sinse we may
2520 update it. */
2521
2522 static reg_errcode_t
2523 # ifdef RE_ENABLE_I18N
2524 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2525 re_charset_t *mbcset;
2526 int *range_alloc;
2527 # else /* not RE_ENABLE_I18N */
2528 build_range_exp (sbcset, start_elem, end_elem)
2529 # endif /* not RE_ENABLE_I18N */
2530 re_bitset_ptr_t sbcset;
2531 bracket_elem_t *start_elem, *end_elem;
2532 {
2533 unsigned int start_ch, end_ch;
2534 /* Equivalence Classes and Character Classes can't be a range start/end. */
2535 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2536 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2537 0))
2538 return REG_ERANGE;
2539
2540 /* We can handle no multi character collating elements without libc
2541 support. */
2542 if (BE ((start_elem->type == COLL_SYM
2543 && strlen ((char *) start_elem->opr.name) > 1)
2544 || (end_elem->type == COLL_SYM
2545 && strlen ((char *) end_elem->opr.name) > 1), 0))
2546 return REG_ECOLLATE;
2547
2548 # ifdef RE_ENABLE_I18N
2549 {
2550 wchar_t wc, start_wc, end_wc;
2551 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2552
2553 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2554 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2555 : 0));
2556 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2557 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2558 : 0));
2559 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2560 ? __btowc (start_ch) : start_elem->opr.wch);
2561 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2562 ? __btowc (end_ch) : end_elem->opr.wch);
2563 cmp_buf[0] = start_wc;
2564 cmp_buf[4] = end_wc;
2565 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2566 return REG_ERANGE;
2567
2568 /* Check the space of the arrays. */
2569 if (*range_alloc == mbcset->nranges)
2570 {
2571 /* There are not enough space, need realloc. */
2572 wchar_t *new_array_start, *new_array_end;
2573 int new_nranges;
2574
2575 /* +1 in case of mbcset->nranges is 0. */
2576 new_nranges = 2 * mbcset->nranges + 1;
2577 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2578 are NULL if *range_alloc == 0. */
2579 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2580 new_nranges);
2581 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2582 new_nranges);
2583
2584 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2585 return REG_ESPACE;
2586
2587 mbcset->range_starts = new_array_start;
2588 mbcset->range_ends = new_array_end;
2589 *range_alloc = new_nranges;
2590 }
2591
2592 mbcset->range_starts[mbcset->nranges] = start_wc;
2593 mbcset->range_ends[mbcset->nranges++] = end_wc;
2594
2595 /* Build the table for single byte characters. */
2596 for (wc = 0; wc <= SBC_MAX; ++wc)
2597 {
2598 cmp_buf[2] = wc;
2599 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2600 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2601 bitset_set (sbcset, wc);
2602 }
2603 }
2604 # else /* not RE_ENABLE_I18N */
2605 {
2606 unsigned int ch;
2607 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2608 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2609 : 0));
2610 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2611 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2612 : 0));
2613 if (start_ch > end_ch)
2614 return REG_ERANGE;
2615 /* Build the table for single byte characters. */
2616 for (ch = 0; ch <= SBC_MAX; ++ch)
2617 if (start_ch <= ch && ch <= end_ch)
2618 bitset_set (sbcset, ch);
2619 }
2620 # endif /* not RE_ENABLE_I18N */
2621 return REG_NOERROR;
2622 }
2623 #endif /* not _LIBC */
2624
2625 #ifndef _LIBC
2626 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2627 Build the collating element which is represented by NAME.
2628 The result are written to MBCSET and SBCSET.
2629 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2630 pointer argument since we may update it. */
2631
2632 static reg_errcode_t
2633 # ifdef RE_ENABLE_I18N
2634 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2635 re_charset_t *mbcset;
2636 int *coll_sym_alloc;
2637 # else /* not RE_ENABLE_I18N */
2638 build_collating_symbol (sbcset, name)
2639 # endif /* not RE_ENABLE_I18N */
2640 re_bitset_ptr_t sbcset;
2641 const unsigned char *name;
2642 {
2643 size_t name_len = strlen ((const char *) name);
2644 if (BE (name_len != 1, 0))
2645 return REG_ECOLLATE;
2646 else
2647 {
2648 bitset_set (sbcset, name[0]);
2649 return REG_NOERROR;
2650 }
2651 }
2652 #endif /* not _LIBC */
2653
2654 /* This function parse bracket expression like "[abc]", "[a-c]",
2655 "[[.a-a.]]" etc. */
2656
2657 static bin_tree_t *
2658 parse_bracket_exp (regexp, dfa, token, syntax, err)
2659 re_string_t *regexp;
2660 re_dfa_t *dfa;
2661 re_token_t *token;
2662 reg_syntax_t syntax;
2663 reg_errcode_t *err;
2664 {
2665 #ifdef _LIBC
2666 const unsigned char *collseqmb;
2667 const char *collseqwc;
2668 uint32_t nrules;
2669 int32_t table_size;
2670 const int32_t *symb_table;
2671 const unsigned char *extra;
2672
2673 /* Local function for parse_bracket_exp used in _LIBC environement.
2674 Seek the collating symbol entry correspondings to NAME.
2675 Return the index of the symbol in the SYMB_TABLE. */
2676
2677 static inline int32_t
2678 __attribute ((always_inline))
2679 seek_collating_symbol_entry (name, name_len)
2680 const unsigned char *name;
2681 size_t name_len;
2682 {
2683 int32_t hash = elem_hash ((const char *) name, name_len);
2684 int32_t elem = hash % table_size;
2685 int32_t second = hash % (table_size - 2);
2686 while (symb_table[2 * elem] != 0)
2687 {
2688 /* First compare the hashing value. */
2689 if (symb_table[2 * elem] == hash
2690 /* Compare the length of the name. */
2691 && name_len == extra[symb_table[2 * elem + 1]]
2692 /* Compare the name. */
2693 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2694 name_len) == 0)
2695 {
2696 /* Yep, this is the entry. */
2697 break;
2698 }
2699
2700 /* Next entry. */
2701 elem += second;
2702 }
2703 return elem;
2704 }
2705
2706 /* Local function for parse_bracket_exp used in _LIBC environement.
2707 Look up the collation sequence value of BR_ELEM.
2708 Return the value if succeeded, UINT_MAX otherwise. */
2709
2710 static inline unsigned int
2711 __attribute ((always_inline))
2712 lookup_collation_sequence_value (br_elem)
2713 bracket_elem_t *br_elem;
2714 {
2715 if (br_elem->type == SB_CHAR)
2716 {
2717 /*
2718 if (MB_CUR_MAX == 1)
2719 */
2720 if (nrules == 0)
2721 return collseqmb[br_elem->opr.ch];
2722 else
2723 {
2724 wint_t wc = __btowc (br_elem->opr.ch);
2725 return __collseq_table_lookup (collseqwc, wc);
2726 }
2727 }
2728 else if (br_elem->type == MB_CHAR)
2729 {
2730 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2731 }
2732 else if (br_elem->type == COLL_SYM)
2733 {
2734 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2735 if (nrules != 0)
2736 {
2737 int32_t elem, idx;
2738 elem = seek_collating_symbol_entry (br_elem->opr.name,
2739 sym_name_len);
2740 if (symb_table[2 * elem] != 0)
2741 {
2742 /* We found the entry. */
2743 idx = symb_table[2 * elem + 1];
2744 /* Skip the name of collating element name. */
2745 idx += 1 + extra[idx];
2746 /* Skip the byte sequence of the collating element. */
2747 idx += 1 + extra[idx];
2748 /* Adjust for the alignment. */
2749 idx = (idx + 3) & ~3;
2750 /* Skip the multibyte collation sequence value. */
2751 idx += sizeof (unsigned int);
2752 /* Skip the wide char sequence of the collating element. */
2753 idx += sizeof (unsigned int) *
2754 (1 + *(unsigned int *) (extra + idx));
2755 /* Return the collation sequence value. */
2756 return *(unsigned int *) (extra + idx);
2757 }
2758 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2759 {
2760 /* No valid character. Match it as a single byte
2761 character. */
2762 return collseqmb[br_elem->opr.name[0]];
2763 }
2764 }
2765 else if (sym_name_len == 1)
2766 return collseqmb[br_elem->opr.name[0]];
2767 }
2768 return UINT_MAX;
2769 }
2770
2771 /* Local function for parse_bracket_exp used in _LIBC environement.
2772 Build the range expression which starts from START_ELEM, and ends
2773 at END_ELEM. The result are written to MBCSET and SBCSET.
2774 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2775 mbcset->range_ends, is a pointer argument sinse we may
2776 update it. */
2777
2778 static inline reg_errcode_t
2779 __attribute ((always_inline))
2780 # ifdef RE_ENABLE_I18N
2781 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2782 re_charset_t *mbcset;
2783 int *range_alloc;
2784 # else /* not RE_ENABLE_I18N */
2785 build_range_exp (sbcset, start_elem, end_elem)
2786 # endif /* not RE_ENABLE_I18N */
2787 re_bitset_ptr_t sbcset;
2788 bracket_elem_t *start_elem, *end_elem;
2789 {
2790 unsigned int ch;
2791 uint32_t start_collseq;
2792 uint32_t end_collseq;
2793
2794 # ifdef RE_ENABLE_I18N
2795 /* Check the space of the arrays. */
2796 if (*range_alloc == mbcset->nranges)
2797 {
2798 /* There are not enough space, need realloc. */
2799 uint32_t *new_array_start;
2800 uint32_t *new_array_end;
2801 int new_nranges;
2802
2803 /* +1 in case of mbcset->nranges is 0. */
2804 new_nranges = 2 * mbcset->nranges + 1;
2805 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2806 are NULL if *range_alloc == 0. */
2807 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2808 new_nranges);
2809 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2810 new_nranges);
2811
2812 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2813 return REG_ESPACE;
2814
2815 mbcset->range_starts = new_array_start;
2816 mbcset->range_ends = new_array_end;
2817 *range_alloc = new_nranges;
2818 }
2819 # endif /* RE_ENABLE_I18N */
2820
2821 /* Equivalence Classes and Character Classes can't be a range
2822 start/end. */
2823 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2824 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2825 0))
2826 return REG_ERANGE;
2827
2828 start_collseq = lookup_collation_sequence_value (start_elem);
2829 end_collseq = lookup_collation_sequence_value (end_elem);
2830 /* Check start/end collation sequence values. */
2831 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2832 return REG_ECOLLATE;
2833 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2834 return REG_ERANGE;
2835
2836 # ifdef RE_ENABLE_I18N
2837 /* Got valid collation sequence values, add them as a new entry. */
2838 mbcset->range_starts[mbcset->nranges] = start_collseq;
2839 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2840 # endif /* RE_ENABLE_I18N */
2841
2842 /* Build the table for single byte characters. */
2843 for (ch = 0; ch <= SBC_MAX; ch++)
2844 {
2845 uint32_t ch_collseq;
2846 /*
2847 if (MB_CUR_MAX == 1)
2848 */
2849 if (nrules == 0)
2850 ch_collseq = collseqmb[ch];
2851 else
2852 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2853 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2854 bitset_set (sbcset, ch);
2855 }
2856 return REG_NOERROR;
2857 }
2858
2859 /* Local function for parse_bracket_exp used in _LIBC environement.
2860 Build the collating element which is represented by NAME.
2861 The result are written to MBCSET and SBCSET.
2862 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2863 pointer argument sinse we may update it. */
2864
2865 static inline reg_errcode_t
2866 __attribute ((always_inline))
2867 # ifdef RE_ENABLE_I18N
2868 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2869 re_charset_t *mbcset;
2870 int *coll_sym_alloc;
2871 # else /* not RE_ENABLE_I18N */
2872 build_collating_symbol (sbcset, name)
2873 # endif /* not RE_ENABLE_I18N */
2874 re_bitset_ptr_t sbcset;
2875 const unsigned char *name;
2876 {
2877 int32_t elem, idx;
2878 size_t name_len = strlen ((const char *) name);
2879 if (nrules != 0)
2880 {
2881 elem = seek_collating_symbol_entry (name, name_len);
2882 if (symb_table[2 * elem] != 0)
2883 {
2884 /* We found the entry. */
2885 idx = symb_table[2 * elem + 1];
2886 /* Skip the name of collating element name. */
2887 idx += 1 + extra[idx];
2888 }
2889 else if (symb_table[2 * elem] == 0 && name_len == 1)
2890 {
2891 /* No valid character, treat it as a normal
2892 character. */
2893 bitset_set (sbcset, name[0]);
2894 return REG_NOERROR;
2895 }
2896 else
2897 return REG_ECOLLATE;
2898
2899 # ifdef RE_ENABLE_I18N
2900 /* Got valid collation sequence, add it as a new entry. */
2901 /* Check the space of the arrays. */
2902 if (*coll_sym_alloc == mbcset->ncoll_syms)
2903 {
2904 /* Not enough, realloc it. */
2905 /* +1 in case of mbcset->ncoll_syms is 0. */
2906 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2907 /* Use realloc since mbcset->coll_syms is NULL
2908 if *alloc == 0. */
2909 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2910 *coll_sym_alloc);
2911 if (BE (mbcset->coll_syms == NULL, 0))
2912 return REG_ESPACE;
2913 }
2914 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2915 # endif /* RE_ENABLE_I18N */
2916 return REG_NOERROR;
2917 }
2918 else
2919 {
2920 if (BE (name_len != 1, 0))
2921 return REG_ECOLLATE;
2922 else
2923 {
2924 bitset_set (sbcset, name[0]);
2925 return REG_NOERROR;
2926 }
2927 }
2928 }
2929 #endif
2930
2931 re_token_t br_token;
2932 re_bitset_ptr_t sbcset;
2933 #ifdef RE_ENABLE_I18N
2934 re_charset_t *mbcset;
2935 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2936 int equiv_class_alloc = 0, char_class_alloc = 0;
2937 #else /* not RE_ENABLE_I18N */
2938 int non_match = 0;
2939 #endif /* not RE_ENABLE_I18N */
2940 bin_tree_t *work_tree;
2941 int token_len, new_idx;
2942 #ifdef _LIBC
2943 collseqmb = (const unsigned char *)
2944 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2945 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2946 if (nrules)
2947 {
2948 /*
2949 if (MB_CUR_MAX > 1)
2950 */
2951 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2952 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2953 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2954 _NL_COLLATE_SYMB_TABLEMB);
2955 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2956 _NL_COLLATE_SYMB_EXTRAMB);
2957 }
2958 #endif
2959 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2960 #ifdef RE_ENABLE_I18N
2961 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2962 #endif /* RE_ENABLE_I18N */
2963 #ifdef RE_ENABLE_I18N
2964 if (BE (sbcset == NULL || mbcset == NULL, 0))
2965 #else
2966 if (BE (sbcset == NULL, 0))
2967 #endif /* RE_ENABLE_I18N */
2968 {
2969 *err = REG_ESPACE;
2970 return NULL;
2971 }
2972
2973 token_len = peek_token_bracket (token, regexp, syntax);
2974 if (BE (token->type == END_OF_RE, 0))
2975 {
2976 *err = REG_BADPAT;
2977 goto parse_bracket_exp_free_return;
2978 }
2979 if (token->type == OP_NON_MATCH_LIST)
2980 {
2981 #ifdef RE_ENABLE_I18N
2982 int i;
2983 mbcset->non_match = 1;
2984 #else /* not RE_ENABLE_I18N */
2985 non_match = 1;
2986 #endif /* not RE_ENABLE_I18N */
2987 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2988 bitset_set (sbcset, '\0');
2989 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2990 token_len = peek_token_bracket (token, regexp, syntax);
2991 if (BE (token->type == END_OF_RE, 0))
2992 {
2993 *err = REG_BADPAT;
2994 goto parse_bracket_exp_free_return;
2995 }
2996 #ifdef RE_ENABLE_I18N
2997 if (dfa->mb_cur_max > 1)
2998 for (i = 0; i < SBC_MAX; ++i)
2999 if (__btowc (i) == WEOF)
3000 bitset_set (sbcset, i);
3001 #endif /* RE_ENABLE_I18N */
3002 }
3003
3004 /* We treat the first ']' as a normal character. */
3005 if (token->type == OP_CLOSE_BRACKET)
3006 token->type = CHARACTER;
3007
3008 int first_round = 1;
3009 while (1)
3010 {
3011 bracket_elem_t start_elem, end_elem;
3012 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3013 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3014 reg_errcode_t ret;
3015 int token_len2 = 0, is_range_exp = 0;
3016 re_token_t token2;
3017
3018 start_elem.opr.name = start_name_buf;
3019 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3020 syntax, first_round);
3021 if (BE (ret != REG_NOERROR, 0))
3022 {
3023 *err = ret;
3024 goto parse_bracket_exp_free_return;
3025 }
3026 first_round = 0;
3027
3028 /* Get information about the next token. We need it in any case. */
3029 token_len = peek_token_bracket (token, regexp, syntax);
3030
3031 /* Do not check for ranges if we know they are not allowed. */
3032 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3033 {
3034 if (BE (token->type == END_OF_RE, 0))
3035 {
3036 *err = REG_EBRACK;
3037 goto parse_bracket_exp_free_return;
3038 }
3039 if (token->type == OP_CHARSET_RANGE)
3040 {
3041 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3042 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3043 if (BE (token2.type == END_OF_RE, 0))
3044 {
3045 *err = REG_EBRACK;
3046 goto parse_bracket_exp_free_return;
3047 }
3048 if (token2.type == OP_CLOSE_BRACKET)
3049 {
3050 /* We treat the last '-' as a normal character. */
3051 re_string_skip_bytes (regexp, -token_len);
3052 token->type = CHARACTER;
3053 }
3054 else
3055 is_range_exp = 1;
3056 }
3057 }
3058
3059 if (is_range_exp == 1)
3060 {
3061 end_elem.opr.name = end_name_buf;
3062 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3063 dfa, syntax, 1);
3064 if (BE (ret != REG_NOERROR, 0))
3065 {
3066 *err = ret;
3067 goto parse_bracket_exp_free_return;
3068 }
3069
3070 token_len = peek_token_bracket (token, regexp, syntax);
3071
3072 *err = build_range_exp (sbcset,
3073 #ifdef RE_ENABLE_I18N
3074 mbcset, &range_alloc,
3075 #endif /* RE_ENABLE_I18N */
3076 &start_elem, &end_elem);
3077 if (BE (*err != REG_NOERROR, 0))
3078 goto parse_bracket_exp_free_return;
3079 }
3080 else
3081 {
3082 switch (start_elem.type)
3083 {
3084 case SB_CHAR:
3085 bitset_set (sbcset, start_elem.opr.ch);
3086 break;
3087 #ifdef RE_ENABLE_I18N
3088 case MB_CHAR:
3089 /* Check whether the array has enough space. */
3090 if (mbchar_alloc == mbcset->nmbchars)
3091 {
3092 /* Not enough, realloc it. */
3093 /* +1 in case of mbcset->nmbchars is 0. */
3094 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3095 /* Use realloc since array is NULL if *alloc == 0. */
3096 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
3097 mbchar_alloc);
3098 if (BE (mbcset->mbchars == NULL, 0))
3099 goto parse_bracket_exp_espace;
3100 }
3101 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3102 break;
3103 #endif /* RE_ENABLE_I18N */
3104 case EQUIV_CLASS:
3105 *err = build_equiv_class (sbcset,
3106 #ifdef RE_ENABLE_I18N
3107 mbcset, &equiv_class_alloc,
3108 #endif /* RE_ENABLE_I18N */
3109 start_elem.opr.name);
3110 if (BE (*err != REG_NOERROR, 0))
3111 goto parse_bracket_exp_free_return;
3112 break;
3113 case COLL_SYM:
3114 *err = build_collating_symbol (sbcset,
3115 #ifdef RE_ENABLE_I18N
3116 mbcset, &coll_sym_alloc,
3117 #endif /* RE_ENABLE_I18N */
3118 start_elem.opr.name);
3119 if (BE (*err != REG_NOERROR, 0))
3120 goto parse_bracket_exp_free_return;
3121 break;
3122 case CHAR_CLASS:
3123 *err = build_charclass (regexp->trans, sbcset,
3124 #ifdef RE_ENABLE_I18N
3125 mbcset, &char_class_alloc,
3126 #endif /* RE_ENABLE_I18N */
3127 start_elem.opr.name, syntax);
3128 if (BE (*err != REG_NOERROR, 0))
3129 goto parse_bracket_exp_free_return;
3130 break;
3131 default:
3132 assert (0);
3133 break;
3134 }
3135 }
3136 if (BE (token->type == END_OF_RE, 0))
3137 {
3138 *err = REG_EBRACK;
3139 goto parse_bracket_exp_free_return;
3140 }
3141 if (token->type == OP_CLOSE_BRACKET)
3142 break;
3143 }
3144
3145 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3146
3147 /* If it is non-matching list. */
3148 #ifdef RE_ENABLE_I18N
3149 if (mbcset->non_match)
3150 #else /* not RE_ENABLE_I18N */
3151 if (non_match)
3152 #endif /* not RE_ENABLE_I18N */
3153 bitset_not (sbcset);
3154
3155 /* Build a tree for simple bracket. */
3156 br_token.type = SIMPLE_BRACKET;
3157 br_token.opr.sbcset = sbcset;
3158 new_idx = re_dfa_add_node (dfa, br_token, 0);
3159 work_tree = create_tree (NULL, NULL, 0, new_idx);
3160 if (BE (new_idx == -1 || work_tree == NULL, 0))
3161 goto parse_bracket_exp_espace;
3162
3163 #ifdef RE_ENABLE_I18N
3164 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3165 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3166 || mbcset->non_match)))
3167 {
3168 re_token_t alt_token;
3169 bin_tree_t *mbc_tree;
3170 int sbc_idx;
3171 /* Build a tree for complex bracket. */
3172 dfa->has_mb_node = 1;
3173 dfa->has_plural_match = 1;
3174 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3175 if (sbcset[sbc_idx])
3176 break;
3177 /* If there are no bits set in sbcset, there is no point
3178 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3179 if (sbc_idx == BITSET_UINTS)
3180 {
3181 re_free (sbcset);
3182 dfa->nodes[new_idx].type = COMPLEX_BRACKET;
3183 dfa->nodes[new_idx].opr.mbcset = mbcset;
3184 return work_tree;
3185 }
3186 br_token.type = COMPLEX_BRACKET;
3187 br_token.opr.mbcset = mbcset;
3188 new_idx = re_dfa_add_node (dfa, br_token, 0);
3189 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3190 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3191 goto parse_bracket_exp_espace;
3192 /* Then join them by ALT node. */
3193 alt_token.type = OP_ALT;
3194 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3195 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3196 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3197 return work_tree;
3198 }
3199 else
3200 {
3201 free_charset (mbcset);
3202 return work_tree;
3203 }
3204 #else /* not RE_ENABLE_I18N */
3205 return work_tree;
3206 #endif /* not RE_ENABLE_I18N */
3207
3208 parse_bracket_exp_espace:
3209 *err = REG_ESPACE;
3210 parse_bracket_exp_free_return:
3211 re_free (sbcset);
3212 #ifdef RE_ENABLE_I18N
3213 free_charset (mbcset);
3214 #endif /* RE_ENABLE_I18N */
3215 return NULL;
3216 }
3217
3218 /* Parse an element in the bracket expression. */
3219
3220 static reg_errcode_t
3221 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3222 accept_hyphen)
3223 bracket_elem_t *elem;
3224 re_string_t *regexp;
3225 re_token_t *token;
3226 int token_len;
3227 re_dfa_t *dfa;
3228 reg_syntax_t syntax;
3229 int accept_hyphen;
3230 {
3231 #ifdef RE_ENABLE_I18N
3232 int cur_char_size;
3233 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3234 if (cur_char_size > 1)
3235 {
3236 elem->type = MB_CHAR;
3237 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3238 re_string_skip_bytes (regexp, cur_char_size);
3239 return REG_NOERROR;
3240 }
3241 #endif /* RE_ENABLE_I18N */
3242 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3243 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3244 || token->type == OP_OPEN_EQUIV_CLASS)
3245 return parse_bracket_symbol (elem, regexp, token);
3246 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3247 {
3248 /* A '-' must only appear as anything but a range indicator before
3249 the closing bracket. Everything else is an error. */
3250 re_token_t token2;
3251 (void) peek_token_bracket (&token2, regexp, syntax);
3252 if (token2.type != OP_CLOSE_BRACKET)
3253 /* The actual error value is not standardized since this whole
3254 case is undefined. But ERANGE makes good sense. */
3255 return REG_ERANGE;
3256 }
3257 elem->type = SB_CHAR;
3258 elem->opr.ch = token->opr.c;
3259 return REG_NOERROR;
3260 }
3261
3262 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3263 such as [:<character_class>:], [.<collating_element>.], and
3264 [=<equivalent_class>=]. */
3265
3266 static reg_errcode_t
3267 parse_bracket_symbol (elem, regexp, token)
3268 bracket_elem_t *elem;
3269 re_string_t *regexp;
3270 re_token_t *token;
3271 {
3272 unsigned char ch, delim = token->opr.c;
3273 int i = 0;
3274 for (;; ++i)
3275 {
3276 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3277 return REG_EBRACK;
3278 if (token->type == OP_OPEN_CHAR_CLASS)
3279 ch = re_string_fetch_byte_case (regexp);
3280 else
3281 ch = re_string_fetch_byte (regexp);
3282 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3283 break;
3284 elem->opr.name[i] = ch;
3285 }
3286 re_string_skip_bytes (regexp, 1);
3287 elem->opr.name[i] = '\0';
3288 switch (token->type)
3289 {
3290 case OP_OPEN_COLL_ELEM:
3291 elem->type = COLL_SYM;
3292 break;
3293 case OP_OPEN_EQUIV_CLASS:
3294 elem->type = EQUIV_CLASS;
3295 break;
3296 case OP_OPEN_CHAR_CLASS:
3297 elem->type = CHAR_CLASS;
3298 break;
3299 default:
3300 break;
3301 }
3302 return REG_NOERROR;
3303 }
3304
3305 /* Helper function for parse_bracket_exp.
3306 Build the equivalence class which is represented by NAME.
3307 The result are written to MBCSET and SBCSET.
3308 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3309 is a pointer argument sinse we may update it. */
3310
3311 static reg_errcode_t
3312 #ifdef RE_ENABLE_I18N
3313 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3314 re_charset_t *mbcset;
3315 int *equiv_class_alloc;
3316 #else /* not RE_ENABLE_I18N */
3317 build_equiv_class (sbcset, name)
3318 #endif /* not RE_ENABLE_I18N */
3319 re_bitset_ptr_t sbcset;
3320 const unsigned char *name;
3321 {
3322 #if defined _LIBC && defined RE_ENABLE_I18N
3323 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3324 if (nrules != 0)
3325 {
3326 const int32_t *table, *indirect;
3327 const unsigned char *weights, *extra, *cp;
3328 unsigned char char_buf[2];
3329 int32_t idx1, idx2;
3330 unsigned int ch;
3331 size_t len;
3332 /* This #include defines a local function! */
3333 # include <locale/weight.h>
3334 /* Calculate the index for equivalence class. */
3335 cp = name;
3336 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3337 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3338 _NL_COLLATE_WEIGHTMB);
3339 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3340 _NL_COLLATE_EXTRAMB);
3341 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3342 _NL_COLLATE_INDIRECTMB);
3343 idx1 = findidx (&cp);
3344 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3345 /* This isn't a valid character. */
3346 return REG_ECOLLATE;
3347
3348 /* Build single byte matcing table for this equivalence class. */
3349 char_buf[1] = (unsigned char) '\0';
3350 len = weights[idx1];
3351 for (ch = 0; ch < SBC_MAX; ++ch)
3352 {
3353 char_buf[0] = ch;
3354 cp = char_buf;
3355 idx2 = findidx (&cp);
3356 /*
3357 idx2 = table[ch];
3358 */
3359 if (idx2 == 0)
3360 /* This isn't a valid character. */
3361 continue;
3362 if (len == weights[idx2])
3363 {
3364 int cnt = 0;
3365 while (cnt <= len &&
3366 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3367 ++cnt;
3368
3369 if (cnt > len)
3370 bitset_set (sbcset, ch);
3371 }
3372 }
3373 /* Check whether the array has enough space. */
3374 if (*equiv_class_alloc == mbcset->nequiv_classes)
3375 {
3376 /* Not enough, realloc it. */
3377 /* +1 in case of mbcset->nequiv_classes is 0. */
3378 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3379 /* Use realloc since the array is NULL if *alloc == 0. */
3380 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3381 *equiv_class_alloc);
3382 if (BE (mbcset->equiv_classes == NULL, 0))
3383 return REG_ESPACE;
3384 }
3385 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3386 }
3387 else
3388 #endif /* _LIBC && RE_ENABLE_I18N */
3389 {
3390 if (BE (strlen ((const char *) name) != 1, 0))
3391 return REG_ECOLLATE;
3392 bitset_set (sbcset, *name);
3393 }
3394 return REG_NOERROR;
3395 }
3396
3397 /* Helper function for parse_bracket_exp.
3398 Build the character class which is represented by NAME.
3399 The result are written to MBCSET and SBCSET.
3400 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3401 is a pointer argument sinse we may update it. */
3402
3403 static reg_errcode_t
3404 #ifdef RE_ENABLE_I18N
3405 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3406 re_charset_t *mbcset;
3407 int *char_class_alloc;
3408 #else /* not RE_ENABLE_I18N */
3409 build_charclass (trans, sbcset, class_name, syntax)
3410 #endif /* not RE_ENABLE_I18N */
3411 RE_TRANSLATE_TYPE trans;
3412 re_bitset_ptr_t sbcset;
3413 const unsigned char *class_name;
3414 reg_syntax_t syntax;
3415 {
3416 int i;
3417 const char *name = (const char *) class_name;
3418
3419 /* In case of REG_ICASE "upper" and "lower" match the both of
3420 upper and lower cases. */
3421 if ((syntax & RE_ICASE)
3422 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3423 name = "alpha";
3424
3425 #ifdef RE_ENABLE_I18N
3426 /* Check the space of the arrays. */
3427 if (*char_class_alloc == mbcset->nchar_classes)
3428 {
3429 /* Not enough, realloc it. */
3430 /* +1 in case of mbcset->nchar_classes is 0. */
3431 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3432 /* Use realloc since array is NULL if *alloc == 0. */
3433 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3434 *char_class_alloc);
3435 if (BE (mbcset->char_classes == NULL, 0))
3436 return REG_ESPACE;
3437 }
3438 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3439 #endif /* RE_ENABLE_I18N */
3440
3441 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3442 for (i = 0; i < SBC_MAX; ++i) \
3443 { \
3444 if (ctype_func (i)) \
3445 { \
3446 int ch = trans ? trans[i] : i; \
3447 bitset_set (sbcset, ch); \
3448 } \
3449 }
3450
3451 if (strcmp (name, "alnum") == 0)
3452 BUILD_CHARCLASS_LOOP (isalnum)
3453 else if (strcmp (name, "cntrl") == 0)
3454 BUILD_CHARCLASS_LOOP (iscntrl)
3455 else if (strcmp (name, "lower") == 0)
3456 BUILD_CHARCLASS_LOOP (islower)
3457 else if (strcmp (name, "space") == 0)
3458 BUILD_CHARCLASS_LOOP (isspace)
3459 else if (strcmp (name, "alpha") == 0)
3460 BUILD_CHARCLASS_LOOP (isalpha)
3461 else if (strcmp (name, "digit") == 0)
3462 BUILD_CHARCLASS_LOOP (isdigit)
3463 else if (strcmp (name, "print") == 0)
3464 BUILD_CHARCLASS_LOOP (isprint)
3465 else if (strcmp (name, "upper") == 0)
3466 BUILD_CHARCLASS_LOOP (isupper)
3467 else if (strcmp (name, "blank") == 0)
3468 BUILD_CHARCLASS_LOOP (isblank)
3469 else if (strcmp (name, "graph") == 0)
3470 BUILD_CHARCLASS_LOOP (isgraph)
3471 else if (strcmp (name, "punct") == 0)
3472 BUILD_CHARCLASS_LOOP (ispunct)
3473 else if (strcmp (name, "xdigit") == 0)
3474 BUILD_CHARCLASS_LOOP (isxdigit)
3475 else
3476 return REG_ECTYPE;
3477
3478 return REG_NOERROR;
3479 }
3480
3481 static bin_tree_t *
3482 build_charclass_op (dfa, trans, class_name, extra, not, err)
3483 re_dfa_t *dfa;
3484 RE_TRANSLATE_TYPE trans;
3485 const unsigned char *class_name;
3486 const unsigned char *extra;
3487 int not;
3488 reg_errcode_t *err;
3489 {
3490 re_bitset_ptr_t sbcset;
3491 #ifdef RE_ENABLE_I18N
3492 re_charset_t *mbcset;
3493 int alloc = 0;
3494 #else /* not RE_ENABLE_I18N */
3495 int non_match = 0;
3496 #endif /* not RE_ENABLE_I18N */
3497 reg_errcode_t ret;
3498 re_token_t br_token;
3499 bin_tree_t *tree;
3500 int new_idx;
3501
3502 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3503 #ifdef RE_ENABLE_I18N
3504 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3505 #endif /* RE_ENABLE_I18N */
3506
3507 #ifdef RE_ENABLE_I18N
3508 if (BE (sbcset == NULL || mbcset == NULL, 0))
3509 #else /* not RE_ENABLE_I18N */
3510 if (BE (sbcset == NULL, 0))
3511 #endif /* not RE_ENABLE_I18N */
3512 {
3513 *err = REG_ESPACE;
3514 return NULL;
3515 }
3516
3517 if (not)
3518 {
3519 #ifdef RE_ENABLE_I18N
3520 int i;
3521 /*
3522 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3523 bitset_set(cset->sbcset, '\0');
3524 */
3525 mbcset->non_match = 1;
3526 if (dfa->mb_cur_max > 1)
3527 for (i = 0; i < SBC_MAX; ++i)
3528 if (__btowc (i) == WEOF)
3529 bitset_set (sbcset, i);
3530 #else /* not RE_ENABLE_I18N */
3531 non_match = 1;
3532 #endif /* not RE_ENABLE_I18N */
3533 }
3534
3535 /* We don't care the syntax in this case. */
3536 ret = build_charclass (trans, sbcset,
3537 #ifdef RE_ENABLE_I18N
3538 mbcset, &alloc,
3539 #endif /* RE_ENABLE_I18N */
3540 class_name, 0);
3541
3542 if (BE (ret != REG_NOERROR, 0))
3543 {
3544 re_free (sbcset);
3545 #ifdef RE_ENABLE_I18N
3546 free_charset (mbcset);
3547 #endif /* RE_ENABLE_I18N */
3548 *err = ret;
3549 return NULL;
3550 }
3551 /* \w match '_' also. */
3552 for (; *extra; extra++)
3553 bitset_set (sbcset, *extra);
3554
3555 /* If it is non-matching list. */
3556 #ifdef RE_ENABLE_I18N
3557 if (mbcset->non_match)
3558 #else /* not RE_ENABLE_I18N */
3559 if (non_match)
3560 #endif /* not RE_ENABLE_I18N */
3561 bitset_not (sbcset);
3562
3563 /* Build a tree for simple bracket. */
3564 br_token.type = SIMPLE_BRACKET;
3565 br_token.opr.sbcset = sbcset;
3566 new_idx = re_dfa_add_node (dfa, br_token, 0);
3567 tree = create_tree (NULL, NULL, 0, new_idx);
3568 if (BE (new_idx == -1 || tree == NULL, 0))
3569 goto build_word_op_espace;
3570
3571 #ifdef RE_ENABLE_I18N
3572 if (dfa->mb_cur_max > 1)
3573 {
3574 re_token_t alt_token;
3575 bin_tree_t *mbc_tree;
3576 /* Build a tree for complex bracket. */
3577 br_token.type = COMPLEX_BRACKET;
3578 br_token.opr.mbcset = mbcset;
3579 dfa->has_mb_node = 1;
3580 new_idx = re_dfa_add_node (dfa, br_token, 0);
3581 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3582 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3583 goto build_word_op_espace;
3584 /* Then join them by ALT node. */
3585 alt_token.type = OP_ALT;
3586 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3587 tree = create_tree (tree, mbc_tree, 0, new_idx);
3588 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3589 return tree;
3590 }
3591 else
3592 {
3593 free_charset (mbcset);
3594 return tree;
3595 }
3596 #else /* not RE_ENABLE_I18N */
3597 return tree;
3598 #endif /* not RE_ENABLE_I18N */
3599
3600 build_word_op_espace:
3601 re_free (sbcset);
3602 #ifdef RE_ENABLE_I18N
3603 free_charset (mbcset);
3604 #endif /* RE_ENABLE_I18N */
3605 *err = REG_ESPACE;
3606 return NULL;
3607 }
3608
3609 /* This is intended for the expressions like "a{1,3}".
3610 Fetch a number from `input', and return the number.
3611 Return -1, if the number field is empty like "{,1}".
3612 Return -2, If an error is occured. */
3613
3614 static int
3615 fetch_number (input, token, syntax)
3616 re_string_t *input;
3617 re_token_t *token;
3618 reg_syntax_t syntax;
3619 {
3620 int num = -1;
3621 unsigned char c;
3622 while (1)
3623 {
3624 *token = fetch_token (input, syntax);
3625 c = token->opr.c;
3626 if (BE (token->type == END_OF_RE, 0))
3627 return -2;
3628 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3629 break;
3630 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3631 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3632 num = (num > RE_DUP_MAX) ? -2 : num;
3633 }
3634 return num;
3635 }
3636 \f
3637 #ifdef RE_ENABLE_I18N
3638 static void
3639 free_charset (re_charset_t *cset)
3640 {
3641 re_free (cset->mbchars);
3642 # ifdef _LIBC
3643 re_free (cset->coll_syms);
3644 re_free (cset->equiv_classes);
3645 re_free (cset->range_starts);
3646 re_free (cset->range_ends);
3647 # endif
3648 re_free (cset->char_classes);
3649 re_free (cset);
3650 }
3651 #endif /* RE_ENABLE_I18N */
3652 \f
3653 /* Functions for binary tree operation. */
3654
3655 /* Create a node of tree.
3656 Note: This function automatically free left and right if malloc fails. */
3657
3658 static bin_tree_t *
3659 create_tree (left, right, type, index)
3660 bin_tree_t *left;
3661 bin_tree_t *right;
3662 re_token_type_t type;
3663 int index;
3664 {
3665 bin_tree_t *tree;
3666 tree = re_malloc (bin_tree_t, 1);
3667 if (BE (tree == NULL, 0))
3668 {
3669 free_bin_tree (left);
3670 free_bin_tree (right);
3671 return NULL;
3672 }
3673 tree->parent = NULL;
3674 tree->left = left;
3675 tree->right = right;
3676 tree->type = type;
3677 tree->node_idx = index;
3678 tree->first = -1;
3679 tree->next = -1;
3680 re_node_set_init_empty (&tree->eclosure);
3681
3682 if (left != NULL)
3683 left->parent = tree;
3684 if (right != NULL)
3685 right->parent = tree;
3686 return tree;
3687 }
3688
3689 /* Free the sub tree pointed by TREE. */
3690
3691 static void
3692 free_bin_tree (tree)
3693 bin_tree_t *tree;
3694 {
3695 if (tree == NULL)
3696 return;
3697 /*re_node_set_free (&tree->eclosure);*/
3698 free_bin_tree (tree->left);
3699 free_bin_tree (tree->right);
3700 re_free (tree);
3701 }
3702
3703 /* Duplicate the node SRC, and return new node. */
3704
3705 static bin_tree_t *
3706 duplicate_tree (src, dfa)
3707 const bin_tree_t *src;
3708 re_dfa_t *dfa;
3709 {
3710 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3711 int new_node_idx;
3712 /* Since node indies must be according to Post-order of the tree,
3713 we must duplicate the left at first. */
3714 if (src->left != NULL)
3715 {
3716 left = duplicate_tree (src->left, dfa);
3717 if (left == NULL)
3718 return NULL;
3719 }
3720
3721 /* Secondaly, duplicate the right. */
3722 if (src->right != NULL)
3723 {
3724 right = duplicate_tree (src->right, dfa);
3725 if (right == NULL)
3726 {
3727 free_bin_tree (left);
3728 return NULL;
3729 }
3730 }
3731
3732 /* At last, duplicate itself. */
3733 if (src->type == NON_TYPE)
3734 {
3735 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3736 dfa->nodes[new_node_idx].duplicated = 1;
3737 if (BE (new_node_idx == -1, 0))
3738 {
3739 free_bin_tree (left);
3740 free_bin_tree (right);
3741 return NULL;
3742 }
3743 }
3744 else
3745 new_node_idx = src->type;
3746
3747 new_tree = create_tree (left, right, src->type, new_node_idx);
3748 if (BE (new_tree == NULL, 0))
3749 {
3750 free_bin_tree (left);
3751 free_bin_tree (right);
3752 }
3753 return new_tree;
3754 }