]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regcomp.c
Update.
[thirdparty/glibc.git] / posix / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (re_dfa_t *dfa);
37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
42 int top_clone_node, int root_node,
43 unsigned int constraint);
44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
45 unsigned int constraint);
46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
50 int node, int root);
51 static void calc_inveclosure (re_dfa_t *dfa);
52 static int fetch_number (re_string_t *input, re_token_t *token,
53 reg_syntax_t syntax);
54 static void fetch_token (re_token_t *result, re_string_t *input,
55 reg_syntax_t syntax);
56 static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax);
58 static int peek_token_bracket (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax);
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifndef _LIBC
90 # ifdef RE_ENABLE_I18N
91 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
92 re_charset_t *mbcset, int *range_alloc,
93 bracket_elem_t *start_elem,
94 bracket_elem_t *end_elem);
95 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
96 re_charset_t *mbcset,
97 int *coll_sym_alloc,
98 const unsigned char *name);
99 # else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 const unsigned char *name);
105 # endif /* not RE_ENABLE_I18N */
106 #endif /* not _LIBC */
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
109 re_charset_t *mbcset,
110 int *equiv_class_alloc,
111 const unsigned char *name);
112 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
113 re_bitset_ptr_t sbcset,
114 re_charset_t *mbcset,
115 int *char_class_alloc,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
118 #else /* not RE_ENABLE_I18N */
119 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
120 const unsigned char *name);
121 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
122 re_bitset_ptr_t sbcset,
123 const unsigned char *class_name,
124 reg_syntax_t syntax);
125 #endif /* not RE_ENABLE_I18N */
126 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
127 const unsigned char *class_name,
128 const unsigned char *extra, int not,
129 reg_errcode_t *err);
130 static bin_tree_t *create_tree (re_dfa_t *dfa,
131 bin_tree_t *left, bin_tree_t *right,
132 re_token_type_t type, int index);
133 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
134 bin_tree_t *left, bin_tree_t *right,
135 const re_token_t *token)
136 __attribute ((noinline));
137 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
138 \f
139 /* This table gives an error message for each of the error codes listed
140 in regex.h. Obviously the order here has to be same as there.
141 POSIX doesn't require that we do anything for REG_NOERROR,
142 but why not be nice? */
143
144 const char __re_error_msgid[] attribute_hidden =
145 {
146 #define REG_NOERROR_IDX 0
147 gettext_noop ("Success") /* REG_NOERROR */
148 "\0"
149 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
150 gettext_noop ("No match") /* REG_NOMATCH */
151 "\0"
152 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
153 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
154 "\0"
155 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
156 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
157 "\0"
158 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
159 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
160 "\0"
161 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
162 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
163 "\0"
164 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
165 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
166 "\0"
167 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
168 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
169 "\0"
170 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
171 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
172 "\0"
173 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
174 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
175 "\0"
176 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
177 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
178 "\0"
179 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
180 gettext_noop ("Invalid range end") /* REG_ERANGE */
181 "\0"
182 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
183 gettext_noop ("Memory exhausted") /* REG_ESPACE */
184 "\0"
185 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
186 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
187 "\0"
188 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
189 gettext_noop ("Premature end of regular expression") /* REG_EEND */
190 "\0"
191 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
192 gettext_noop ("Regular expression too big") /* REG_ESIZE */
193 "\0"
194 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
195 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
196 };
197
198 const size_t __re_error_msgid_idx[] attribute_hidden =
199 {
200 REG_NOERROR_IDX,
201 REG_NOMATCH_IDX,
202 REG_BADPAT_IDX,
203 REG_ECOLLATE_IDX,
204 REG_ECTYPE_IDX,
205 REG_EESCAPE_IDX,
206 REG_ESUBREG_IDX,
207 REG_EBRACK_IDX,
208 REG_EPAREN_IDX,
209 REG_EBRACE_IDX,
210 REG_BADBR_IDX,
211 REG_ERANGE_IDX,
212 REG_ESPACE_IDX,
213 REG_BADRPT_IDX,
214 REG_EEND_IDX,
215 REG_ESIZE_IDX,
216 REG_ERPAREN_IDX
217 };
218 \f
219 /* Entry points for GNU code. */
220
221 /* re_compile_pattern is the GNU regular expression compiler: it
222 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
223 Returns 0 if the pattern was valid, otherwise an error string.
224
225 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
226 are set in BUFP on entry. */
227
228 const char *
229 re_compile_pattern (pattern, length, bufp)
230 const char *pattern;
231 size_t length;
232 struct re_pattern_buffer *bufp;
233 {
234 reg_errcode_t ret;
235
236 /* And GNU code determines whether or not to get register information
237 by passing null for the REGS argument to re_match, etc., not by
238 setting no_sub. */
239 bufp->no_sub = 0;
240
241 /* Match anchors at newline. */
242 bufp->newline_anchor = 1;
243
244 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
245
246 if (!ret)
247 return NULL;
248 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
249 }
250 #ifdef _LIBC
251 weak_alias (__re_compile_pattern, re_compile_pattern)
252 #endif
253
254 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
255 also be assigned to arbitrarily: each pattern buffer stores its own
256 syntax, so it can be changed between regex compilations. */
257 /* This has no initializer because initialized variables in Emacs
258 become read-only after dumping. */
259 reg_syntax_t re_syntax_options;
260
261
262 /* Specify the precise syntax of regexps for compilation. This provides
263 for compatibility for various utilities which historically have
264 different, incompatible syntaxes.
265
266 The argument SYNTAX is a bit mask comprised of the various bits
267 defined in regex.h. We return the old syntax. */
268
269 reg_syntax_t
270 re_set_syntax (syntax)
271 reg_syntax_t syntax;
272 {
273 reg_syntax_t ret = re_syntax_options;
274
275 re_syntax_options = syntax;
276 return ret;
277 }
278 #ifdef _LIBC
279 weak_alias (__re_set_syntax, re_set_syntax)
280 #endif
281
282 int
283 re_compile_fastmap (bufp)
284 struct re_pattern_buffer *bufp;
285 {
286 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
287 char *fastmap = bufp->fastmap;
288
289 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
290 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
291 if (dfa->init_state != dfa->init_state_word)
292 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
293 if (dfa->init_state != dfa->init_state_nl)
294 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
295 if (dfa->init_state != dfa->init_state_begbuf)
296 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
297 bufp->fastmap_accurate = 1;
298 return 0;
299 }
300 #ifdef _LIBC
301 weak_alias (__re_compile_fastmap, re_compile_fastmap)
302 #endif
303
304 static inline void
305 __attribute ((always_inline))
306 re_set_fastmap (char *fastmap, int icase, int ch)
307 {
308 fastmap[ch] = 1;
309 if (icase)
310 fastmap[tolower (ch)] = 1;
311 }
312
313 /* Helper function for re_compile_fastmap.
314 Compile fastmap for the initial_state INIT_STATE. */
315
316 static void
317 re_compile_fastmap_iter (bufp, init_state, fastmap)
318 regex_t *bufp;
319 const re_dfastate_t *init_state;
320 char *fastmap;
321 {
322 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
323 int node_cnt;
324 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
325 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
326 {
327 int node = init_state->nodes.elems[node_cnt];
328 re_token_type_t type = dfa->nodes[node].type;
329
330 if (type == CHARACTER)
331 {
332 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
333 #ifdef RE_ENABLE_I18N
334 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
335 {
336 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
337 wchar_t wc;
338 mbstate_t state;
339
340 p = buf;
341 *p++ = dfa->nodes[node].opr.c;
342 while (++node < dfa->nodes_len
343 && dfa->nodes[node].type == CHARACTER
344 && dfa->nodes[node].mb_partial)
345 *p++ = dfa->nodes[node].opr.c;
346 memset (&state, 0, sizeof (state));
347 if (mbrtowc (&wc, (const char *) buf, p - buf,
348 &state) == p - buf
349 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
350 re_set_fastmap (fastmap, 0, buf[0]);
351 }
352 #endif
353 }
354 else if (type == SIMPLE_BRACKET)
355 {
356 int i, j, ch;
357 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
358 for (j = 0; j < UINT_BITS; ++j, ++ch)
359 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
360 re_set_fastmap (fastmap, icase, ch);
361 }
362 #ifdef RE_ENABLE_I18N
363 else if (type == COMPLEX_BRACKET)
364 {
365 int i;
366 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
367 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
368 || cset->nranges || cset->nchar_classes)
369 {
370 # ifdef _LIBC
371 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
372 {
373 /* In this case we want to catch the bytes which are
374 the first byte of any collation elements.
375 e.g. In da_DK, we want to catch 'a' since "aa"
376 is a valid collation element, and don't catch
377 'b' since 'b' is the only collation element
378 which starts from 'b'. */
379 int j, ch;
380 const int32_t *table = (const int32_t *)
381 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
382 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
383 for (j = 0; j < UINT_BITS; ++j, ++ch)
384 if (table[ch] < 0)
385 re_set_fastmap (fastmap, icase, ch);
386 }
387 # else
388 if (dfa->mb_cur_max > 1)
389 for (i = 0; i < SBC_MAX; ++i)
390 if (__btowc (i) == WEOF)
391 re_set_fastmap (fastmap, icase, i);
392 # endif /* not _LIBC */
393 }
394 for (i = 0; i < cset->nmbchars; ++i)
395 {
396 char buf[256];
397 mbstate_t state;
398 memset (&state, '\0', sizeof (state));
399 __wcrtomb (buf, cset->mbchars[i], &state);
400 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
401 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
402 {
403 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
404 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
405 }
406 }
407 }
408 #endif /* RE_ENABLE_I18N */
409 else if (type == END_OF_RE || type == OP_PERIOD
410 || type == OP_UTF8_PERIOD)
411 {
412 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
413 if (type == END_OF_RE)
414 bufp->can_be_null = 1;
415 return;
416 }
417 }
418 }
419 \f
420 /* Entry point for POSIX code. */
421 /* regcomp takes a regular expression as a string and compiles it.
422
423 PREG is a regex_t *. We do not expect any fields to be initialized,
424 since POSIX says we shouldn't. Thus, we set
425
426 `buffer' to the compiled pattern;
427 `used' to the length of the compiled pattern;
428 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
429 REG_EXTENDED bit in CFLAGS is set; otherwise, to
430 RE_SYNTAX_POSIX_BASIC;
431 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
432 `fastmap' to an allocated space for the fastmap;
433 `fastmap_accurate' to zero;
434 `re_nsub' to the number of subexpressions in PATTERN.
435
436 PATTERN is the address of the pattern string.
437
438 CFLAGS is a series of bits which affect compilation.
439
440 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
441 use POSIX basic syntax.
442
443 If REG_NEWLINE is set, then . and [^...] don't match newline.
444 Also, regexec will try a match beginning after every newline.
445
446 If REG_ICASE is set, then we considers upper- and lowercase
447 versions of letters to be equivalent when matching.
448
449 If REG_NOSUB is set, then when PREG is passed to regexec, that
450 routine will report only success or failure, and nothing about the
451 registers.
452
453 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
454 the return codes and their meanings.) */
455
456 int
457 regcomp (preg, pattern, cflags)
458 regex_t *__restrict preg;
459 const char *__restrict pattern;
460 int cflags;
461 {
462 reg_errcode_t ret;
463 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
464 : RE_SYNTAX_POSIX_BASIC);
465
466 preg->buffer = NULL;
467 preg->allocated = 0;
468 preg->used = 0;
469
470 /* Try to allocate space for the fastmap. */
471 preg->fastmap = re_malloc (char, SBC_MAX);
472 if (BE (preg->fastmap == NULL, 0))
473 return REG_ESPACE;
474
475 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
476
477 /* If REG_NEWLINE is set, newlines are treated differently. */
478 if (cflags & REG_NEWLINE)
479 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
480 syntax &= ~RE_DOT_NEWLINE;
481 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
482 /* It also changes the matching behavior. */
483 preg->newline_anchor = 1;
484 }
485 else
486 preg->newline_anchor = 0;
487 preg->no_sub = !!(cflags & REG_NOSUB);
488 preg->translate = NULL;
489
490 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
491
492 /* POSIX doesn't distinguish between an unmatched open-group and an
493 unmatched close-group: both are REG_EPAREN. */
494 if (ret == REG_ERPAREN)
495 ret = REG_EPAREN;
496
497 /* We have already checked preg->fastmap != NULL. */
498 if (BE (ret == REG_NOERROR, 1))
499 /* Compute the fastmap now, since regexec cannot modify the pattern
500 buffer. This function nevers fails in this implementation. */
501 (void) re_compile_fastmap (preg);
502 else
503 {
504 /* Some error occurred while compiling the expression. */
505 re_free (preg->fastmap);
506 preg->fastmap = NULL;
507 }
508
509 return (int) ret;
510 }
511 #ifdef _LIBC
512 weak_alias (__regcomp, regcomp)
513 #endif
514
515 /* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
517
518 size_t
519 regerror (errcode, preg, errbuf, errbuf_size)
520 int errcode;
521 const regex_t *preg;
522 char *errbuf;
523 size_t errbuf_size;
524 {
525 const char *msg;
526 size_t msg_size;
527
528 if (BE (errcode < 0
529 || errcode >= (int) (sizeof (__re_error_msgid_idx)
530 / sizeof (__re_error_msgid_idx[0])), 0))
531 /* Only error codes returned by the rest of the code should be passed
532 to this routine. If we are given anything else, or if other regex
533 code generates an invalid error code, then the program has a bug.
534 Dump core so we can fix it. */
535 abort ();
536
537 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
538
539 msg_size = strlen (msg) + 1; /* Includes the null. */
540
541 if (BE (errbuf_size != 0, 1))
542 {
543 if (BE (msg_size > errbuf_size, 0))
544 {
545 #if defined HAVE_MEMPCPY || defined _LIBC
546 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
547 #else
548 memcpy (errbuf, msg, errbuf_size - 1);
549 errbuf[errbuf_size - 1] = 0;
550 #endif
551 }
552 else
553 memcpy (errbuf, msg, msg_size);
554 }
555
556 return msg_size;
557 }
558 #ifdef _LIBC
559 weak_alias (__regerror, regerror)
560 #endif
561
562
563 static void
564 free_dfa_content (re_dfa_t *dfa)
565 {
566 int i, j;
567
568 re_free (dfa->subexps);
569
570 if (dfa->nodes)
571 for (i = 0; i < dfa->nodes_len; ++i)
572 {
573 re_token_t *node = dfa->nodes + i;
574 #ifdef RE_ENABLE_I18N
575 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
576 free_charset (node->opr.mbcset);
577 else
578 #endif /* RE_ENABLE_I18N */
579 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
580 re_free (node->opr.sbcset);
581 }
582 re_free (dfa->nexts);
583 for (i = 0; i < dfa->nodes_len; ++i)
584 {
585 if (dfa->eclosures != NULL)
586 re_node_set_free (dfa->eclosures + i);
587 if (dfa->inveclosures != NULL)
588 re_node_set_free (dfa->inveclosures + i);
589 if (dfa->edests != NULL)
590 re_node_set_free (dfa->edests + i);
591 }
592 re_free (dfa->edests);
593 re_free (dfa->eclosures);
594 re_free (dfa->inveclosures);
595 re_free (dfa->nodes);
596
597 if (dfa->state_table)
598 for (i = 0; i <= dfa->state_hash_mask; ++i)
599 {
600 struct re_state_table_entry *entry = dfa->state_table + i;
601 for (j = 0; j < entry->num; ++j)
602 {
603 re_dfastate_t *state = entry->array[j];
604 free_state (state);
605 }
606 re_free (entry->array);
607 }
608 re_free (dfa->state_table);
609 re_free (dfa->word_char);
610 #ifdef RE_ENABLE_I18N
611 re_free (dfa->sb_char);
612 #endif
613 #ifdef DEBUG
614 re_free (dfa->re_str);
615 #endif
616
617 re_free (dfa);
618 }
619
620
621 /* Free dynamically allocated space used by PREG. */
622
623 void
624 regfree (preg)
625 regex_t *preg;
626 {
627 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
628 if (BE (dfa != NULL, 1))
629 free_dfa_content (dfa);
630
631 re_free (preg->fastmap);
632 }
633 #ifdef _LIBC
634 weak_alias (__regfree, regfree)
635 #endif
636 \f
637 /* Entry points compatible with 4.2 BSD regex library. We don't define
638 them unless specifically requested. */
639
640 #if defined _REGEX_RE_COMP || defined _LIBC
641
642 /* BSD has one and only one pattern buffer. */
643 static struct re_pattern_buffer re_comp_buf;
644
645 char *
646 # ifdef _LIBC
647 /* Make these definitions weak in libc, so POSIX programs can redefine
648 these names if they don't use our functions, and still use
649 regcomp/regexec above without link errors. */
650 weak_function
651 # endif
652 re_comp (s)
653 const char *s;
654 {
655 reg_errcode_t ret;
656 char *fastmap;
657
658 if (!s)
659 {
660 if (!re_comp_buf.buffer)
661 return gettext ("No previous regular expression");
662 return 0;
663 }
664
665 if (re_comp_buf.buffer)
666 {
667 fastmap = re_comp_buf.fastmap;
668 re_comp_buf.fastmap = NULL;
669 __regfree (&re_comp_buf);
670 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
671 re_comp_buf.fastmap = fastmap;
672 }
673
674 if (re_comp_buf.fastmap == NULL)
675 {
676 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
677 if (re_comp_buf.fastmap == NULL)
678 return (char *) gettext (__re_error_msgid
679 + __re_error_msgid_idx[(int) REG_ESPACE]);
680 }
681
682 /* Since `re_exec' always passes NULL for the `regs' argument, we
683 don't need to initialize the pattern buffer fields which affect it. */
684
685 /* Match anchors at newlines. */
686 re_comp_buf.newline_anchor = 1;
687
688 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
689
690 if (!ret)
691 return NULL;
692
693 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
694 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
695 }
696
697 #ifdef _LIBC
698 libc_freeres_fn (free_mem)
699 {
700 __regfree (&re_comp_buf);
701 }
702 #endif
703
704 #endif /* _REGEX_RE_COMP */
705 \f
706 /* Internal entry point.
707 Compile the regular expression PATTERN, whose length is LENGTH.
708 SYNTAX indicate regular expression's syntax. */
709
710 static reg_errcode_t
711 re_compile_internal (preg, pattern, length, syntax)
712 regex_t *preg;
713 const char * pattern;
714 int length;
715 reg_syntax_t syntax;
716 {
717 reg_errcode_t err = REG_NOERROR;
718 re_dfa_t *dfa;
719 re_string_t regexp;
720
721 /* Initialize the pattern buffer. */
722 preg->fastmap_accurate = 0;
723 preg->syntax = syntax;
724 preg->not_bol = preg->not_eol = 0;
725 preg->used = 0;
726 preg->re_nsub = 0;
727 preg->can_be_null = 0;
728 preg->regs_allocated = REGS_UNALLOCATED;
729
730 /* Initialize the dfa. */
731 dfa = (re_dfa_t *) preg->buffer;
732 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
733 {
734 /* If zero allocated, but buffer is non-null, try to realloc
735 enough space. This loses if buffer's address is bogus, but
736 that is the user's responsibility. If ->buffer is NULL this
737 is a simple allocation. */
738 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
739 if (dfa == NULL)
740 return REG_ESPACE;
741 preg->allocated = sizeof (re_dfa_t);
742 preg->buffer = (unsigned char *) dfa;
743 }
744 preg->used = sizeof (re_dfa_t);
745
746 err = init_dfa (dfa, length);
747 if (BE (err != REG_NOERROR, 0))
748 {
749 free_dfa_content (dfa);
750 preg->buffer = NULL;
751 preg->allocated = 0;
752 return err;
753 }
754 #ifdef DEBUG
755 dfa->re_str = re_malloc (char, length + 1);
756 strncpy (dfa->re_str, pattern, length + 1);
757 #endif
758
759 err = re_string_construct (&regexp, pattern, length, preg->translate,
760 syntax & RE_ICASE, dfa);
761 if (BE (err != REG_NOERROR, 0))
762 {
763 re_compile_internal_free_return:
764 free_workarea_compile (preg);
765 re_string_destruct (&regexp);
766 free_dfa_content (dfa);
767 preg->buffer = NULL;
768 preg->allocated = 0;
769 return err;
770 }
771
772 /* Parse the regular expression, and build a structure tree. */
773 preg->re_nsub = 0;
774 dfa->str_tree = parse (&regexp, preg, syntax, &err);
775 if (BE (dfa->str_tree == NULL, 0))
776 goto re_compile_internal_free_return;
777
778 #ifdef RE_ENABLE_I18N
779 /* If possible, do searching in single byte encoding to speed things up. */
780 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
781 optimize_utf8 (dfa);
782 #endif
783
784 /* Analyze the tree and collect information which is necessary to
785 create the dfa. */
786 err = analyze (dfa);
787 if (BE (err != REG_NOERROR, 0))
788 goto re_compile_internal_free_return;
789
790 /* Then create the initial state of the dfa. */
791 err = create_initial_state (dfa);
792
793 /* Release work areas. */
794 free_workarea_compile (preg);
795 re_string_destruct (&regexp);
796
797 if (BE (err != REG_NOERROR, 0))
798 {
799 free_dfa_content (dfa);
800 preg->buffer = NULL;
801 preg->allocated = 0;
802 }
803
804 return err;
805 }
806
807 /* Initialize DFA. We use the length of the regular expression PAT_LEN
808 as the initial length of some arrays. */
809
810 static reg_errcode_t
811 init_dfa (dfa, pat_len)
812 re_dfa_t *dfa;
813 int pat_len;
814 {
815 int table_size;
816
817 memset (dfa, '\0', sizeof (re_dfa_t));
818
819 /* Force allocation of str_tree_storage the first time. */
820 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
821
822 dfa->nodes_alloc = pat_len + 1;
823 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
824
825 dfa->states_alloc = pat_len + 1;
826
827 /* table_size = 2 ^ ceil(log pat_len) */
828 for (table_size = 1; table_size > 0; table_size <<= 1)
829 if (table_size > pat_len)
830 break;
831
832 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
833 dfa->state_hash_mask = table_size - 1;
834
835 dfa->subexps_alloc = 1;
836 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
837 /* dfa->word_char = NULL; */
838
839 dfa->mb_cur_max = MB_CUR_MAX;
840 #ifdef _LIBC
841 if (dfa->mb_cur_max > 1
842 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
843 dfa->is_utf8 = 1;
844 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
845 != 0);
846 #endif
847 #ifdef RE_ENABLE_I18N
848 if (dfa->mb_cur_max > 1)
849 {
850 int i, j, ch;
851
852 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
853 if (BE (dfa->sb_char == NULL, 0))
854 return REG_ESPACE;
855 #ifdef _LIBC
856 if (dfa->is_utf8)
857 memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
858 else
859 #endif
860 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
861 for (j = 0; j < UINT_BITS; ++j, ++ch)
862 if (btowc (ch) != WEOF)
863 dfa->sb_char[i] |= 1 << j;
864 }
865 #endif
866
867 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
868 || dfa->subexps == NULL, 0))
869 return REG_ESPACE;
870 return REG_NOERROR;
871 }
872
873 /* Initialize WORD_CHAR table, which indicate which character is
874 "word". In this case "word" means that it is the word construction
875 character used by some operators like "\<", "\>", etc. */
876
877 static reg_errcode_t
878 init_word_char (dfa)
879 re_dfa_t *dfa;
880 {
881 int i, j, ch;
882 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
883 if (BE (dfa->word_char == NULL, 0))
884 return REG_ESPACE;
885 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
886 for (j = 0; j < UINT_BITS; ++j, ++ch)
887 if (isalnum (ch) || ch == '_')
888 dfa->word_char[i] |= 1 << j;
889 return REG_NOERROR;
890 }
891
892 /* Free the work area which are only used while compiling. */
893
894 static void
895 free_workarea_compile (preg)
896 regex_t *preg;
897 {
898 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
899 bin_tree_storage_t *storage, *next;
900 for (storage = dfa->str_tree_storage; storage; storage = next)
901 {
902 next = storage->next;
903 re_free (storage);
904 }
905 dfa->str_tree_storage = NULL;
906 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
907 dfa->str_tree = NULL;
908 re_free (dfa->org_indices);
909 dfa->org_indices = NULL;
910 }
911
912 /* Create initial states for all contexts. */
913
914 static reg_errcode_t
915 create_initial_state (dfa)
916 re_dfa_t *dfa;
917 {
918 int first, i;
919 reg_errcode_t err;
920 re_node_set init_nodes;
921
922 /* Initial states have the epsilon closure of the node which is
923 the first node of the regular expression. */
924 first = dfa->str_tree->first;
925 dfa->init_node = first;
926 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
927 if (BE (err != REG_NOERROR, 0))
928 return err;
929
930 /* The back-references which are in initial states can epsilon transit,
931 since in this case all of the subexpressions can be null.
932 Then we add epsilon closures of the nodes which are the next nodes of
933 the back-references. */
934 if (dfa->nbackref > 0)
935 for (i = 0; i < init_nodes.nelem; ++i)
936 {
937 int node_idx = init_nodes.elems[i];
938 re_token_type_t type = dfa->nodes[node_idx].type;
939
940 int clexp_idx;
941 if (type != OP_BACK_REF)
942 continue;
943 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
944 {
945 re_token_t *clexp_node;
946 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
947 if (clexp_node->type == OP_CLOSE_SUBEXP
948 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
949 break;
950 }
951 if (clexp_idx == init_nodes.nelem)
952 continue;
953
954 if (type == OP_BACK_REF)
955 {
956 int dest_idx = dfa->edests[node_idx].elems[0];
957 if (!re_node_set_contains (&init_nodes, dest_idx))
958 {
959 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
960 i = 0;
961 }
962 }
963 }
964
965 /* It must be the first time to invoke acquire_state. */
966 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
967 /* We don't check ERR here, since the initial state must not be NULL. */
968 if (BE (dfa->init_state == NULL, 0))
969 return err;
970 if (dfa->init_state->has_constraint)
971 {
972 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
973 CONTEXT_WORD);
974 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
975 CONTEXT_NEWLINE);
976 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
977 &init_nodes,
978 CONTEXT_NEWLINE
979 | CONTEXT_BEGBUF);
980 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
981 || dfa->init_state_begbuf == NULL, 0))
982 return err;
983 }
984 else
985 dfa->init_state_word = dfa->init_state_nl
986 = dfa->init_state_begbuf = dfa->init_state;
987
988 re_node_set_free (&init_nodes);
989 return REG_NOERROR;
990 }
991 \f
992 #ifdef RE_ENABLE_I18N
993 /* If it is possible to do searching in single byte encoding instead of UTF-8
994 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
995 DFA nodes where needed. */
996
997 static void
998 optimize_utf8 (dfa)
999 re_dfa_t *dfa;
1000 {
1001 int node, i, mb_chars = 0, has_period = 0;
1002
1003 for (node = 0; node < dfa->nodes_len; ++node)
1004 switch (dfa->nodes[node].type)
1005 {
1006 case CHARACTER:
1007 if (dfa->nodes[node].opr.c >= 0x80)
1008 mb_chars = 1;
1009 break;
1010 case ANCHOR:
1011 switch (dfa->nodes[node].opr.idx)
1012 {
1013 case LINE_FIRST:
1014 case LINE_LAST:
1015 case BUF_FIRST:
1016 case BUF_LAST:
1017 break;
1018 default:
1019 /* Word anchors etc. cannot be handled. */
1020 return;
1021 }
1022 break;
1023 case OP_PERIOD:
1024 has_period = 1;
1025 break;
1026 case OP_BACK_REF:
1027 case OP_ALT:
1028 case END_OF_RE:
1029 case OP_DUP_ASTERISK:
1030 case OP_DUP_QUESTION:
1031 case OP_DUP_PLUS:
1032 case OP_OPEN_SUBEXP:
1033 case OP_CLOSE_SUBEXP:
1034 break;
1035 case SIMPLE_BRACKET:
1036 /* Just double check. */
1037 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1038 if (dfa->nodes[node].opr.sbcset[i])
1039 return;
1040 break;
1041 default:
1042 return;
1043 }
1044
1045 if (mb_chars || has_period)
1046 for (node = 0; node < dfa->nodes_len; ++node)
1047 {
1048 if (dfa->nodes[node].type == CHARACTER
1049 && dfa->nodes[node].opr.c >= 0x80)
1050 dfa->nodes[node].mb_partial = 0;
1051 else if (dfa->nodes[node].type == OP_PERIOD)
1052 dfa->nodes[node].type = OP_UTF8_PERIOD;
1053 }
1054
1055 /* The search can be in single byte locale. */
1056 dfa->mb_cur_max = 1;
1057 dfa->is_utf8 = 0;
1058 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1059 }
1060 #endif
1061 \f
1062 /* Analyze the structure tree, and calculate "first", "next", "edest",
1063 "eclosure", and "inveclosure". */
1064
1065 static reg_errcode_t
1066 analyze (dfa)
1067 re_dfa_t *dfa;
1068 {
1069 int i;
1070 reg_errcode_t ret;
1071
1072 /* Allocate arrays. */
1073 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1074 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1075 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1076 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1077 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1078 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1079 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1080 return REG_ESPACE;
1081 /* Initialize them. */
1082 for (i = 0; i < dfa->nodes_len; ++i)
1083 {
1084 dfa->nexts[i] = -1;
1085 re_node_set_init_empty (dfa->edests + i);
1086 re_node_set_init_empty (dfa->eclosures + i);
1087 re_node_set_init_empty (dfa->inveclosures + i);
1088 }
1089
1090 ret = analyze_tree (dfa, dfa->str_tree);
1091 if (BE (ret == REG_NOERROR, 1))
1092 {
1093 ret = calc_eclosure (dfa);
1094 if (ret == REG_NOERROR)
1095 calc_inveclosure (dfa);
1096 }
1097 return ret;
1098 }
1099
1100 /* Helper functions for analyze.
1101 This function calculate "first", "next", and "edest" for the subtree
1102 whose root is NODE. */
1103
1104 static reg_errcode_t
1105 analyze_tree (dfa, node)
1106 re_dfa_t *dfa;
1107 bin_tree_t *node;
1108 {
1109 reg_errcode_t ret;
1110 if (node->first == -1)
1111 calc_first (dfa, node);
1112 if (node->next == -1)
1113 calc_next (dfa, node);
1114 if (node->eclosure.nelem == 0)
1115 calc_epsdest (dfa, node);
1116 /* Calculate "first" etc. for the left child. */
1117 if (node->left != NULL)
1118 {
1119 ret = analyze_tree (dfa, node->left);
1120 if (BE (ret != REG_NOERROR, 0))
1121 return ret;
1122 }
1123 /* Calculate "first" etc. for the right child. */
1124 if (node->right != NULL)
1125 {
1126 ret = analyze_tree (dfa, node->right);
1127 if (BE (ret != REG_NOERROR, 0))
1128 return ret;
1129 }
1130 return REG_NOERROR;
1131 }
1132
1133 /* Calculate "first" for the node NODE. */
1134 static void
1135 calc_first (dfa, node)
1136 re_dfa_t *dfa;
1137 bin_tree_t *node;
1138 {
1139 int idx, type;
1140 idx = node->node_idx;
1141 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1142
1143 switch (type)
1144 {
1145 #ifdef DEBUG
1146 case OP_OPEN_BRACKET:
1147 case OP_CLOSE_BRACKET:
1148 case OP_OPEN_DUP_NUM:
1149 case OP_CLOSE_DUP_NUM:
1150 case OP_NON_MATCH_LIST:
1151 case OP_OPEN_COLL_ELEM:
1152 case OP_CLOSE_COLL_ELEM:
1153 case OP_OPEN_EQUIV_CLASS:
1154 case OP_CLOSE_EQUIV_CLASS:
1155 case OP_OPEN_CHAR_CLASS:
1156 case OP_CLOSE_CHAR_CLASS:
1157 /* These must not be appeared here. */
1158 assert (0);
1159 #endif
1160 case END_OF_RE:
1161 case CHARACTER:
1162 case OP_PERIOD:
1163 case OP_DUP_ASTERISK:
1164 case OP_DUP_QUESTION:
1165 #ifdef RE_ENABLE_I18N
1166 case OP_UTF8_PERIOD:
1167 case COMPLEX_BRACKET:
1168 #endif /* RE_ENABLE_I18N */
1169 case SIMPLE_BRACKET:
1170 case OP_BACK_REF:
1171 case ANCHOR:
1172 case OP_OPEN_SUBEXP:
1173 case OP_CLOSE_SUBEXP:
1174 node->first = idx;
1175 break;
1176 case OP_DUP_PLUS:
1177 #ifdef DEBUG
1178 assert (node->left != NULL);
1179 #endif
1180 if (node->left->first == -1)
1181 calc_first (dfa, node->left);
1182 node->first = node->left->first;
1183 break;
1184 case OP_ALT:
1185 node->first = idx;
1186 break;
1187 /* else fall through */
1188 default:
1189 #ifdef DEBUG
1190 assert (node->left != NULL);
1191 #endif
1192 if (node->left->first == -1)
1193 calc_first (dfa, node->left);
1194 node->first = node->left->first;
1195 break;
1196 }
1197 }
1198
1199 /* Calculate "next" for the node NODE. */
1200
1201 static void
1202 calc_next (dfa, node)
1203 re_dfa_t *dfa;
1204 bin_tree_t *node;
1205 {
1206 int idx, type;
1207 bin_tree_t *parent = node->parent;
1208 if (parent == NULL)
1209 {
1210 node->next = -1;
1211 idx = node->node_idx;
1212 if (node->type == 0)
1213 dfa->nexts[idx] = node->next;
1214 return;
1215 }
1216
1217 idx = parent->node_idx;
1218 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1219
1220 switch (type)
1221 {
1222 case OP_DUP_ASTERISK:
1223 case OP_DUP_PLUS:
1224 node->next = idx;
1225 break;
1226 case CONCAT:
1227 if (parent->left == node)
1228 {
1229 if (parent->right->first == -1)
1230 calc_first (dfa, parent->right);
1231 node->next = parent->right->first;
1232 break;
1233 }
1234 /* else fall through */
1235 default:
1236 if (parent->next == -1)
1237 calc_next (dfa, parent);
1238 node->next = parent->next;
1239 break;
1240 }
1241 idx = node->node_idx;
1242 if (node->type == 0)
1243 dfa->nexts[idx] = node->next;
1244 }
1245
1246 /* Calculate "edest" for the node NODE. */
1247
1248 static void
1249 calc_epsdest (dfa, node)
1250 re_dfa_t *dfa;
1251 bin_tree_t *node;
1252 {
1253 int idx;
1254 idx = node->node_idx;
1255 if (node->type == 0)
1256 {
1257 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1258 || dfa->nodes[idx].type == OP_DUP_PLUS
1259 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1260 {
1261 if (node->left->first == -1)
1262 calc_first (dfa, node->left);
1263 if (node->next == -1)
1264 calc_next (dfa, node);
1265 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1266 node->next);
1267 }
1268 else if (dfa->nodes[idx].type == OP_ALT)
1269 {
1270 int left, right;
1271 if (node->left != NULL)
1272 {
1273 if (node->left->first == -1)
1274 calc_first (dfa, node->left);
1275 left = node->left->first;
1276 }
1277 else
1278 {
1279 if (node->next == -1)
1280 calc_next (dfa, node);
1281 left = node->next;
1282 }
1283 if (node->right != NULL)
1284 {
1285 if (node->right->first == -1)
1286 calc_first (dfa, node->right);
1287 right = node->right->first;
1288 }
1289 else
1290 {
1291 if (node->next == -1)
1292 calc_next (dfa, node);
1293 right = node->next;
1294 }
1295 re_node_set_init_2 (dfa->edests + idx, left, right);
1296 }
1297 else if (dfa->nodes[idx].type == ANCHOR
1298 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1299 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1300 || dfa->nodes[idx].type == OP_BACK_REF)
1301 re_node_set_init_1 (dfa->edests + idx, node->next);
1302 else
1303 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1304 }
1305 }
1306
1307 /* Duplicate the epsilon closure of the node ROOT_NODE.
1308 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1309 to their own constraint. */
1310
1311 static reg_errcode_t
1312 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1313 init_constraint)
1314 re_dfa_t *dfa;
1315 int top_org_node, top_clone_node, root_node;
1316 unsigned int init_constraint;
1317 {
1318 reg_errcode_t err;
1319 int org_node, clone_node, ret;
1320 unsigned int constraint = init_constraint;
1321 for (org_node = top_org_node, clone_node = top_clone_node;;)
1322 {
1323 int org_dest, clone_dest;
1324 if (dfa->nodes[org_node].type == OP_BACK_REF)
1325 {
1326 /* If the back reference epsilon-transit, its destination must
1327 also have the constraint. Then duplicate the epsilon closure
1328 of the destination of the back reference, and store it in
1329 edests of the back reference. */
1330 org_dest = dfa->nexts[org_node];
1331 re_node_set_empty (dfa->edests + clone_node);
1332 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1333 if (BE (err != REG_NOERROR, 0))
1334 return err;
1335 dfa->nexts[clone_node] = dfa->nexts[org_node];
1336 if (clone_dest == -1)
1337 break;
1338 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1339 if (BE (ret < 0, 0))
1340 return REG_ESPACE;
1341 }
1342 else if (dfa->edests[org_node].nelem == 0)
1343 {
1344 /* In case of the node can't epsilon-transit, don't duplicate the
1345 destination and store the original destination as the
1346 destination of the node. */
1347 dfa->nexts[clone_node] = dfa->nexts[org_node];
1348 break;
1349 }
1350 else if (dfa->edests[org_node].nelem == 1)
1351 {
1352 /* In case of the node can epsilon-transit, and it has only one
1353 destination. */
1354 org_dest = dfa->edests[org_node].elems[0];
1355 re_node_set_empty (dfa->edests + clone_node);
1356 if (dfa->nodes[org_node].type == ANCHOR)
1357 {
1358 /* In case of the node has another constraint, append it. */
1359 if (org_node == root_node && clone_node != org_node)
1360 {
1361 /* ...but if the node is root_node itself, it means the
1362 epsilon closure have a loop, then tie it to the
1363 destination of the root_node. */
1364 ret = re_node_set_insert (dfa->edests + clone_node,
1365 org_dest);
1366 if (BE (ret < 0, 0))
1367 return REG_ESPACE;
1368 break;
1369 }
1370 constraint |= dfa->nodes[org_node].opr.ctx_type;
1371 }
1372 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1373 if (BE (err != REG_NOERROR, 0))
1374 return err;
1375 if (clone_dest == -1)
1376 break;
1377 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1378 if (BE (ret < 0, 0))
1379 return REG_ESPACE;
1380 }
1381 else /* dfa->edests[org_node].nelem == 2 */
1382 {
1383 /* In case of the node can epsilon-transit, and it has two
1384 destinations. E.g. '|', '*', '+', '?'. */
1385 org_dest = dfa->edests[org_node].elems[0];
1386 re_node_set_empty (dfa->edests + clone_node);
1387 /* Search for a duplicated node which satisfies the constraint. */
1388 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1389 if (clone_dest == -1)
1390 {
1391 /* There are no such a duplicated node, create a new one. */
1392 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1393 if (BE (err != REG_NOERROR, 0))
1394 return err;
1395 if (clone_dest != -1)
1396 {
1397 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1398 if (BE (ret < 0, 0))
1399 return REG_ESPACE;
1400 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1401 root_node, constraint);
1402 if (BE (err != REG_NOERROR, 0))
1403 return err;
1404 }
1405 }
1406 else
1407 {
1408 /* There are a duplicated node which satisfy the constraint,
1409 use it to avoid infinite loop. */
1410 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1411 if (BE (ret < 0, 0))
1412 return REG_ESPACE;
1413 }
1414
1415 org_dest = dfa->edests[org_node].elems[1];
1416 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1417 if (BE (err != REG_NOERROR, 0))
1418 return err;
1419 if (clone_dest == -1)
1420 break;
1421 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1422 if (BE (ret < 0, 0))
1423 return REG_ESPACE;
1424 }
1425 org_node = org_dest;
1426 clone_node = clone_dest;
1427 }
1428 return REG_NOERROR;
1429 }
1430
1431 /* Search for a node which is duplicated from the node ORG_NODE, and
1432 satisfies the constraint CONSTRAINT. */
1433
1434 static int
1435 search_duplicated_node (dfa, org_node, constraint)
1436 re_dfa_t *dfa;
1437 int org_node;
1438 unsigned int constraint;
1439 {
1440 int idx;
1441 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1442 {
1443 if (org_node == dfa->org_indices[idx]
1444 && constraint == dfa->nodes[idx].constraint)
1445 return idx; /* Found. */
1446 }
1447 return -1; /* Not found. */
1448 }
1449
1450 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1451 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1452 otherwise return the error code. */
1453
1454 static reg_errcode_t
1455 duplicate_node (new_idx, dfa, org_idx, constraint)
1456 re_dfa_t *dfa;
1457 int *new_idx, org_idx;
1458 unsigned int constraint;
1459 {
1460 int dup_idx;
1461
1462 if (dfa->nodes[org_idx].type == CHARACTER
1463 && (((constraint & NEXT_WORD_CONSTRAINT)
1464 && !dfa->nodes[org_idx].word_char)
1465 || ((constraint & NEXT_NOTWORD_CONSTRAINT)
1466 && dfa->nodes[org_idx].word_char)))
1467 {
1468 /* \<!, \>W etc. can never match. Don't duplicate them, instead
1469 tell the caller they shouldn't be added to edests. */
1470 *new_idx = -1;
1471 return REG_NOERROR;
1472 }
1473
1474 dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1475 if (BE (dup_idx == -1, 0))
1476 return REG_ESPACE;
1477 dfa->nodes[dup_idx].constraint = constraint;
1478 if (dfa->nodes[org_idx].type == ANCHOR)
1479 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1480 dfa->nodes[dup_idx].duplicated = 1;
1481 re_node_set_init_empty (dfa->edests + dup_idx);
1482 re_node_set_init_empty (dfa->eclosures + dup_idx);
1483 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1484
1485 /* Store the index of the original node. */
1486 dfa->org_indices[dup_idx] = org_idx;
1487 *new_idx = dup_idx;
1488 return REG_NOERROR;
1489 }
1490
1491 static void
1492 calc_inveclosure (dfa)
1493 re_dfa_t *dfa;
1494 {
1495 int src, idx, dest;
1496 for (src = 0; src < dfa->nodes_len; ++src)
1497 {
1498 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1499 {
1500 dest = dfa->eclosures[src].elems[idx];
1501 re_node_set_insert (dfa->inveclosures + dest, src);
1502 }
1503 }
1504 }
1505
1506 /* Calculate "eclosure" for all the node in DFA. */
1507
1508 static reg_errcode_t
1509 calc_eclosure (dfa)
1510 re_dfa_t *dfa;
1511 {
1512 int node_idx, incomplete;
1513 #ifdef DEBUG
1514 assert (dfa->nodes_len > 0);
1515 #endif
1516 incomplete = 0;
1517 /* For each nodes, calculate epsilon closure. */
1518 for (node_idx = 0; ; ++node_idx)
1519 {
1520 reg_errcode_t err;
1521 re_node_set eclosure_elem;
1522 if (node_idx == dfa->nodes_len)
1523 {
1524 if (!incomplete)
1525 break;
1526 incomplete = 0;
1527 node_idx = 0;
1528 }
1529
1530 #ifdef DEBUG
1531 assert (dfa->eclosures[node_idx].nelem != -1);
1532 #endif
1533 /* If we have already calculated, skip it. */
1534 if (dfa->eclosures[node_idx].nelem != 0)
1535 continue;
1536 /* Calculate epsilon closure of `node_idx'. */
1537 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1538 if (BE (err != REG_NOERROR, 0))
1539 return err;
1540
1541 if (dfa->eclosures[node_idx].nelem == 0)
1542 {
1543 incomplete = 1;
1544 re_node_set_free (&eclosure_elem);
1545 }
1546 }
1547 return REG_NOERROR;
1548 }
1549
1550 /* Calculate epsilon closure of NODE. */
1551
1552 static reg_errcode_t
1553 calc_eclosure_iter (new_set, dfa, node, root)
1554 re_node_set *new_set;
1555 re_dfa_t *dfa;
1556 int node, root;
1557 {
1558 reg_errcode_t err;
1559 unsigned int constraint;
1560 int i, incomplete;
1561 re_node_set eclosure;
1562 incomplete = 0;
1563 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1564 if (BE (err != REG_NOERROR, 0))
1565 return err;
1566
1567 /* This indicates that we are calculating this node now.
1568 We reference this value to avoid infinite loop. */
1569 dfa->eclosures[node].nelem = -1;
1570
1571 constraint = ((dfa->nodes[node].type == ANCHOR)
1572 ? dfa->nodes[node].opr.ctx_type : 0);
1573 /* If the current node has constraints, duplicate all nodes.
1574 Since they must inherit the constraints. */
1575 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1576 {
1577 int org_node, cur_node;
1578 org_node = cur_node = node;
1579 err = duplicate_node_closure (dfa, node, node, node, constraint);
1580 if (BE (err != REG_NOERROR, 0))
1581 return err;
1582 }
1583
1584 /* Expand each epsilon destination nodes. */
1585 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1586 for (i = 0; i < dfa->edests[node].nelem; ++i)
1587 {
1588 re_node_set eclosure_elem;
1589 int edest = dfa->edests[node].elems[i];
1590 /* If calculating the epsilon closure of `edest' is in progress,
1591 return intermediate result. */
1592 if (dfa->eclosures[edest].nelem == -1)
1593 {
1594 incomplete = 1;
1595 continue;
1596 }
1597 /* If we haven't calculated the epsilon closure of `edest' yet,
1598 calculate now. Otherwise use calculated epsilon closure. */
1599 if (dfa->eclosures[edest].nelem == 0)
1600 {
1601 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1602 if (BE (err != REG_NOERROR, 0))
1603 return err;
1604 }
1605 else
1606 eclosure_elem = dfa->eclosures[edest];
1607 /* Merge the epsilon closure of `edest'. */
1608 re_node_set_merge (&eclosure, &eclosure_elem);
1609 /* If the epsilon closure of `edest' is incomplete,
1610 the epsilon closure of this node is also incomplete. */
1611 if (dfa->eclosures[edest].nelem == 0)
1612 {
1613 incomplete = 1;
1614 re_node_set_free (&eclosure_elem);
1615 }
1616 }
1617
1618 /* Epsilon closures include itself. */
1619 re_node_set_insert (&eclosure, node);
1620 if (incomplete && !root)
1621 dfa->eclosures[node].nelem = 0;
1622 else
1623 dfa->eclosures[node] = eclosure;
1624 *new_set = eclosure;
1625 return REG_NOERROR;
1626 }
1627 \f
1628 /* Functions for token which are used in the parser. */
1629
1630 /* Fetch a token from INPUT.
1631 We must not use this function inside bracket expressions. */
1632
1633 static void
1634 fetch_token (result, input, syntax)
1635 re_token_t *result;
1636 re_string_t *input;
1637 reg_syntax_t syntax;
1638 {
1639 re_string_skip_bytes (input, peek_token (result, input, syntax));
1640 }
1641
1642 /* Peek a token from INPUT, and return the length of the token.
1643 We must not use this function inside bracket expressions. */
1644
1645 static int
1646 peek_token (token, input, syntax)
1647 re_token_t *token;
1648 re_string_t *input;
1649 reg_syntax_t syntax;
1650 {
1651 unsigned char c;
1652
1653 if (re_string_eoi (input))
1654 {
1655 token->type = END_OF_RE;
1656 return 0;
1657 }
1658
1659 c = re_string_peek_byte (input, 0);
1660 token->opr.c = c;
1661
1662 token->word_char = 0;
1663 #ifdef RE_ENABLE_I18N
1664 token->mb_partial = 0;
1665 if (input->mb_cur_max > 1 &&
1666 !re_string_first_byte (input, re_string_cur_idx (input)))
1667 {
1668 token->type = CHARACTER;
1669 token->mb_partial = 1;
1670 return 1;
1671 }
1672 #endif
1673 if (c == '\\')
1674 {
1675 unsigned char c2;
1676 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1677 {
1678 token->type = BACK_SLASH;
1679 return 1;
1680 }
1681
1682 c2 = re_string_peek_byte_case (input, 1);
1683 token->opr.c = c2;
1684 token->type = CHARACTER;
1685 #ifdef RE_ENABLE_I18N
1686 if (input->mb_cur_max > 1)
1687 {
1688 wint_t wc = re_string_wchar_at (input,
1689 re_string_cur_idx (input) + 1);
1690 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1691 }
1692 else
1693 #endif
1694 token->word_char = IS_WORD_CHAR (c2) != 0;
1695
1696 switch (c2)
1697 {
1698 case '|':
1699 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1700 token->type = OP_ALT;
1701 break;
1702 case '1': case '2': case '3': case '4': case '5':
1703 case '6': case '7': case '8': case '9':
1704 if (!(syntax & RE_NO_BK_REFS))
1705 {
1706 token->type = OP_BACK_REF;
1707 token->opr.idx = c2 - '0';
1708 }
1709 break;
1710 case '<':
1711 if (!(syntax & RE_NO_GNU_OPS))
1712 {
1713 token->type = ANCHOR;
1714 token->opr.idx = WORD_FIRST;
1715 }
1716 break;
1717 case '>':
1718 if (!(syntax & RE_NO_GNU_OPS))
1719 {
1720 token->type = ANCHOR;
1721 token->opr.idx = WORD_LAST;
1722 }
1723 break;
1724 case 'b':
1725 if (!(syntax & RE_NO_GNU_OPS))
1726 {
1727 token->type = ANCHOR;
1728 token->opr.idx = WORD_DELIM;
1729 }
1730 break;
1731 case 'B':
1732 if (!(syntax & RE_NO_GNU_OPS))
1733 {
1734 token->type = ANCHOR;
1735 token->opr.idx = INSIDE_WORD;
1736 }
1737 break;
1738 case 'w':
1739 if (!(syntax & RE_NO_GNU_OPS))
1740 token->type = OP_WORD;
1741 break;
1742 case 'W':
1743 if (!(syntax & RE_NO_GNU_OPS))
1744 token->type = OP_NOTWORD;
1745 break;
1746 case 's':
1747 if (!(syntax & RE_NO_GNU_OPS))
1748 token->type = OP_SPACE;
1749 break;
1750 case 'S':
1751 if (!(syntax & RE_NO_GNU_OPS))
1752 token->type = OP_NOTSPACE;
1753 break;
1754 case '`':
1755 if (!(syntax & RE_NO_GNU_OPS))
1756 {
1757 token->type = ANCHOR;
1758 token->opr.idx = BUF_FIRST;
1759 }
1760 break;
1761 case '\'':
1762 if (!(syntax & RE_NO_GNU_OPS))
1763 {
1764 token->type = ANCHOR;
1765 token->opr.idx = BUF_LAST;
1766 }
1767 break;
1768 case '(':
1769 if (!(syntax & RE_NO_BK_PARENS))
1770 token->type = OP_OPEN_SUBEXP;
1771 break;
1772 case ')':
1773 if (!(syntax & RE_NO_BK_PARENS))
1774 token->type = OP_CLOSE_SUBEXP;
1775 break;
1776 case '+':
1777 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1778 token->type = OP_DUP_PLUS;
1779 break;
1780 case '?':
1781 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1782 token->type = OP_DUP_QUESTION;
1783 break;
1784 case '{':
1785 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1786 token->type = OP_OPEN_DUP_NUM;
1787 break;
1788 case '}':
1789 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1790 token->type = OP_CLOSE_DUP_NUM;
1791 break;
1792 default:
1793 break;
1794 }
1795 return 2;
1796 }
1797
1798 token->type = CHARACTER;
1799 #ifdef RE_ENABLE_I18N
1800 if (input->mb_cur_max > 1)
1801 {
1802 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1803 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1804 }
1805 else
1806 #endif
1807 token->word_char = IS_WORD_CHAR (token->opr.c);
1808
1809 switch (c)
1810 {
1811 case '\n':
1812 if (syntax & RE_NEWLINE_ALT)
1813 token->type = OP_ALT;
1814 break;
1815 case '|':
1816 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1817 token->type = OP_ALT;
1818 break;
1819 case '*':
1820 token->type = OP_DUP_ASTERISK;
1821 break;
1822 case '+':
1823 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1824 token->type = OP_DUP_PLUS;
1825 break;
1826 case '?':
1827 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1828 token->type = OP_DUP_QUESTION;
1829 break;
1830 case '{':
1831 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1832 token->type = OP_OPEN_DUP_NUM;
1833 break;
1834 case '}':
1835 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1836 token->type = OP_CLOSE_DUP_NUM;
1837 break;
1838 case '(':
1839 if (syntax & RE_NO_BK_PARENS)
1840 token->type = OP_OPEN_SUBEXP;
1841 break;
1842 case ')':
1843 if (syntax & RE_NO_BK_PARENS)
1844 token->type = OP_CLOSE_SUBEXP;
1845 break;
1846 case '[':
1847 token->type = OP_OPEN_BRACKET;
1848 break;
1849 case '.':
1850 token->type = OP_PERIOD;
1851 break;
1852 case '^':
1853 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1854 re_string_cur_idx (input) != 0)
1855 {
1856 char prev = re_string_peek_byte (input, -1);
1857 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1858 break;
1859 }
1860 token->type = ANCHOR;
1861 token->opr.idx = LINE_FIRST;
1862 break;
1863 case '$':
1864 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1865 re_string_cur_idx (input) + 1 != re_string_length (input))
1866 {
1867 re_token_t next;
1868 re_string_skip_bytes (input, 1);
1869 peek_token (&next, input, syntax);
1870 re_string_skip_bytes (input, -1);
1871 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1872 break;
1873 }
1874 token->type = ANCHOR;
1875 token->opr.idx = LINE_LAST;
1876 break;
1877 default:
1878 break;
1879 }
1880 return 1;
1881 }
1882
1883 /* Peek a token from INPUT, and return the length of the token.
1884 We must not use this function out of bracket expressions. */
1885
1886 static int
1887 peek_token_bracket (token, input, syntax)
1888 re_token_t *token;
1889 re_string_t *input;
1890 reg_syntax_t syntax;
1891 {
1892 unsigned char c;
1893 if (re_string_eoi (input))
1894 {
1895 token->type = END_OF_RE;
1896 return 0;
1897 }
1898 c = re_string_peek_byte (input, 0);
1899 token->opr.c = c;
1900
1901 #ifdef RE_ENABLE_I18N
1902 if (input->mb_cur_max > 1 &&
1903 !re_string_first_byte (input, re_string_cur_idx (input)))
1904 {
1905 token->type = CHARACTER;
1906 return 1;
1907 }
1908 #endif /* RE_ENABLE_I18N */
1909
1910 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1911 {
1912 /* In this case, '\' escape a character. */
1913 unsigned char c2;
1914 re_string_skip_bytes (input, 1);
1915 c2 = re_string_peek_byte (input, 0);
1916 token->opr.c = c2;
1917 token->type = CHARACTER;
1918 return 1;
1919 }
1920 if (c == '[') /* '[' is a special char in a bracket exps. */
1921 {
1922 unsigned char c2;
1923 int token_len;
1924 c2 = re_string_peek_byte (input, 1);
1925 token->opr.c = c2;
1926 token_len = 2;
1927 switch (c2)
1928 {
1929 case '.':
1930 token->type = OP_OPEN_COLL_ELEM;
1931 break;
1932 case '=':
1933 token->type = OP_OPEN_EQUIV_CLASS;
1934 break;
1935 case ':':
1936 if (syntax & RE_CHAR_CLASSES)
1937 {
1938 token->type = OP_OPEN_CHAR_CLASS;
1939 break;
1940 }
1941 /* else fall through. */
1942 default:
1943 token->type = CHARACTER;
1944 token->opr.c = c;
1945 token_len = 1;
1946 break;
1947 }
1948 return token_len;
1949 }
1950 switch (c)
1951 {
1952 case '-':
1953 token->type = OP_CHARSET_RANGE;
1954 break;
1955 case ']':
1956 token->type = OP_CLOSE_BRACKET;
1957 break;
1958 case '^':
1959 token->type = OP_NON_MATCH_LIST;
1960 break;
1961 default:
1962 token->type = CHARACTER;
1963 }
1964 return 1;
1965 }
1966 \f
1967 /* Functions for parser. */
1968
1969 /* Entry point of the parser.
1970 Parse the regular expression REGEXP and return the structure tree.
1971 If an error is occured, ERR is set by error code, and return NULL.
1972 This function build the following tree, from regular expression <reg_exp>:
1973 CAT
1974 / \
1975 / \
1976 <reg_exp> EOR
1977
1978 CAT means concatenation.
1979 EOR means end of regular expression. */
1980
1981 static bin_tree_t *
1982 parse (regexp, preg, syntax, err)
1983 re_string_t *regexp;
1984 regex_t *preg;
1985 reg_syntax_t syntax;
1986 reg_errcode_t *err;
1987 {
1988 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1989 bin_tree_t *tree, *eor, *root;
1990 re_token_t current_token;
1991 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
1992 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1993 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1994 return NULL;
1995 eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
1996 if (tree != NULL)
1997 root = create_tree (dfa, tree, eor, CONCAT, 0);
1998 else
1999 root = eor;
2000 if (BE (eor == NULL || root == NULL, 0))
2001 {
2002 *err = REG_ESPACE;
2003 return NULL;
2004 }
2005 return root;
2006 }
2007
2008 /* This function build the following tree, from regular expression
2009 <branch1>|<branch2>:
2010 ALT
2011 / \
2012 / \
2013 <branch1> <branch2>
2014
2015 ALT means alternative, which represents the operator `|'. */
2016
2017 static bin_tree_t *
2018 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2019 re_string_t *regexp;
2020 regex_t *preg;
2021 re_token_t *token;
2022 reg_syntax_t syntax;
2023 int nest;
2024 reg_errcode_t *err;
2025 {
2026 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2027 bin_tree_t *tree, *branch = NULL;
2028 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2029 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2030 return NULL;
2031
2032 while (token->type == OP_ALT)
2033 {
2034 re_token_t alt_token = *token;
2035 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2036 if (token->type != OP_ALT && token->type != END_OF_RE
2037 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2038 {
2039 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2040 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2041 return NULL;
2042 }
2043 else
2044 branch = NULL;
2045 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2046 if (BE (tree == NULL, 0))
2047 {
2048 *err = REG_ESPACE;
2049 return NULL;
2050 }
2051 dfa->has_plural_match = 1;
2052 }
2053 return tree;
2054 }
2055
2056 /* This function build the following tree, from regular expression
2057 <exp1><exp2>:
2058 CAT
2059 / \
2060 / \
2061 <exp1> <exp2>
2062
2063 CAT means concatenation. */
2064
2065 static bin_tree_t *
2066 parse_branch (regexp, preg, token, syntax, nest, err)
2067 re_string_t *regexp;
2068 regex_t *preg;
2069 re_token_t *token;
2070 reg_syntax_t syntax;
2071 int nest;
2072 reg_errcode_t *err;
2073 {
2074 bin_tree_t *tree, *exp;
2075 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2076 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2077 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2078 return NULL;
2079
2080 while (token->type != OP_ALT && token->type != END_OF_RE
2081 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2082 {
2083 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2084 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2085 {
2086 return NULL;
2087 }
2088 if (tree != NULL && exp != NULL)
2089 {
2090 tree = create_tree (dfa, tree, exp, CONCAT, 0);
2091 if (tree == NULL)
2092 {
2093 *err = REG_ESPACE;
2094 return NULL;
2095 }
2096 }
2097 else if (tree == NULL)
2098 tree = exp;
2099 /* Otherwise exp == NULL, we don't need to create new tree. */
2100 }
2101 return tree;
2102 }
2103
2104 /* This function build the following tree, from regular expression a*:
2105 *
2106 |
2107 a
2108 */
2109
2110 static bin_tree_t *
2111 parse_expression (regexp, preg, token, syntax, nest, err)
2112 re_string_t *regexp;
2113 regex_t *preg;
2114 re_token_t *token;
2115 reg_syntax_t syntax;
2116 int nest;
2117 reg_errcode_t *err;
2118 {
2119 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120 bin_tree_t *tree;
2121 switch (token->type)
2122 {
2123 case CHARACTER:
2124 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2125 if (BE (tree == NULL, 0))
2126 {
2127 *err = REG_ESPACE;
2128 return NULL;
2129 }
2130 #ifdef RE_ENABLE_I18N
2131 if (dfa->mb_cur_max > 1)
2132 {
2133 while (!re_string_eoi (regexp)
2134 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2135 {
2136 bin_tree_t *mbc_remain;
2137 fetch_token (token, regexp, syntax);
2138 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2139 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2140 if (BE (mbc_remain == NULL || tree == NULL, 0))
2141 {
2142 *err = REG_ESPACE;
2143 return NULL;
2144 }
2145 }
2146 }
2147 #endif
2148 break;
2149 case OP_OPEN_SUBEXP:
2150 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2151 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152 return NULL;
2153 break;
2154 case OP_OPEN_BRACKET:
2155 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2156 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2157 return NULL;
2158 break;
2159 case OP_BACK_REF:
2160 if (BE (preg->re_nsub < token->opr.idx
2161 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2162 {
2163 *err = REG_ESUBREG;
2164 return NULL;
2165 }
2166 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2167 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2168 if (BE (tree == NULL, 0))
2169 {
2170 *err = REG_ESPACE;
2171 return NULL;
2172 }
2173 ++dfa->nbackref;
2174 dfa->has_mb_node = 1;
2175 break;
2176 case OP_OPEN_DUP_NUM:
2177 if (syntax & RE_CONTEXT_INVALID_DUP)
2178 {
2179 *err = REG_BADRPT;
2180 return NULL;
2181 }
2182 /* FALLTHROUGH */
2183 case OP_DUP_ASTERISK:
2184 case OP_DUP_PLUS:
2185 case OP_DUP_QUESTION:
2186 if (syntax & RE_CONTEXT_INVALID_OPS)
2187 {
2188 *err = REG_BADRPT;
2189 return NULL;
2190 }
2191 else if (syntax & RE_CONTEXT_INDEP_OPS)
2192 {
2193 fetch_token (token, regexp, syntax);
2194 return parse_expression (regexp, preg, token, syntax, nest, err);
2195 }
2196 /* else fall through */
2197 case OP_CLOSE_SUBEXP:
2198 if ((token->type == OP_CLOSE_SUBEXP) &&
2199 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2200 {
2201 *err = REG_ERPAREN;
2202 return NULL;
2203 }
2204 /* else fall through */
2205 case OP_CLOSE_DUP_NUM:
2206 /* We treat it as a normal character. */
2207
2208 /* Then we can these characters as normal characters. */
2209 token->type = CHARACTER;
2210 /* mb_partial and word_char bits should be initialized already
2211 by peek_token. */
2212 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2213 if (BE (tree == NULL, 0))
2214 {
2215 *err = REG_ESPACE;
2216 return NULL;
2217 }
2218 break;
2219 case ANCHOR:
2220 if (dfa->word_char == NULL)
2221 {
2222 *err = init_word_char (dfa);
2223 if (BE (*err != REG_NOERROR, 0))
2224 return NULL;
2225 }
2226 if (token->opr.ctx_type == WORD_DELIM)
2227 {
2228 bin_tree_t *tree_first, *tree_last;
2229 token->opr.ctx_type = WORD_FIRST;
2230 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2231 token->opr.ctx_type = WORD_LAST;
2232 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2233 token->type = OP_ALT;
2234 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2235 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2236 {
2237 *err = REG_ESPACE;
2238 return NULL;
2239 }
2240 }
2241 else
2242 {
2243 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2244 if (BE (tree == NULL, 0))
2245 {
2246 *err = REG_ESPACE;
2247 return NULL;
2248 }
2249 }
2250 /* We must return here, since ANCHORs can't be followed
2251 by repetition operators.
2252 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2253 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2254 fetch_token (token, regexp, syntax);
2255 return tree;
2256 case OP_PERIOD:
2257 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2258 if (BE (tree == NULL, 0))
2259 {
2260 *err = REG_ESPACE;
2261 return NULL;
2262 }
2263 if (dfa->mb_cur_max > 1)
2264 dfa->has_mb_node = 1;
2265 break;
2266 case OP_WORD:
2267 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
2268 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2269 return NULL;
2270 break;
2271 case OP_NOTWORD:
2272 tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2273 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2274 return NULL;
2275 break;
2276 case OP_SPACE:
2277 tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2278 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2279 return NULL;
2280 break;
2281 case OP_NOTSPACE:
2282 tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
2283 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2284 return NULL;
2285 break;
2286 case OP_ALT:
2287 case END_OF_RE:
2288 return NULL;
2289 case BACK_SLASH:
2290 *err = REG_EESCAPE;
2291 return NULL;
2292 default:
2293 /* Must not happen? */
2294 #ifdef DEBUG
2295 assert (0);
2296 #endif
2297 return NULL;
2298 }
2299 fetch_token (token, regexp, syntax);
2300
2301 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2302 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2303 {
2304 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2305 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2306 return NULL;
2307 /* In BRE consecutive duplications are not allowed. */
2308 if ((syntax & RE_CONTEXT_INVALID_DUP)
2309 && (token->type == OP_DUP_ASTERISK
2310 || token->type == OP_OPEN_DUP_NUM))
2311 {
2312 *err = REG_BADRPT;
2313 return NULL;
2314 }
2315 dfa->has_plural_match = 1;
2316 }
2317
2318 return tree;
2319 }
2320
2321 /* This function build the following tree, from regular expression
2322 (<reg_exp>):
2323 SUBEXP
2324 |
2325 <reg_exp>
2326 */
2327
2328 static bin_tree_t *
2329 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2330 re_string_t *regexp;
2331 regex_t *preg;
2332 re_token_t *token;
2333 reg_syntax_t syntax;
2334 int nest;
2335 reg_errcode_t *err;
2336 {
2337 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2338 bin_tree_t *tree, *left_par, *right_par;
2339 size_t cur_nsub;
2340 cur_nsub = preg->re_nsub++;
2341 if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
2342 {
2343 re_subexp_t *new_array;
2344 dfa->subexps_alloc *= 2;
2345 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2346 if (BE (new_array == NULL, 0))
2347 {
2348 dfa->subexps_alloc /= 2;
2349 *err = REG_ESPACE;
2350 return NULL;
2351 }
2352 dfa->subexps = new_array;
2353 }
2354 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2355 dfa->subexps[cur_nsub].end = -1;
2356
2357 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2358 if (BE (left_par == NULL, 0))
2359 {
2360 *err = REG_ESPACE;
2361 return NULL;
2362 }
2363 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2364 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2365
2366 /* The subexpression may be a null string. */
2367 if (token->type == OP_CLOSE_SUBEXP)
2368 tree = NULL;
2369 else
2370 {
2371 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2372 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2373 return NULL;
2374 }
2375 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2376 {
2377 *err = REG_EPAREN;
2378 return NULL;
2379 }
2380 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2381 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2382 tree = ((tree == NULL) ? right_par
2383 : create_tree (dfa, tree, right_par, CONCAT, 0));
2384 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2385 if (BE (right_par == NULL || tree == NULL, 0))
2386 {
2387 *err = REG_ESPACE;
2388 return NULL;
2389 }
2390 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2391
2392 return tree;
2393 }
2394
2395 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2396
2397 static bin_tree_t *
2398 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2399 bin_tree_t *dup_elem;
2400 re_string_t *regexp;
2401 re_dfa_t *dfa;
2402 re_token_t *token;
2403 reg_syntax_t syntax;
2404 reg_errcode_t *err;
2405 {
2406 re_token_t dup_token;
2407 bin_tree_t *tree = dup_elem, *work_tree;
2408 int start_idx = re_string_cur_idx (regexp);
2409 re_token_t start_token = *token;
2410 if (token->type == OP_OPEN_DUP_NUM)
2411 {
2412 int i;
2413 int end = 0;
2414 int start = fetch_number (regexp, token, syntax);
2415 bin_tree_t *elem;
2416 if (start == -1)
2417 {
2418 if (token->type == CHARACTER && token->opr.c == ',')
2419 start = 0; /* We treat "{,m}" as "{0,m}". */
2420 else
2421 {
2422 *err = REG_BADBR; /* <re>{} is invalid. */
2423 return NULL;
2424 }
2425 }
2426 if (BE (start != -2, 1))
2427 {
2428 /* We treat "{n}" as "{n,n}". */
2429 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2430 : ((token->type == CHARACTER && token->opr.c == ',')
2431 ? fetch_number (regexp, token, syntax) : -2));
2432 }
2433 if (BE (start == -2 || end == -2, 0))
2434 {
2435 /* Invalid sequence. */
2436 if (token->type == OP_CLOSE_DUP_NUM)
2437 goto parse_dup_op_invalid_interval;
2438 else
2439 goto parse_dup_op_ebrace;
2440 }
2441 if (BE (start == 0 && end == 0, 0))
2442 {
2443 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2444 fetch_token (token, regexp, syntax);
2445 return NULL;
2446 }
2447
2448 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2449 elem = tree;
2450 for (i = 0; i < start; ++i)
2451 if (i != 0)
2452 {
2453 work_tree = duplicate_tree (elem, dfa);
2454 tree = create_tree (dfa, tree, work_tree, CONCAT, 0);
2455 if (BE (work_tree == NULL || tree == NULL, 0))
2456 goto parse_dup_op_espace;
2457 }
2458
2459 if (end == -1)
2460 {
2461 /* We treat "<re>{0,}" as "<re>*". */
2462 dup_token.type = OP_DUP_ASTERISK;
2463 if (start > 0)
2464 {
2465 elem = duplicate_tree (elem, dfa);
2466 work_tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2467 tree = create_tree (dfa, tree, work_tree, CONCAT, 0);
2468 if (BE (elem == NULL || work_tree == NULL || tree == NULL, 0))
2469 goto parse_dup_op_espace;
2470 }
2471 else
2472 {
2473 tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2474 if (BE (tree == NULL, 0))
2475 goto parse_dup_op_espace;
2476 }
2477 }
2478 else if (BE (start > end, 0))
2479 {
2480 /* First number greater than first. */
2481 *err = REG_BADBR;
2482 return NULL;
2483 }
2484 else if (end - start > 0)
2485 {
2486 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2487 dup_token.type = OP_DUP_QUESTION;
2488 if (start > 0)
2489 {
2490 elem = duplicate_tree (elem, dfa);
2491 elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2492 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2493 if (BE (elem == NULL || tree == NULL, 0))
2494 goto parse_dup_op_espace;
2495 }
2496 else
2497 {
2498 tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2499 if (BE (tree == NULL, 0))
2500 goto parse_dup_op_espace;
2501 }
2502 for (i = 1; i < end - start; ++i)
2503 {
2504 work_tree = duplicate_tree (elem, dfa);
2505 tree = create_tree (dfa, tree, work_tree, CONCAT, 0);
2506 if (BE (work_tree == NULL || tree == NULL, 0))
2507 {
2508 *err = REG_ESPACE;
2509 return NULL;
2510 }
2511 }
2512 }
2513 }
2514 else
2515 {
2516 tree = re_dfa_add_tree_node (dfa, tree, NULL, token);
2517 if (BE (tree == NULL, 0))
2518 {
2519 *err = REG_ESPACE;
2520 return NULL;
2521 }
2522 }
2523 fetch_token (token, regexp, syntax);
2524 return tree;
2525
2526 parse_dup_op_espace:
2527 *err = REG_ESPACE;
2528 return NULL;
2529
2530 parse_dup_op_ebrace:
2531 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2532 {
2533 *err = REG_EBRACE;
2534 return NULL;
2535 }
2536 goto parse_dup_op_rollback;
2537 parse_dup_op_invalid_interval:
2538 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2539 {
2540 *err = REG_BADBR;
2541 return NULL;
2542 }
2543 parse_dup_op_rollback:
2544 re_string_set_index (regexp, start_idx);
2545 *token = start_token;
2546 token->type = CHARACTER;
2547 /* mb_partial and word_char bits should be already initialized by
2548 peek_token. */
2549 return dup_elem;
2550 }
2551
2552 /* Size of the names for collating symbol/equivalence_class/character_class.
2553 I'm not sure, but maybe enough. */
2554 #define BRACKET_NAME_BUF_SIZE 32
2555
2556 #ifndef _LIBC
2557 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2558 Build the range expression which starts from START_ELEM, and ends
2559 at END_ELEM. The result are written to MBCSET and SBCSET.
2560 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2561 mbcset->range_ends, is a pointer argument sinse we may
2562 update it. */
2563
2564 static reg_errcode_t
2565 # ifdef RE_ENABLE_I18N
2566 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2567 re_charset_t *mbcset;
2568 int *range_alloc;
2569 # else /* not RE_ENABLE_I18N */
2570 build_range_exp (sbcset, start_elem, end_elem)
2571 # endif /* not RE_ENABLE_I18N */
2572 re_bitset_ptr_t sbcset;
2573 bracket_elem_t *start_elem, *end_elem;
2574 {
2575 unsigned int start_ch, end_ch;
2576 /* Equivalence Classes and Character Classes can't be a range start/end. */
2577 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2578 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2579 0))
2580 return REG_ERANGE;
2581
2582 /* We can handle no multi character collating elements without libc
2583 support. */
2584 if (BE ((start_elem->type == COLL_SYM
2585 && strlen ((char *) start_elem->opr.name) > 1)
2586 || (end_elem->type == COLL_SYM
2587 && strlen ((char *) end_elem->opr.name) > 1), 0))
2588 return REG_ECOLLATE;
2589
2590 # ifdef RE_ENABLE_I18N
2591 {
2592 wchar_t wc, start_wc, end_wc;
2593 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2594
2595 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2596 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2597 : 0));
2598 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2599 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2600 : 0));
2601 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2602 ? __btowc (start_ch) : start_elem->opr.wch);
2603 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2604 ? __btowc (end_ch) : end_elem->opr.wch);
2605 cmp_buf[0] = start_wc;
2606 cmp_buf[4] = end_wc;
2607 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2608 return REG_ERANGE;
2609
2610 /* Check the space of the arrays. */
2611 if (BE (*range_alloc == mbcset->nranges, 0))
2612 {
2613 /* There are not enough space, need realloc. */
2614 wchar_t *new_array_start, *new_array_end;
2615 int new_nranges;
2616
2617 /* +1 in case of mbcset->nranges is 0. */
2618 new_nranges = 2 * mbcset->nranges + 1;
2619 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2620 are NULL if *range_alloc == 0. */
2621 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2622 new_nranges);
2623 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2624 new_nranges);
2625
2626 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2627 return REG_ESPACE;
2628
2629 mbcset->range_starts = new_array_start;
2630 mbcset->range_ends = new_array_end;
2631 *range_alloc = new_nranges;
2632 }
2633
2634 mbcset->range_starts[mbcset->nranges] = start_wc;
2635 mbcset->range_ends[mbcset->nranges++] = end_wc;
2636
2637 /* Build the table for single byte characters. */
2638 for (wc = 0; wc <= SBC_MAX; ++wc)
2639 {
2640 cmp_buf[2] = wc;
2641 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2642 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2643 bitset_set (sbcset, wc);
2644 }
2645 }
2646 # else /* not RE_ENABLE_I18N */
2647 {
2648 unsigned int ch;
2649 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2650 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2651 : 0));
2652 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2653 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2654 : 0));
2655 if (start_ch > end_ch)
2656 return REG_ERANGE;
2657 /* Build the table for single byte characters. */
2658 for (ch = 0; ch <= SBC_MAX; ++ch)
2659 if (start_ch <= ch && ch <= end_ch)
2660 bitset_set (sbcset, ch);
2661 }
2662 # endif /* not RE_ENABLE_I18N */
2663 return REG_NOERROR;
2664 }
2665 #endif /* not _LIBC */
2666
2667 #ifndef _LIBC
2668 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2669 Build the collating element which is represented by NAME.
2670 The result are written to MBCSET and SBCSET.
2671 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2672 pointer argument since we may update it. */
2673
2674 static reg_errcode_t
2675 # ifdef RE_ENABLE_I18N
2676 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2677 re_charset_t *mbcset;
2678 int *coll_sym_alloc;
2679 # else /* not RE_ENABLE_I18N */
2680 build_collating_symbol (sbcset, name)
2681 # endif /* not RE_ENABLE_I18N */
2682 re_bitset_ptr_t sbcset;
2683 const unsigned char *name;
2684 {
2685 size_t name_len = strlen ((const char *) name);
2686 if (BE (name_len != 1, 0))
2687 return REG_ECOLLATE;
2688 else
2689 {
2690 bitset_set (sbcset, name[0]);
2691 return REG_NOERROR;
2692 }
2693 }
2694 #endif /* not _LIBC */
2695
2696 /* This function parse bracket expression like "[abc]", "[a-c]",
2697 "[[.a-a.]]" etc. */
2698
2699 static bin_tree_t *
2700 parse_bracket_exp (regexp, dfa, token, syntax, err)
2701 re_string_t *regexp;
2702 re_dfa_t *dfa;
2703 re_token_t *token;
2704 reg_syntax_t syntax;
2705 reg_errcode_t *err;
2706 {
2707 #ifdef _LIBC
2708 const unsigned char *collseqmb;
2709 const char *collseqwc;
2710 uint32_t nrules;
2711 int32_t table_size;
2712 const int32_t *symb_table;
2713 const unsigned char *extra;
2714
2715 /* Local function for parse_bracket_exp used in _LIBC environement.
2716 Seek the collating symbol entry correspondings to NAME.
2717 Return the index of the symbol in the SYMB_TABLE. */
2718
2719 static inline int32_t
2720 __attribute ((always_inline))
2721 seek_collating_symbol_entry (name, name_len)
2722 const unsigned char *name;
2723 size_t name_len;
2724 {
2725 int32_t hash = elem_hash ((const char *) name, name_len);
2726 int32_t elem = hash % table_size;
2727 int32_t second = hash % (table_size - 2);
2728 while (symb_table[2 * elem] != 0)
2729 {
2730 /* First compare the hashing value. */
2731 if (symb_table[2 * elem] == hash
2732 /* Compare the length of the name. */
2733 && name_len == extra[symb_table[2 * elem + 1]]
2734 /* Compare the name. */
2735 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2736 name_len) == 0)
2737 {
2738 /* Yep, this is the entry. */
2739 break;
2740 }
2741
2742 /* Next entry. */
2743 elem += second;
2744 }
2745 return elem;
2746 }
2747
2748 /* Local function for parse_bracket_exp used in _LIBC environement.
2749 Look up the collation sequence value of BR_ELEM.
2750 Return the value if succeeded, UINT_MAX otherwise. */
2751
2752 static inline unsigned int
2753 __attribute ((always_inline))
2754 lookup_collation_sequence_value (br_elem)
2755 bracket_elem_t *br_elem;
2756 {
2757 if (br_elem->type == SB_CHAR)
2758 {
2759 /*
2760 if (MB_CUR_MAX == 1)
2761 */
2762 if (nrules == 0)
2763 return collseqmb[br_elem->opr.ch];
2764 else
2765 {
2766 wint_t wc = __btowc (br_elem->opr.ch);
2767 return __collseq_table_lookup (collseqwc, wc);
2768 }
2769 }
2770 else if (br_elem->type == MB_CHAR)
2771 {
2772 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2773 }
2774 else if (br_elem->type == COLL_SYM)
2775 {
2776 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2777 if (nrules != 0)
2778 {
2779 int32_t elem, idx;
2780 elem = seek_collating_symbol_entry (br_elem->opr.name,
2781 sym_name_len);
2782 if (symb_table[2 * elem] != 0)
2783 {
2784 /* We found the entry. */
2785 idx = symb_table[2 * elem + 1];
2786 /* Skip the name of collating element name. */
2787 idx += 1 + extra[idx];
2788 /* Skip the byte sequence of the collating element. */
2789 idx += 1 + extra[idx];
2790 /* Adjust for the alignment. */
2791 idx = (idx + 3) & ~3;
2792 /* Skip the multibyte collation sequence value. */
2793 idx += sizeof (unsigned int);
2794 /* Skip the wide char sequence of the collating element. */
2795 idx += sizeof (unsigned int) *
2796 (1 + *(unsigned int *) (extra + idx));
2797 /* Return the collation sequence value. */
2798 return *(unsigned int *) (extra + idx);
2799 }
2800 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2801 {
2802 /* No valid character. Match it as a single byte
2803 character. */
2804 return collseqmb[br_elem->opr.name[0]];
2805 }
2806 }
2807 else if (sym_name_len == 1)
2808 return collseqmb[br_elem->opr.name[0]];
2809 }
2810 return UINT_MAX;
2811 }
2812
2813 /* Local function for parse_bracket_exp used in _LIBC environement.
2814 Build the range expression which starts from START_ELEM, and ends
2815 at END_ELEM. The result are written to MBCSET and SBCSET.
2816 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2817 mbcset->range_ends, is a pointer argument sinse we may
2818 update it. */
2819
2820 static inline reg_errcode_t
2821 __attribute ((always_inline))
2822 # ifdef RE_ENABLE_I18N
2823 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2824 re_charset_t *mbcset;
2825 int *range_alloc;
2826 # else /* not RE_ENABLE_I18N */
2827 build_range_exp (sbcset, start_elem, end_elem)
2828 # endif /* not RE_ENABLE_I18N */
2829 re_bitset_ptr_t sbcset;
2830 bracket_elem_t *start_elem, *end_elem;
2831 {
2832 unsigned int ch;
2833 uint32_t start_collseq;
2834 uint32_t end_collseq;
2835
2836 # ifdef RE_ENABLE_I18N
2837 /* Check the space of the arrays. */
2838 if (BE (*range_alloc == mbcset->nranges, 0))
2839 {
2840 /* There are not enough space, need realloc. */
2841 uint32_t *new_array_start;
2842 uint32_t *new_array_end;
2843 int new_nranges;
2844
2845 /* +1 in case of mbcset->nranges is 0. */
2846 new_nranges = 2 * mbcset->nranges + 1;
2847 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2848 are NULL if *range_alloc == 0. */
2849 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2850 new_nranges);
2851 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2852 new_nranges);
2853
2854 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2855 return REG_ESPACE;
2856
2857 mbcset->range_starts = new_array_start;
2858 mbcset->range_ends = new_array_end;
2859 *range_alloc = new_nranges;
2860 }
2861 # endif /* RE_ENABLE_I18N */
2862
2863 /* Equivalence Classes and Character Classes can't be a range
2864 start/end. */
2865 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2866 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2867 0))
2868 return REG_ERANGE;
2869
2870 start_collseq = lookup_collation_sequence_value (start_elem);
2871 end_collseq = lookup_collation_sequence_value (end_elem);
2872 /* Check start/end collation sequence values. */
2873 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2874 return REG_ECOLLATE;
2875 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2876 return REG_ERANGE;
2877
2878 # ifdef RE_ENABLE_I18N
2879 /* Got valid collation sequence values, add them as a new entry. */
2880 mbcset->range_starts[mbcset->nranges] = start_collseq;
2881 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2882 # endif /* RE_ENABLE_I18N */
2883
2884 /* Build the table for single byte characters. */
2885 for (ch = 0; ch <= SBC_MAX; ch++)
2886 {
2887 uint32_t ch_collseq;
2888 /*
2889 if (MB_CUR_MAX == 1)
2890 */
2891 if (nrules == 0)
2892 ch_collseq = collseqmb[ch];
2893 else
2894 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2895 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2896 bitset_set (sbcset, ch);
2897 }
2898 return REG_NOERROR;
2899 }
2900
2901 /* Local function for parse_bracket_exp used in _LIBC environement.
2902 Build the collating element which is represented by NAME.
2903 The result are written to MBCSET and SBCSET.
2904 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2905 pointer argument sinse we may update it. */
2906
2907 static inline reg_errcode_t
2908 __attribute ((always_inline))
2909 # ifdef RE_ENABLE_I18N
2910 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2911 re_charset_t *mbcset;
2912 int *coll_sym_alloc;
2913 # else /* not RE_ENABLE_I18N */
2914 build_collating_symbol (sbcset, name)
2915 # endif /* not RE_ENABLE_I18N */
2916 re_bitset_ptr_t sbcset;
2917 const unsigned char *name;
2918 {
2919 int32_t elem, idx;
2920 size_t name_len = strlen ((const char *) name);
2921 if (nrules != 0)
2922 {
2923 elem = seek_collating_symbol_entry (name, name_len);
2924 if (symb_table[2 * elem] != 0)
2925 {
2926 /* We found the entry. */
2927 idx = symb_table[2 * elem + 1];
2928 /* Skip the name of collating element name. */
2929 idx += 1 + extra[idx];
2930 }
2931 else if (symb_table[2 * elem] == 0 && name_len == 1)
2932 {
2933 /* No valid character, treat it as a normal
2934 character. */
2935 bitset_set (sbcset, name[0]);
2936 return REG_NOERROR;
2937 }
2938 else
2939 return REG_ECOLLATE;
2940
2941 # ifdef RE_ENABLE_I18N
2942 /* Got valid collation sequence, add it as a new entry. */
2943 /* Check the space of the arrays. */
2944 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2945 {
2946 /* Not enough, realloc it. */
2947 /* +1 in case of mbcset->ncoll_syms is 0. */
2948 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2949 /* Use realloc since mbcset->coll_syms is NULL
2950 if *alloc == 0. */
2951 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2952 new_coll_sym_alloc);
2953 if (BE (new_coll_syms == NULL, 0))
2954 return REG_ESPACE;
2955 mbcset->coll_syms = new_coll_syms;
2956 *coll_sym_alloc = new_coll_sym_alloc;
2957 }
2958 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2959 # endif /* RE_ENABLE_I18N */
2960 return REG_NOERROR;
2961 }
2962 else
2963 {
2964 if (BE (name_len != 1, 0))
2965 return REG_ECOLLATE;
2966 else
2967 {
2968 bitset_set (sbcset, name[0]);
2969 return REG_NOERROR;
2970 }
2971 }
2972 }
2973 #endif
2974
2975 re_token_t br_token;
2976 re_bitset_ptr_t sbcset;
2977 #ifdef RE_ENABLE_I18N
2978 re_charset_t *mbcset;
2979 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2980 int equiv_class_alloc = 0, char_class_alloc = 0;
2981 #else /* not RE_ENABLE_I18N */
2982 int non_match = 0;
2983 #endif /* not RE_ENABLE_I18N */
2984 bin_tree_t *work_tree;
2985 int token_len;
2986 #ifdef _LIBC
2987 collseqmb = (const unsigned char *)
2988 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2989 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2990 if (nrules)
2991 {
2992 /*
2993 if (MB_CUR_MAX > 1)
2994 */
2995 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2996 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2997 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2998 _NL_COLLATE_SYMB_TABLEMB);
2999 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3000 _NL_COLLATE_SYMB_EXTRAMB);
3001 }
3002 #endif
3003 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3004 #ifdef RE_ENABLE_I18N
3005 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3006 #endif /* RE_ENABLE_I18N */
3007 #ifdef RE_ENABLE_I18N
3008 if (BE (sbcset == NULL || mbcset == NULL, 0))
3009 #else
3010 if (BE (sbcset == NULL, 0))
3011 #endif /* RE_ENABLE_I18N */
3012 {
3013 *err = REG_ESPACE;
3014 return NULL;
3015 }
3016
3017 token_len = peek_token_bracket (token, regexp, syntax);
3018 if (BE (token->type == END_OF_RE, 0))
3019 {
3020 *err = REG_BADPAT;
3021 goto parse_bracket_exp_free_return;
3022 }
3023 if (token->type == OP_NON_MATCH_LIST)
3024 {
3025 #ifdef RE_ENABLE_I18N
3026 mbcset->non_match = 1;
3027 #else /* not RE_ENABLE_I18N */
3028 non_match = 1;
3029 #endif /* not RE_ENABLE_I18N */
3030 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3031 bitset_set (sbcset, '\0');
3032 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3033 token_len = peek_token_bracket (token, regexp, syntax);
3034 if (BE (token->type == END_OF_RE, 0))
3035 {
3036 *err = REG_BADPAT;
3037 goto parse_bracket_exp_free_return;
3038 }
3039 }
3040
3041 /* We treat the first ']' as a normal character. */
3042 if (token->type == OP_CLOSE_BRACKET)
3043 token->type = CHARACTER;
3044
3045 int first_round = 1;
3046 while (1)
3047 {
3048 bracket_elem_t start_elem, end_elem;
3049 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3050 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3051 reg_errcode_t ret;
3052 int token_len2 = 0, is_range_exp = 0;
3053 re_token_t token2;
3054
3055 start_elem.opr.name = start_name_buf;
3056 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3057 syntax, first_round);
3058 if (BE (ret != REG_NOERROR, 0))
3059 {
3060 *err = ret;
3061 goto parse_bracket_exp_free_return;
3062 }
3063 first_round = 0;
3064
3065 /* Get information about the next token. We need it in any case. */
3066 token_len = peek_token_bracket (token, regexp, syntax);
3067
3068 /* Do not check for ranges if we know they are not allowed. */
3069 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3070 {
3071 if (BE (token->type == END_OF_RE, 0))
3072 {
3073 *err = REG_EBRACK;
3074 goto parse_bracket_exp_free_return;
3075 }
3076 if (token->type == OP_CHARSET_RANGE)
3077 {
3078 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3079 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3080 if (BE (token2.type == END_OF_RE, 0))
3081 {
3082 *err = REG_EBRACK;
3083 goto parse_bracket_exp_free_return;
3084 }
3085 if (token2.type == OP_CLOSE_BRACKET)
3086 {
3087 /* We treat the last '-' as a normal character. */
3088 re_string_skip_bytes (regexp, -token_len);
3089 token->type = CHARACTER;
3090 }
3091 else
3092 is_range_exp = 1;
3093 }
3094 }
3095
3096 if (is_range_exp == 1)
3097 {
3098 end_elem.opr.name = end_name_buf;
3099 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3100 dfa, syntax, 1);
3101 if (BE (ret != REG_NOERROR, 0))
3102 {
3103 *err = ret;
3104 goto parse_bracket_exp_free_return;
3105 }
3106
3107 token_len = peek_token_bracket (token, regexp, syntax);
3108
3109 *err = build_range_exp (sbcset,
3110 #ifdef RE_ENABLE_I18N
3111 mbcset, &range_alloc,
3112 #endif /* RE_ENABLE_I18N */
3113 &start_elem, &end_elem);
3114 if (BE (*err != REG_NOERROR, 0))
3115 goto parse_bracket_exp_free_return;
3116 }
3117 else
3118 {
3119 switch (start_elem.type)
3120 {
3121 case SB_CHAR:
3122 bitset_set (sbcset, start_elem.opr.ch);
3123 break;
3124 #ifdef RE_ENABLE_I18N
3125 case MB_CHAR:
3126 /* Check whether the array has enough space. */
3127 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3128 {
3129 wchar_t *new_mbchars;
3130 /* Not enough, realloc it. */
3131 /* +1 in case of mbcset->nmbchars is 0. */
3132 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3133 /* Use realloc since array is NULL if *alloc == 0. */
3134 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3135 mbchar_alloc);
3136 if (BE (new_mbchars == NULL, 0))
3137 goto parse_bracket_exp_espace;
3138 mbcset->mbchars = new_mbchars;
3139 }
3140 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3141 break;
3142 #endif /* RE_ENABLE_I18N */
3143 case EQUIV_CLASS:
3144 *err = build_equiv_class (sbcset,
3145 #ifdef RE_ENABLE_I18N
3146 mbcset, &equiv_class_alloc,
3147 #endif /* RE_ENABLE_I18N */
3148 start_elem.opr.name);
3149 if (BE (*err != REG_NOERROR, 0))
3150 goto parse_bracket_exp_free_return;
3151 break;
3152 case COLL_SYM:
3153 *err = build_collating_symbol (sbcset,
3154 #ifdef RE_ENABLE_I18N
3155 mbcset, &coll_sym_alloc,
3156 #endif /* RE_ENABLE_I18N */
3157 start_elem.opr.name);
3158 if (BE (*err != REG_NOERROR, 0))
3159 goto parse_bracket_exp_free_return;
3160 break;
3161 case CHAR_CLASS:
3162 *err = build_charclass (regexp->trans, sbcset,
3163 #ifdef RE_ENABLE_I18N
3164 mbcset, &char_class_alloc,
3165 #endif /* RE_ENABLE_I18N */
3166 start_elem.opr.name, syntax);
3167 if (BE (*err != REG_NOERROR, 0))
3168 goto parse_bracket_exp_free_return;
3169 break;
3170 default:
3171 assert (0);
3172 break;
3173 }
3174 }
3175 if (BE (token->type == END_OF_RE, 0))
3176 {
3177 *err = REG_EBRACK;
3178 goto parse_bracket_exp_free_return;
3179 }
3180 if (token->type == OP_CLOSE_BRACKET)
3181 break;
3182 }
3183
3184 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3185
3186 /* If it is non-matching list. */
3187 #ifdef RE_ENABLE_I18N
3188 if (mbcset->non_match)
3189 #else /* not RE_ENABLE_I18N */
3190 if (non_match)
3191 #endif /* not RE_ENABLE_I18N */
3192 bitset_not (sbcset);
3193 #ifdef RE_ENABLE_I18N
3194 /* Ensure only single byte characters are set. */
3195 if (dfa->mb_cur_max > 1)
3196 bitset_mask (sbcset, dfa->sb_char);
3197 #endif /* RE_ENABLE_I18N */
3198
3199 /* Build a tree for simple bracket. */
3200 br_token.type = SIMPLE_BRACKET;
3201 br_token.opr.sbcset = sbcset;
3202 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3203 if (BE (work_tree == NULL, 0))
3204 goto parse_bracket_exp_espace;
3205
3206 #ifdef RE_ENABLE_I18N
3207 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3208 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3209 || mbcset->non_match)))
3210 {
3211 re_token_t alt_token;
3212 bin_tree_t *mbc_tree;
3213 int sbc_idx;
3214 /* Build a tree for complex bracket. */
3215 dfa->has_mb_node = 1;
3216 dfa->has_plural_match = 1;
3217 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3218 if (sbcset[sbc_idx])
3219 break;
3220 /* If there are no bits set in sbcset, there is no point
3221 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3222 if (sbc_idx == BITSET_UINTS)
3223 {
3224 re_free (sbcset);
3225 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3226 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3227 return work_tree;
3228 }
3229 br_token.type = COMPLEX_BRACKET;
3230 br_token.opr.mbcset = mbcset;
3231 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3232 if (BE (mbc_tree == NULL, 0))
3233 goto parse_bracket_exp_espace;
3234 /* Then join them by ALT node. */
3235 alt_token.type = OP_ALT;
3236 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3237 if (BE (mbc_tree != NULL, 1))
3238 return work_tree;
3239 }
3240 else
3241 {
3242 free_charset (mbcset);
3243 return work_tree;
3244 }
3245 #else /* not RE_ENABLE_I18N */
3246 return work_tree;
3247 #endif /* not RE_ENABLE_I18N */
3248
3249 parse_bracket_exp_espace:
3250 *err = REG_ESPACE;
3251 parse_bracket_exp_free_return:
3252 re_free (sbcset);
3253 #ifdef RE_ENABLE_I18N
3254 free_charset (mbcset);
3255 #endif /* RE_ENABLE_I18N */
3256 return NULL;
3257 }
3258
3259 /* Parse an element in the bracket expression. */
3260
3261 static reg_errcode_t
3262 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3263 accept_hyphen)
3264 bracket_elem_t *elem;
3265 re_string_t *regexp;
3266 re_token_t *token;
3267 int token_len;
3268 re_dfa_t *dfa;
3269 reg_syntax_t syntax;
3270 int accept_hyphen;
3271 {
3272 #ifdef RE_ENABLE_I18N
3273 int cur_char_size;
3274 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3275 if (cur_char_size > 1)
3276 {
3277 elem->type = MB_CHAR;
3278 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3279 re_string_skip_bytes (regexp, cur_char_size);
3280 return REG_NOERROR;
3281 }
3282 #endif /* RE_ENABLE_I18N */
3283 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3284 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3285 || token->type == OP_OPEN_EQUIV_CLASS)
3286 return parse_bracket_symbol (elem, regexp, token);
3287 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3288 {
3289 /* A '-' must only appear as anything but a range indicator before
3290 the closing bracket. Everything else is an error. */
3291 re_token_t token2;
3292 (void) peek_token_bracket (&token2, regexp, syntax);
3293 if (token2.type != OP_CLOSE_BRACKET)
3294 /* The actual error value is not standardized since this whole
3295 case is undefined. But ERANGE makes good sense. */
3296 return REG_ERANGE;
3297 }
3298 elem->type = SB_CHAR;
3299 elem->opr.ch = token->opr.c;
3300 return REG_NOERROR;
3301 }
3302
3303 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3304 such as [:<character_class>:], [.<collating_element>.], and
3305 [=<equivalent_class>=]. */
3306
3307 static reg_errcode_t
3308 parse_bracket_symbol (elem, regexp, token)
3309 bracket_elem_t *elem;
3310 re_string_t *regexp;
3311 re_token_t *token;
3312 {
3313 unsigned char ch, delim = token->opr.c;
3314 int i = 0;
3315 for (;; ++i)
3316 {
3317 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3318 return REG_EBRACK;
3319 if (token->type == OP_OPEN_CHAR_CLASS)
3320 ch = re_string_fetch_byte_case (regexp);
3321 else
3322 ch = re_string_fetch_byte (regexp);
3323 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3324 break;
3325 elem->opr.name[i] = ch;
3326 }
3327 re_string_skip_bytes (regexp, 1);
3328 elem->opr.name[i] = '\0';
3329 switch (token->type)
3330 {
3331 case OP_OPEN_COLL_ELEM:
3332 elem->type = COLL_SYM;
3333 break;
3334 case OP_OPEN_EQUIV_CLASS:
3335 elem->type = EQUIV_CLASS;
3336 break;
3337 case OP_OPEN_CHAR_CLASS:
3338 elem->type = CHAR_CLASS;
3339 break;
3340 default:
3341 break;
3342 }
3343 return REG_NOERROR;
3344 }
3345
3346 /* Helper function for parse_bracket_exp.
3347 Build the equivalence class which is represented by NAME.
3348 The result are written to MBCSET and SBCSET.
3349 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3350 is a pointer argument sinse we may update it. */
3351
3352 static reg_errcode_t
3353 #ifdef RE_ENABLE_I18N
3354 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3355 re_charset_t *mbcset;
3356 int *equiv_class_alloc;
3357 #else /* not RE_ENABLE_I18N */
3358 build_equiv_class (sbcset, name)
3359 #endif /* not RE_ENABLE_I18N */
3360 re_bitset_ptr_t sbcset;
3361 const unsigned char *name;
3362 {
3363 #if defined _LIBC && defined RE_ENABLE_I18N
3364 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3365 if (nrules != 0)
3366 {
3367 const int32_t *table, *indirect;
3368 const unsigned char *weights, *extra, *cp;
3369 unsigned char char_buf[2];
3370 int32_t idx1, idx2;
3371 unsigned int ch;
3372 size_t len;
3373 /* This #include defines a local function! */
3374 # include <locale/weight.h>
3375 /* Calculate the index for equivalence class. */
3376 cp = name;
3377 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3378 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3379 _NL_COLLATE_WEIGHTMB);
3380 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3381 _NL_COLLATE_EXTRAMB);
3382 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3383 _NL_COLLATE_INDIRECTMB);
3384 idx1 = findidx (&cp);
3385 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3386 /* This isn't a valid character. */
3387 return REG_ECOLLATE;
3388
3389 /* Build single byte matcing table for this equivalence class. */
3390 char_buf[1] = (unsigned char) '\0';
3391 len = weights[idx1];
3392 for (ch = 0; ch < SBC_MAX; ++ch)
3393 {
3394 char_buf[0] = ch;
3395 cp = char_buf;
3396 idx2 = findidx (&cp);
3397 /*
3398 idx2 = table[ch];
3399 */
3400 if (idx2 == 0)
3401 /* This isn't a valid character. */
3402 continue;
3403 if (len == weights[idx2])
3404 {
3405 int cnt = 0;
3406 while (cnt <= len &&
3407 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3408 ++cnt;
3409
3410 if (cnt > len)
3411 bitset_set (sbcset, ch);
3412 }
3413 }
3414 /* Check whether the array has enough space. */
3415 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3416 {
3417 /* Not enough, realloc it. */
3418 /* +1 in case of mbcset->nequiv_classes is 0. */
3419 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3420 /* Use realloc since the array is NULL if *alloc == 0. */
3421 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3422 int32_t,
3423 new_equiv_class_alloc);
3424 if (BE (new_equiv_classes == NULL, 0))
3425 return REG_ESPACE;
3426 mbcset->equiv_classes = new_equiv_classes;
3427 *equiv_class_alloc = new_equiv_class_alloc;
3428 }
3429 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3430 }
3431 else
3432 #endif /* _LIBC && RE_ENABLE_I18N */
3433 {
3434 if (BE (strlen ((const char *) name) != 1, 0))
3435 return REG_ECOLLATE;
3436 bitset_set (sbcset, *name);
3437 }
3438 return REG_NOERROR;
3439 }
3440
3441 /* Helper function for parse_bracket_exp.
3442 Build the character class which is represented by NAME.
3443 The result are written to MBCSET and SBCSET.
3444 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3445 is a pointer argument sinse we may update it. */
3446
3447 static reg_errcode_t
3448 #ifdef RE_ENABLE_I18N
3449 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3450 re_charset_t *mbcset;
3451 int *char_class_alloc;
3452 #else /* not RE_ENABLE_I18N */
3453 build_charclass (trans, sbcset, class_name, syntax)
3454 #endif /* not RE_ENABLE_I18N */
3455 RE_TRANSLATE_TYPE trans;
3456 re_bitset_ptr_t sbcset;
3457 const unsigned char *class_name;
3458 reg_syntax_t syntax;
3459 {
3460 int i;
3461 const char *name = (const char *) class_name;
3462
3463 /* In case of REG_ICASE "upper" and "lower" match the both of
3464 upper and lower cases. */
3465 if ((syntax & RE_ICASE)
3466 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3467 name = "alpha";
3468
3469 #ifdef RE_ENABLE_I18N
3470 /* Check the space of the arrays. */
3471 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3472 {
3473 /* Not enough, realloc it. */
3474 /* +1 in case of mbcset->nchar_classes is 0. */
3475 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3476 /* Use realloc since array is NULL if *alloc == 0. */
3477 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3478 new_char_class_alloc);
3479 if (BE (new_char_classes == NULL, 0))
3480 return REG_ESPACE;
3481 mbcset->char_classes = new_char_classes;
3482 *char_class_alloc = new_char_class_alloc;
3483 }
3484 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3485 #endif /* RE_ENABLE_I18N */
3486
3487 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3488 for (i = 0; i < SBC_MAX; ++i) \
3489 { \
3490 if (ctype_func (i)) \
3491 { \
3492 int ch = trans ? trans[i] : i; \
3493 bitset_set (sbcset, ch); \
3494 } \
3495 }
3496
3497 if (strcmp (name, "alnum") == 0)
3498 BUILD_CHARCLASS_LOOP (isalnum)
3499 else if (strcmp (name, "cntrl") == 0)
3500 BUILD_CHARCLASS_LOOP (iscntrl)
3501 else if (strcmp (name, "lower") == 0)
3502 BUILD_CHARCLASS_LOOP (islower)
3503 else if (strcmp (name, "space") == 0)
3504 BUILD_CHARCLASS_LOOP (isspace)
3505 else if (strcmp (name, "alpha") == 0)
3506 BUILD_CHARCLASS_LOOP (isalpha)
3507 else if (strcmp (name, "digit") == 0)
3508 BUILD_CHARCLASS_LOOP (isdigit)
3509 else if (strcmp (name, "print") == 0)
3510 BUILD_CHARCLASS_LOOP (isprint)
3511 else if (strcmp (name, "upper") == 0)
3512 BUILD_CHARCLASS_LOOP (isupper)
3513 else if (strcmp (name, "blank") == 0)
3514 BUILD_CHARCLASS_LOOP (isblank)
3515 else if (strcmp (name, "graph") == 0)
3516 BUILD_CHARCLASS_LOOP (isgraph)
3517 else if (strcmp (name, "punct") == 0)
3518 BUILD_CHARCLASS_LOOP (ispunct)
3519 else if (strcmp (name, "xdigit") == 0)
3520 BUILD_CHARCLASS_LOOP (isxdigit)
3521 else
3522 return REG_ECTYPE;
3523
3524 return REG_NOERROR;
3525 }
3526
3527 static bin_tree_t *
3528 build_charclass_op (dfa, trans, class_name, extra, not, err)
3529 re_dfa_t *dfa;
3530 RE_TRANSLATE_TYPE trans;
3531 const unsigned char *class_name;
3532 const unsigned char *extra;
3533 int not;
3534 reg_errcode_t *err;
3535 {
3536 re_bitset_ptr_t sbcset;
3537 #ifdef RE_ENABLE_I18N
3538 re_charset_t *mbcset;
3539 int alloc = 0;
3540 #else /* not RE_ENABLE_I18N */
3541 int non_match = 0;
3542 #endif /* not RE_ENABLE_I18N */
3543 reg_errcode_t ret;
3544 re_token_t br_token;
3545 bin_tree_t *tree;
3546
3547 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3548 #ifdef RE_ENABLE_I18N
3549 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3550 #endif /* RE_ENABLE_I18N */
3551
3552 #ifdef RE_ENABLE_I18N
3553 if (BE (sbcset == NULL || mbcset == NULL, 0))
3554 #else /* not RE_ENABLE_I18N */
3555 if (BE (sbcset == NULL, 0))
3556 #endif /* not RE_ENABLE_I18N */
3557 {
3558 *err = REG_ESPACE;
3559 return NULL;
3560 }
3561
3562 if (not)
3563 {
3564 #ifdef RE_ENABLE_I18N
3565 /*
3566 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3567 bitset_set(cset->sbcset, '\0');
3568 */
3569 mbcset->non_match = 1;
3570 #else /* not RE_ENABLE_I18N */
3571 non_match = 1;
3572 #endif /* not RE_ENABLE_I18N */
3573 }
3574
3575 /* We don't care the syntax in this case. */
3576 ret = build_charclass (trans, sbcset,
3577 #ifdef RE_ENABLE_I18N
3578 mbcset, &alloc,
3579 #endif /* RE_ENABLE_I18N */
3580 class_name, 0);
3581
3582 if (BE (ret != REG_NOERROR, 0))
3583 {
3584 re_free (sbcset);
3585 #ifdef RE_ENABLE_I18N
3586 free_charset (mbcset);
3587 #endif /* RE_ENABLE_I18N */
3588 *err = ret;
3589 return NULL;
3590 }
3591 /* \w match '_' also. */
3592 for (; *extra; extra++)
3593 bitset_set (sbcset, *extra);
3594
3595 /* If it is non-matching list. */
3596 #ifdef RE_ENABLE_I18N
3597 if (mbcset->non_match)
3598 #else /* not RE_ENABLE_I18N */
3599 if (non_match)
3600 #endif /* not RE_ENABLE_I18N */
3601 bitset_not (sbcset);
3602
3603 #ifdef RE_ENABLE_I18N
3604 /* Ensure only single byte characters are set. */
3605 if (dfa->mb_cur_max > 1)
3606 bitset_mask (sbcset, dfa->sb_char);
3607 #endif
3608
3609 /* Build a tree for simple bracket. */
3610 br_token.type = SIMPLE_BRACKET;
3611 br_token.opr.sbcset = sbcset;
3612 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3613 if (BE (tree == NULL, 0))
3614 goto build_word_op_espace;
3615
3616 #ifdef RE_ENABLE_I18N
3617 if (dfa->mb_cur_max > 1)
3618 {
3619 re_token_t alt_token;
3620 bin_tree_t *mbc_tree;
3621 /* Build a tree for complex bracket. */
3622 br_token.type = COMPLEX_BRACKET;
3623 br_token.opr.mbcset = mbcset;
3624 dfa->has_mb_node = 1;
3625 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3626 if (BE (mbc_tree == NULL, 0))
3627 goto build_word_op_espace;
3628 /* Then join them by ALT node. */
3629 alt_token.type = OP_ALT;
3630 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3631 if (BE (mbc_tree != NULL, 1))
3632 return tree;
3633 }
3634 else
3635 {
3636 free_charset (mbcset);
3637 return tree;
3638 }
3639 #else /* not RE_ENABLE_I18N */
3640 return tree;
3641 #endif /* not RE_ENABLE_I18N */
3642
3643 build_word_op_espace:
3644 re_free (sbcset);
3645 #ifdef RE_ENABLE_I18N
3646 free_charset (mbcset);
3647 #endif /* RE_ENABLE_I18N */
3648 *err = REG_ESPACE;
3649 return NULL;
3650 }
3651
3652 /* This is intended for the expressions like "a{1,3}".
3653 Fetch a number from `input', and return the number.
3654 Return -1, if the number field is empty like "{,1}".
3655 Return -2, If an error is occured. */
3656
3657 static int
3658 fetch_number (input, token, syntax)
3659 re_string_t *input;
3660 re_token_t *token;
3661 reg_syntax_t syntax;
3662 {
3663 int num = -1;
3664 unsigned char c;
3665 while (1)
3666 {
3667 fetch_token (token, input, syntax);
3668 c = token->opr.c;
3669 if (BE (token->type == END_OF_RE, 0))
3670 return -2;
3671 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3672 break;
3673 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3674 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3675 num = (num > RE_DUP_MAX) ? -2 : num;
3676 }
3677 return num;
3678 }
3679 \f
3680 #ifdef RE_ENABLE_I18N
3681 static void
3682 free_charset (re_charset_t *cset)
3683 {
3684 re_free (cset->mbchars);
3685 # ifdef _LIBC
3686 re_free (cset->coll_syms);
3687 re_free (cset->equiv_classes);
3688 re_free (cset->range_starts);
3689 re_free (cset->range_ends);
3690 # endif
3691 re_free (cset->char_classes);
3692 re_free (cset);
3693 }
3694 #endif /* RE_ENABLE_I18N */
3695 \f
3696 /* Functions for binary tree operation. */
3697
3698 /* Create a tree node. */
3699
3700 static bin_tree_t *
3701 create_tree (dfa, left, right, type, index)
3702 re_dfa_t *dfa;
3703 bin_tree_t *left;
3704 bin_tree_t *right;
3705 re_token_type_t type;
3706 int index;
3707 {
3708 bin_tree_t *tree;
3709 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3710 {
3711 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3712
3713 if (storage == NULL)
3714 return NULL;
3715 storage->next = dfa->str_tree_storage;
3716 dfa->str_tree_storage = storage;
3717 dfa->str_tree_storage_idx = 0;
3718 }
3719 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3720
3721 tree->parent = NULL;
3722 tree->left = left;
3723 tree->right = right;
3724 tree->type = type;
3725 tree->node_idx = index;
3726 tree->first = -1;
3727 tree->next = -1;
3728 re_node_set_init_empty (&tree->eclosure);
3729
3730 if (left != NULL)
3731 left->parent = tree;
3732 if (right != NULL)
3733 right->parent = tree;
3734 return tree;
3735 }
3736
3737 /* Create both a DFA node and a tree for it. */
3738
3739 static bin_tree_t *
3740 re_dfa_add_tree_node (dfa, left, right, token)
3741 re_dfa_t *dfa;
3742 bin_tree_t *left;
3743 bin_tree_t *right;
3744 const re_token_t *token;
3745 {
3746 int new_idx = re_dfa_add_node (dfa, *token, 0);
3747
3748 if (new_idx == -1)
3749 return NULL;
3750
3751 return create_tree (dfa, left, right, 0, new_idx);
3752 }
3753
3754
3755 /* Duplicate the node SRC, and return new node. */
3756
3757 static bin_tree_t *
3758 duplicate_tree (src, dfa)
3759 const bin_tree_t *src;
3760 re_dfa_t *dfa;
3761 {
3762 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3763 int new_node_idx;
3764 /* Since node indies must be according to Post-order of the tree,
3765 we must duplicate the left at first. */
3766 if (src->left != NULL)
3767 {
3768 left = duplicate_tree (src->left, dfa);
3769 if (left == NULL)
3770 return NULL;
3771 }
3772
3773 /* Secondaly, duplicate the right. */
3774 if (src->right != NULL)
3775 {
3776 right = duplicate_tree (src->right, dfa);
3777 if (right == NULL)
3778 return NULL;
3779 }
3780
3781 /* At last, duplicate itself. */
3782 if (src->type == NON_TYPE)
3783 {
3784 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3785 dfa->nodes[new_node_idx].duplicated = 1;
3786 if (BE (new_node_idx == -1, 0))
3787 return NULL;
3788 }
3789 else
3790 new_node_idx = src->type;
3791
3792 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3793 return new_tree;
3794 }