]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regcomp.c
Update.
[thirdparty/glibc.git] / posix / regcomp.c
CommitLineData
3b0bdc72
UD
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21#include <assert.h>
22#include <ctype.h>
23#include <limits.h>
24#include <locale.h>
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <wchar.h>
29#include <wctype.h>
30
c0a0f9a3
UD
31/* In case that the system doesn't have isblank(). */
32#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
33# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
34#endif
35
3b0bdc72
UD
36#ifdef _LIBC
37# ifndef _RE_DEFINE_LOCALE_FUNCTIONS
38# define _RE_DEFINE_LOCALE_FUNCTIONS 1
39# include <locale/localeinfo.h>
40# include <locale/elem-hash.h>
41# include <locale/coll-lookup.h>
42# endif
43#endif
44
45/* This is for other GNU distributions with internationalized messages. */
46#if HAVE_LIBINTL_H || defined _LIBC
47# include <libintl.h>
48# ifdef _LIBC
49# undef gettext
71319b9c 50# define gettext(msgid) \
c7769404 51 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
3b0bdc72
UD
52# endif
53#else
54# define gettext(msgid) (msgid)
55#endif
56
57#ifndef gettext_noop
58/* This define is so xgettext can find the internationalizable
59 strings. */
60# define gettext_noop(String) String
61#endif
62
37de950b 63#include <regex.h>
3b0bdc72
UD
64#include "regex_internal.h"
65
66static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
67 int length, reg_syntax_t syntax);
68static void re_compile_fastmap_iter (regex_t *bufp,
69 const re_dfastate_t *init_state,
70 char *fastmap);
71static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
a9388965 72static reg_errcode_t init_word_char (re_dfa_t *dfa);
c0a0f9a3 73#ifdef RE_ENABLE_I18N
3b0bdc72 74static void free_charset (re_charset_t *cset);
c0a0f9a3 75#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
76static void free_workarea_compile (regex_t *preg);
77static reg_errcode_t create_initial_state (re_dfa_t *dfa);
78static reg_errcode_t analyze (re_dfa_t *dfa);
79static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
80static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
81static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
82static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
a9388965
UD
83static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
84 unsigned int constraint);
3b0bdc72 85static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
a9388965
UD
86static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
87 int node, int root);
3b0bdc72
UD
88static void calc_inveclosure (re_dfa_t *dfa);
89static int fetch_number (re_string_t *input, re_token_t *token,
90 reg_syntax_t syntax);
91static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
92static int peek_token (re_token_t *token, re_string_t *input,
93 reg_syntax_t syntax);
94static int peek_token_bracket (re_token_t *token, re_string_t *input,
95 reg_syntax_t syntax);
96static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
97 reg_syntax_t syntax, reg_errcode_t *err);
98static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
99 re_token_t *token, reg_syntax_t syntax,
100 int nest, reg_errcode_t *err);
101static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
102 re_token_t *token, reg_syntax_t syntax,
103 int nest, reg_errcode_t *err);
104static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
105 re_token_t *token, reg_syntax_t syntax,
106 int nest, reg_errcode_t *err);
107static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
108 re_token_t *token, reg_syntax_t syntax,
109 int nest, reg_errcode_t *err);
110static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
111 re_dfa_t *dfa, re_token_t *token,
112 reg_syntax_t syntax, reg_errcode_t *err);
113static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
114 re_token_t *token, reg_syntax_t syntax,
115 reg_errcode_t *err);
116static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
117 re_string_t *regexp,
118 re_token_t *token, int token_len,
119 re_dfa_t *dfa,
120 reg_syntax_t syntax);
121static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
122 re_string_t *regexp,
123 re_token_t *token);
434d3784 124#ifndef _LIBC
c0a0f9a3
UD
125# ifdef RE_ENABLE_I18N
126static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
127 re_charset_t *mbcset, int *range_alloc,
434d3784
UD
128 bracket_elem_t *start_elem,
129 bracket_elem_t *end_elem);
c0a0f9a3
UD
130static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
131 re_charset_t *mbcset,
434d3784
UD
132 int *coll_sym_alloc,
133 unsigned char *name);
c0a0f9a3
UD
134# else /* not RE_ENABLE_I18N */
135static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
136 bracket_elem_t *start_elem,
137 bracket_elem_t *end_elem);
138static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
139 unsigned char *name);
140# endif /* not RE_ENABLE_I18N */
434d3784 141#endif /* not _LIBC */
c0a0f9a3
UD
142#ifdef RE_ENABLE_I18N
143static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
144 re_charset_t *mbcset,
3b0bdc72
UD
145 int *equiv_class_alloc,
146 const unsigned char *name);
c0a0f9a3
UD
147static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
148 re_charset_t *mbcset,
3b0bdc72 149 int *char_class_alloc,
602c2f9d
UD
150 const unsigned char *class_name,
151 reg_syntax_t syntax);
c0a0f9a3
UD
152#else /* not RE_ENABLE_I18N */
153static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
154 const unsigned char *name);
155static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
156 const unsigned char *class_name,
157 reg_syntax_t syntax);
158#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
159static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
160static void free_bin_tree (bin_tree_t *tree);
161static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
162 re_token_type_t type, int index);
163static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
164\f
165/* This table gives an error message for each of the error codes listed
166 in regex.h. Obviously the order here has to be same as there.
167 POSIX doesn't require that we do anything for REG_NOERROR,
168 but why not be nice? */
169
6455d255 170const char __re_error_msgid[] attribute_hidden =
3b0bdc72
UD
171 {
172#define REG_NOERROR_IDX 0
173 gettext_noop ("Success") /* REG_NOERROR */
174 "\0"
175#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
176 gettext_noop ("No match") /* REG_NOMATCH */
177 "\0"
178#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
179 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
180 "\0"
181#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
182 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
183 "\0"
184#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
185 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
186 "\0"
187#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
188 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
189 "\0"
190#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
191 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
192 "\0"
193#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
194 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
195 "\0"
196#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
197 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
198 "\0"
199#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
200 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
201 "\0"
202#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
203 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
204 "\0"
205#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
206 gettext_noop ("Invalid range end") /* REG_ERANGE */
207 "\0"
208#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
209 gettext_noop ("Memory exhausted") /* REG_ESPACE */
210 "\0"
211#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
212 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
213 "\0"
214#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
215 gettext_noop ("Premature end of regular expression") /* REG_EEND */
216 "\0"
217#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
218 gettext_noop ("Regular expression too big") /* REG_ESIZE */
219 "\0"
220#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
221 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
222 };
223
6455d255 224const size_t __re_error_msgid_idx[] attribute_hidden =
3b0bdc72
UD
225 {
226 REG_NOERROR_IDX,
227 REG_NOMATCH_IDX,
228 REG_BADPAT_IDX,
229 REG_ECOLLATE_IDX,
230 REG_ECTYPE_IDX,
231 REG_EESCAPE_IDX,
232 REG_ESUBREG_IDX,
233 REG_EBRACK_IDX,
234 REG_EPAREN_IDX,
235 REG_EBRACE_IDX,
236 REG_BADBR_IDX,
237 REG_ERANGE_IDX,
238 REG_ESPACE_IDX,
239 REG_BADRPT_IDX,
240 REG_EEND_IDX,
241 REG_ESIZE_IDX,
242 REG_ERPAREN_IDX
243 };
244\f
245/* Entry points for GNU code. */
246
247/* re_compile_pattern is the GNU regular expression compiler: it
248 compiles PATTERN (of length SIZE) and puts the result in BUFP.
249 Returns 0 if the pattern was valid, otherwise an error string.
250
251 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
252 are set in BUFP on entry. */
253
254const char *
255re_compile_pattern (pattern, length, bufp)
256 const char *pattern;
257 size_t length;
258 struct re_pattern_buffer *bufp;
259{
260 reg_errcode_t ret;
261
262 /* GNU code is written to assume at least RE_NREGS registers will be set
263 (and at least one extra will be -1). */
264 bufp->regs_allocated = REGS_UNALLOCATED;
265
266 /* And GNU code determines whether or not to get register information
267 by passing null for the REGS argument to re_match, etc., not by
268 setting no_sub. */
269 bufp->no_sub = 0;
270
271 /* Match anchors at newline. */
272 bufp->newline_anchor = 1;
273
274 ret = re_compile_internal (bufp, (const unsigned char *) pattern, length,
275 re_syntax_options);
276
277 if (!ret)
278 return NULL;
6455d255 279 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
280}
281#ifdef _LIBC
282weak_alias (__re_compile_pattern, re_compile_pattern)
283#endif
284
285/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
286 also be assigned to arbitrarily: each pattern buffer stores its own
287 syntax, so it can be changed between regex compilations. */
288/* This has no initializer because initialized variables in Emacs
289 become read-only after dumping. */
290reg_syntax_t re_syntax_options;
291
292
293/* Specify the precise syntax of regexps for compilation. This provides
294 for compatibility for various utilities which historically have
295 different, incompatible syntaxes.
296
297 The argument SYNTAX is a bit mask comprised of the various bits
298 defined in regex.h. We return the old syntax. */
299
300reg_syntax_t
301re_set_syntax (syntax)
302 reg_syntax_t syntax;
303{
304 reg_syntax_t ret = re_syntax_options;
305
306 re_syntax_options = syntax;
307 return ret;
308}
309#ifdef _LIBC
310weak_alias (__re_set_syntax, re_set_syntax)
311#endif
312
313int
314re_compile_fastmap (bufp)
315 struct re_pattern_buffer *bufp;
316{
317 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
318 char *fastmap = bufp->fastmap;
319
320 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
321 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
322 if (dfa->init_state != dfa->init_state_word)
323 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
324 if (dfa->init_state != dfa->init_state_nl)
325 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
326 if (dfa->init_state != dfa->init_state_begbuf)
327 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
328 bufp->fastmap_accurate = 1;
329 return 0;
330}
331#ifdef _LIBC
332weak_alias (__re_compile_fastmap, re_compile_fastmap)
333#endif
334
335/* Helper function for re_compile_fastmap.
336 Compile fastmap for the initial_state INIT_STATE. */
337
338static void
339re_compile_fastmap_iter (bufp, init_state, fastmap)
340 regex_t *bufp;
341 const re_dfastate_t *init_state;
342 char *fastmap;
343{
344 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
345 int node_cnt;
346 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
347 {
348 int node = init_state->nodes.elems[node_cnt];
349 re_token_type_t type = dfa->nodes[node].type;
350 if (type == OP_CONTEXT_NODE)
351 {
352 node = dfa->nodes[node].opr.ctx_info->entity;
353 type = dfa->nodes[node].type;
354 }
355
356 if (type == CHARACTER)
357 fastmap[dfa->nodes[node].opr.c] = 1;
358 else if (type == SIMPLE_BRACKET)
359 {
360 int i, j, ch;
361 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
362 for (j = 0; j < UINT_BITS; ++j, ++ch)
363 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
364 fastmap[ch] = 1;
365 }
c0a0f9a3 366#ifdef RE_ENABLE_I18N
3b0bdc72
UD
367 else if (type == COMPLEX_BRACKET)
368 {
369 int i;
370 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
371 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
372 || cset->nranges || cset->nchar_classes)
373 {
c0a0f9a3 374# ifdef _LIBC
3b0bdc72
UD
375 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
376 {
377 /* In this case we want to catch the bytes which are
378 the first byte of any collation elements.
379 e.g. In da_DK, we want to catch 'a' since "aa"
380 is a valid collation element, and don't catch
381 'b' since 'b' is the only collation element
382 which starts from 'b'. */
383 int j, ch;
384 const int32_t *table = (const int32_t *)
385 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
386 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
387 for (j = 0; j < UINT_BITS; ++j, ++ch)
388 if (table[ch] < 0)
389 fastmap[ch] = 1;
390 }
c0a0f9a3 391# else
434d3784
UD
392 if (MB_CUR_MAX > 1)
393 for (i = 0; i < SBC_MAX; ++i)
394 if (__btowc (i) == WEOF)
395 fastmap[i] = 1;
c0a0f9a3 396# endif /* not _LIBC */
3b0bdc72
UD
397 }
398 for (i = 0; i < cset->nmbchars; ++i)
399 {
400 unsigned char buf[256];
401 wctomb (buf, cset->mbchars[i]);
402 fastmap[buf[0]] = 1;
403 }
404 }
c0a0f9a3
UD
405#endif /* RE_ENABLE_I18N */
406 else if (type == END_OF_RE || type == OP_PERIOD
407#ifdef RE_ENABLE_I18N
408 || type == COMPLEX_BRACKET
409#endif /* RE_ENABLE_I18N */
410 )
3b0bdc72
UD
411 {
412 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
413 if (type == END_OF_RE)
414 bufp->can_be_null = 1;
415 return;
416 }
417 }
418}
419\f
420/* Entry point for POSIX code. */
421/* regcomp takes a regular expression as a string and compiles it.
422
423 PREG is a regex_t *. We do not expect any fields to be initialized,
424 since POSIX says we shouldn't. Thus, we set
425
426 `buffer' to the compiled pattern;
427 `used' to the length of the compiled pattern;
428 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
429 REG_EXTENDED bit in CFLAGS is set; otherwise, to
430 RE_SYNTAX_POSIX_BASIC;
431 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
432 `fastmap' to an allocated space for the fastmap;
433 `fastmap_accurate' to zero;
434 `re_nsub' to the number of subexpressions in PATTERN.
435
436 PATTERN is the address of the pattern string.
437
438 CFLAGS is a series of bits which affect compilation.
439
440 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
441 use POSIX basic syntax.
442
443 If REG_NEWLINE is set, then . and [^...] don't match newline.
444 Also, regexec will try a match beginning after every newline.
445
446 If REG_ICASE is set, then we considers upper- and lowercase
447 versions of letters to be equivalent when matching.
448
449 If REG_NOSUB is set, then when PREG is passed to regexec, that
450 routine will report only success or failure, and nothing about the
451 registers.
452
453 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
454 the return codes and their meanings.) */
455
456int
457regcomp (preg, pattern, cflags)
458 regex_t *preg;
459 const char *pattern;
460 int cflags;
461{
462 reg_errcode_t ret;
463 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
464 : RE_SYNTAX_POSIX_BASIC);
465
466 preg->buffer = NULL;
467 preg->allocated = 0;
468 preg->used = 0;
469
470 /* Try to allocate space for the fastmap. */
471 preg->fastmap = re_malloc (char, SBC_MAX);
bc15410e 472 if (BE (preg->fastmap == NULL, 0))
3b0bdc72
UD
473 return REG_ESPACE;
474
475 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
476
477 /* If REG_NEWLINE is set, newlines are treated differently. */
478 if (cflags & REG_NEWLINE)
479 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
480 syntax &= ~RE_DOT_NEWLINE;
481 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
482 /* It also changes the matching behavior. */
483 preg->newline_anchor = 1;
484 }
485 else
486 preg->newline_anchor = 0;
487 preg->no_sub = !!(cflags & REG_NOSUB);
488 preg->translate = NULL;
489
490 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
491
492 /* POSIX doesn't distinguish between an unmatched open-group and an
493 unmatched close-group: both are REG_EPAREN. */
494 if (ret == REG_ERPAREN)
495 ret = REG_EPAREN;
496
a9388965 497 /* We have already checked preg->fastmap != NULL. */
bc15410e 498 if (BE (ret == REG_NOERROR, 1))
3b0bdc72
UD
499 {
500 /* Compute the fastmap now, since regexec cannot modify the pattern
501 buffer. */
bc15410e 502 if (BE (re_compile_fastmap (preg) == -2, 0))
3b0bdc72
UD
503 {
504 /* Some error occurred while computing the fastmap, just forget
505 about it. */
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
508 }
509 }
510
511 return (int) ret;
512}
513#ifdef _LIBC
514weak_alias (__regcomp, regcomp)
515#endif
516
517/* Returns a message corresponding to an error code, ERRCODE, returned
518 from either regcomp or regexec. We don't use PREG here. */
519
520size_t
521regerror (errcode, preg, errbuf, errbuf_size)
522 int errcode;
523 const regex_t *preg;
524 char *errbuf;
525 size_t errbuf_size;
526{
527 const char *msg;
528 size_t msg_size;
529
bc15410e 530 if (BE (errcode < 0
6455d255
UD
531 || errcode >= (int) (sizeof (__re_error_msgid_idx)
532 / sizeof (__re_error_msgid_idx[0])), 0))
3b0bdc72
UD
533 /* Only error codes returned by the rest of the code should be passed
534 to this routine. If we are given anything else, or if other regex
535 code generates an invalid error code, then the program has a bug.
536 Dump core so we can fix it. */
537 abort ();
538
6455d255 539 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3b0bdc72
UD
540
541 msg_size = strlen (msg) + 1; /* Includes the null. */
542
bc15410e 543 if (BE (errbuf_size != 0, 1))
3b0bdc72 544 {
bc15410e 545 if (BE (msg_size > errbuf_size, 0))
3b0bdc72
UD
546 {
547#if defined HAVE_MEMPCPY || defined _LIBC
548 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
549#else
550 memcpy (errbuf, msg, errbuf_size - 1);
551 errbuf[errbuf_size - 1] = 0;
552#endif
553 }
554 else
555 memcpy (errbuf, msg, msg_size);
556 }
557
558 return msg_size;
559}
560#ifdef _LIBC
561weak_alias (__regerror, regerror)
562#endif
563
564/* Free dynamically allocated space used by PREG. */
565
566void
567regfree (preg)
568 regex_t *preg;
569{
570 int i, j;
571 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
bc15410e 572 if (BE (dfa != NULL, 1))
3b0bdc72
UD
573 {
574 re_free (dfa->subexps);
575
576 for (i = 0; i < dfa->nodes_len; ++i)
577 {
578 re_token_t *node = dfa->nodes + i;
c0a0f9a3 579#ifdef RE_ENABLE_I18N
3b0bdc72
UD
580 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
581 free_charset (node->opr.mbcset);
c0a0f9a3
UD
582 else
583#endif /* RE_ENABLE_I18N */
584 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3b0bdc72
UD
585 re_free (node->opr.sbcset);
586 else if (node->type == OP_CONTEXT_NODE)
587 {
588 if (dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
589 {
590 if (node->opr.ctx_info->bkref_eclosure != NULL)
591 re_node_set_free (node->opr.ctx_info->bkref_eclosure);
592 re_free (node->opr.ctx_info->bkref_eclosure);
593 }
594 re_free (node->opr.ctx_info);
595 }
596 }
597 re_free (dfa->firsts);
598 re_free (dfa->nexts);
599 for (i = 0; i < dfa->nodes_len; ++i)
600 {
601 if (dfa->eclosures != NULL)
602 re_node_set_free (dfa->eclosures + i);
603 if (dfa->inveclosures != NULL)
604 re_node_set_free (dfa->inveclosures + i);
605 if (dfa->edests != NULL)
606 re_node_set_free (dfa->edests + i);
607 }
608 re_free (dfa->edests);
609 re_free (dfa->eclosures);
610 re_free (dfa->inveclosures);
611 re_free (dfa->nodes);
612
613 for (i = 0; i <= dfa->state_hash_mask; ++i)
614 {
615 struct re_state_table_entry *entry = dfa->state_table + i;
bc15410e 616 for (j = 0; j < entry->num; ++j)
3b0bdc72 617 {
bc15410e
UD
618 re_dfastate_t *state = entry->array[j];
619 if (state->entrance_nodes != &state->nodes)
3b0bdc72 620 {
bc15410e
UD
621 re_node_set_free (state->entrance_nodes);
622 re_free (state->entrance_nodes);
3b0bdc72 623 }
bc15410e
UD
624 re_node_set_free (&state->nodes);
625 re_free (state->trtable);
626 re_free (state->trtable_search);
627 re_free (state);
3b0bdc72 628 }
bc15410e 629 re_free (entry->array);
3b0bdc72
UD
630 }
631 re_free (dfa->state_table);
632
633 if (dfa->word_char != NULL)
634 re_free (dfa->word_char);
635 re_free (dfa);
636 }
637 re_free (preg->fastmap);
638}
639#ifdef _LIBC
640weak_alias (__regfree, regfree)
641#endif
642\f
643/* Entry points compatible with 4.2 BSD regex library. We don't define
644 them unless specifically requested. */
645
646#if defined _REGEX_RE_COMP || defined _LIBC
647
648/* BSD has one and only one pattern buffer. */
649static struct re_pattern_buffer re_comp_buf;
650
651char *
652# ifdef _LIBC
653/* Make these definitions weak in libc, so POSIX programs can redefine
654 these names if they don't use our functions, and still use
655 regcomp/regexec above without link errors. */
656weak_function
657# endif
658re_comp (s)
659 const char *s;
660{
661 reg_errcode_t ret;
662
663 if (!s)
664 {
665 if (!re_comp_buf.buffer)
666 return gettext ("No previous regular expression");
667 return 0;
668 }
669
670 if (!re_comp_buf.buffer)
671 {
672 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
673 if (re_comp_buf.fastmap == NULL)
6455d255
UD
674 return (char *) gettext (__re_error_msgid
675 + __re_error_msgid_idx[(int) REG_ESPACE]);
3b0bdc72
UD
676 }
677
678 /* Since `re_exec' always passes NULL for the `regs' argument, we
679 don't need to initialize the pattern buffer fields which affect it. */
680
681 /* Match anchors at newlines. */
682 re_comp_buf.newline_anchor = 1;
683
684 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
685
686 if (!ret)
687 return NULL;
688
689 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455d255 690 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3b0bdc72
UD
691}
692#endif /* _REGEX_RE_COMP */
693\f
694/* Internal entry point.
695 Compile the regular expression PATTERN, whose length is LENGTH.
696 SYNTAX indicate regular expression's syntax. */
697
698static reg_errcode_t
699re_compile_internal (preg, pattern, length, syntax)
700 regex_t *preg;
701 const char * pattern;
702 int length;
703 reg_syntax_t syntax;
704{
705 reg_errcode_t err = REG_NOERROR;
706 re_dfa_t *dfa;
707 re_string_t regexp;
708
709 /* Initialize the pattern buffer. */
710 preg->fastmap_accurate = 0;
711 preg->syntax = syntax;
712 preg->not_bol = preg->not_eol = 0;
713 preg->used = 0;
714 preg->re_nsub = 0;
715
716 /* Initialize the dfa. */
717 dfa = (re_dfa_t *) preg->buffer;
718 if (preg->allocated < sizeof (re_dfa_t))
719 {
720 /* If zero allocated, but buffer is non-null, try to realloc
721 enough space. This loses if buffer's address is bogus, but
722 that is the user's responsibility. If ->buffer is NULL this
723 is a simple allocation. */
724 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
725 if (dfa == NULL)
726 return REG_ESPACE;
3b0bdc72
UD
727 preg->allocated = sizeof (re_dfa_t);
728 }
729 preg->buffer = (unsigned char *) dfa;
730 preg->used = sizeof (re_dfa_t);
731
732 err = init_dfa (dfa, length);
bc15410e 733 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
734 {
735 re_free (dfa);
736 preg->buffer = NULL;
737 return err;
738 }
739
612546c6
UD
740 err = re_string_construct (&regexp, pattern, length, preg->translate,
741 syntax & RE_ICASE);
bc15410e 742 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
743 {
744 re_free (dfa);
745 preg->buffer = NULL;
746 return err;
747 }
748
749 /* Parse the regular expression, and build a structure tree. */
750 preg->re_nsub = 0;
751 dfa->str_tree = parse (&regexp, preg, syntax, &err);
bc15410e 752 if (BE (dfa->str_tree == NULL, 0))
3b0bdc72
UD
753 goto re_compile_internal_free_return;
754
755 /* Analyze the tree and collect information which is necessary to
756 create the dfa. */
757 err = analyze (dfa);
bc15410e 758 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
759 goto re_compile_internal_free_return;
760
761 /* Then create the initial state of the dfa. */
762 err = create_initial_state (dfa);
bc15410e 763 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
764 goto re_compile_internal_free_return;
765
766 re_compile_internal_free_return:
767 /* Release work areas. */
768 free_workarea_compile (preg);
769 re_string_destruct (&regexp);
770
771 return err;
772}
773
774/* Initialize DFA. We use the length of the regular expression PAT_LEN
775 as the initial length of some arrays. */
776
777static reg_errcode_t
778init_dfa (dfa, pat_len)
779 re_dfa_t *dfa;
780 int pat_len;
781{
782 int table_size;
81c64d40
UD
783
784 memset (dfa, '\0', sizeof (re_dfa_t));
785
3b0bdc72
UD
786 dfa->nodes_alloc = pat_len + 1;
787 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
788
789 dfa->states_alloc = pat_len + 1;
790
791 /* table_size = 2 ^ ceil(log pat_len) */
792 for (table_size = 1; table_size > 0; table_size <<= 1)
793 if (table_size > pat_len)
794 break;
795
796 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
797 dfa->state_hash_mask = table_size - 1;
798
799 dfa->subexps_alloc = 1;
800 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
801 dfa->word_char = NULL;
802
bc15410e
UD
803 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
804 || dfa->subexps == NULL, 0))
3b0bdc72
UD
805 {
806 /* We don't bother to free anything which was allocated. Very
807 soon the process will go down anyway. */
808 dfa->subexps = NULL;
809 dfa->state_table = NULL;
810 dfa->nodes = NULL;
811 return REG_ESPACE;
812 }
813 return REG_NOERROR;
814}
815
816/* Initialize WORD_CHAR table, which indicate which character is
817 "word". In this case "word" means that it is the word construction
818 character used by some operators like "\<", "\>", etc. */
819
a9388965 820static reg_errcode_t
3b0bdc72
UD
821init_word_char (dfa)
822 re_dfa_t *dfa;
823{
824 int i, j, ch;
825 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
bc15410e 826 if (BE (dfa->word_char == NULL, 0))
a9388965 827 return REG_ESPACE;
3b0bdc72
UD
828 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
829 for (j = 0; j < UINT_BITS; ++j, ++ch)
830 if (isalnum (ch) || ch == '_')
831 dfa->word_char[i] |= 1 << j;
a9388965 832 return REG_NOERROR;
3b0bdc72
UD
833}
834
835/* Free the work area which are only used while compiling. */
836
837static void
838free_workarea_compile (preg)
839 regex_t *preg;
840{
841 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
842 free_bin_tree (dfa->str_tree);
843 dfa->str_tree = NULL;
844}
845
846/* Create initial states for all contexts. */
847
848static reg_errcode_t
849create_initial_state (dfa)
850 re_dfa_t *dfa;
851{
852 int first, i;
853 reg_errcode_t err;
854 re_node_set init_nodes;
855
856 /* Initial states have the epsilon closure of the node which is
857 the first node of the regular expression. */
858 first = dfa->str_tree->first;
859 dfa->init_node = first;
860 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
bc15410e 861 if (BE (err != REG_NOERROR, 0))
3b0bdc72
UD
862 return err;
863
864 /* The back-references which are in initial states can epsilon transit,
865 since in this case all of the subexpressions can be null.
866 Then we add epsilon closures of the nodes which are the next nodes of
867 the back-references. */
868 if (dfa->nbackref > 0)
869 for (i = 0; i < init_nodes.nelem; ++i)
870 {
871 int node_idx = init_nodes.elems[i];
872 re_token_type_t type = dfa->nodes[node_idx].type;
873 if (type == OP_CONTEXT_NODE
874 && (dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type
875 == OP_BACK_REF))
876 {
877 int prev_nelem = init_nodes.nelem;
878 re_node_set_merge (&init_nodes,
879 dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure);
880 if (prev_nelem < init_nodes.nelem)
881 i = 0;
882 }
883 else if (type == OP_BACK_REF)
884 {
885 int next_idx = dfa->nexts[node_idx];
886 if (!re_node_set_contains (&init_nodes, next_idx))
887 {
888 re_node_set_merge (&init_nodes, dfa->eclosures + next_idx);
889 i = 0;
890 }
891 }
892 }
893
894 /* It must be the first time to invoke acquire_state. */
a9388965
UD
895 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
896 /* We don't check ERR here, since the initial state must not be NULL. */
bc15410e 897 if (BE (dfa->init_state == NULL, 0))
a9388965 898 return err;
3b0bdc72
UD
899 if (dfa->init_state->has_constraint)
900 {
a9388965 901 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
3b0bdc72 902 CONTEXT_WORD);
a9388965 903 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
3b0bdc72 904 CONTEXT_NEWLINE);
a9388965
UD
905 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
906 &init_nodes,
3b0bdc72
UD
907 CONTEXT_NEWLINE
908 | CONTEXT_BEGBUF);
bc15410e
UD
909 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
910 || dfa->init_state_begbuf == NULL, 0))
a9388965 911 return err;
3b0bdc72
UD
912 }
913 else
914 dfa->init_state_word = dfa->init_state_nl
915 = dfa->init_state_begbuf = dfa->init_state;
916
3b0bdc72
UD
917 re_node_set_free (&init_nodes);
918 return REG_NOERROR;
919}
920\f
921/* Analyze the structure tree, and calculate "first", "next", "edest",
922 "eclosure", and "inveclosure". */
923
924static reg_errcode_t
925analyze (dfa)
926 re_dfa_t *dfa;
927{
928 int i;
929 reg_errcode_t ret;
930
931 /* Allocate arrays. */
932 dfa->firsts = re_malloc (int, dfa->nodes_alloc);
933 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
934 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
935 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
936 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
bc15410e
UD
937 if (BE (dfa->firsts == NULL || dfa->nexts == NULL || dfa->edests == NULL
938 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
3b0bdc72
UD
939 return REG_ESPACE;
940 /* Initialize them. */
941 for (i = 0; i < dfa->nodes_len; ++i)
942 {
943 dfa->firsts[i] = -1;
944 dfa->nexts[i] = -1;
945 re_node_set_init_empty (dfa->edests + i);
946 re_node_set_init_empty (dfa->eclosures + i);
947 re_node_set_init_empty (dfa->inveclosures + i);
948 }
949
950 ret = analyze_tree (dfa, dfa->str_tree);
bc15410e 951 if (BE (ret == REG_NOERROR, 1))
3b0bdc72
UD
952 {
953 ret = calc_eclosure (dfa);
954 if (ret == REG_NOERROR)
955 calc_inveclosure (dfa);
956 }
957 return ret;
958}
959
960/* Helper functions for analyze.
961 This function calculate "first", "next", and "edest" for the subtree
962 whose root is NODE. */
963
964static reg_errcode_t
965analyze_tree (dfa, node)
966 re_dfa_t *dfa;
967 bin_tree_t *node;
968{
969 reg_errcode_t ret;
970 if (node->first == -1)
971 calc_first (dfa, node);
972 if (node->next == -1)
973 calc_next (dfa, node);
974 if (node->eclosure.nelem == 0)
975 calc_epsdest (dfa, node);
976 /* Calculate "first" etc. for the left child. */
977 if (node->left != NULL)
978 {
979 ret = analyze_tree (dfa, node->left);
bc15410e 980 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
981 return ret;
982 }
983 /* Calculate "first" etc. for the right child. */
984 if (node->right != NULL)
985 {
986 ret = analyze_tree (dfa, node->right);
bc15410e 987 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
988 return ret;
989 }
990 return REG_NOERROR;
991}
992
993/* Calculate "first" for the node NODE. */
994static void
995calc_first (dfa, node)
996 re_dfa_t *dfa;
997 bin_tree_t *node;
998{
999 int idx, type;
1000 idx = node->node_idx;
1001 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1002
1003 switch (type)
1004 {
1005#ifdef DEBUG
3b0bdc72
UD
1006 case OP_OPEN_BRACKET:
1007 case OP_CLOSE_BRACKET:
1008 case OP_OPEN_DUP_NUM:
1009 case OP_CLOSE_DUP_NUM:
1010 case OP_NON_MATCH_LIST:
1011 case OP_OPEN_COLL_ELEM:
1012 case OP_CLOSE_COLL_ELEM:
1013 case OP_OPEN_EQUIV_CLASS:
1014 case OP_CLOSE_EQUIV_CLASS:
1015 case OP_OPEN_CHAR_CLASS:
1016 case OP_CLOSE_CHAR_CLASS:
1017 /* These must not be appeared here. */
1018 assert (0);
1019#endif
1020 case END_OF_RE:
1021 case CHARACTER:
1022 case OP_PERIOD:
1023 case OP_DUP_ASTERISK:
1024 case OP_DUP_QUESTION:
c0a0f9a3 1025#ifdef RE_ENABLE_I18N
3b0bdc72 1026 case COMPLEX_BRACKET:
c0a0f9a3 1027#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1028 case SIMPLE_BRACKET:
1029 case OP_BACK_REF:
1030 case ANCHOR:
81c64d40
UD
1031 case OP_OPEN_SUBEXP:
1032 case OP_CLOSE_SUBEXP:
3b0bdc72
UD
1033 node->first = idx;
1034 break;
1035 case OP_DUP_PLUS:
1036#ifdef DEBUG
1037 assert (node->left != NULL);
1038#endif
1039 if (node->left->first == -1)
1040 calc_first (dfa, node->left);
1041 node->first = node->left->first;
1042 break;
1043 case OP_ALT:
1044 node->first = idx;
1045 break;
3b0bdc72
UD
1046 /* else fall through */
1047 default:
1048#ifdef DEBUG
1049 assert (node->left != NULL);
1050#endif
1051 if (node->left->first == -1)
1052 calc_first (dfa, node->left);
1053 node->first = node->left->first;
1054 break;
1055 }
1056 if (node->type == 0)
1057 dfa->firsts[idx] = node->first;
1058}
1059
1060/* Calculate "next" for the node NODE. */
1061
1062static void
1063calc_next (dfa, node)
1064 re_dfa_t *dfa;
1065 bin_tree_t *node;
1066{
1067 int idx, type;
1068 bin_tree_t *parent = node->parent;
1069 if (parent == NULL)
1070 {
1071 node->next = -1;
1072 idx = node->node_idx;
1073 if (node->type == 0)
1074 dfa->nexts[idx] = node->next;
1075 return;
1076 }
1077
1078 idx = parent->node_idx;
1079 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1080
1081 switch (type)
1082 {
1083 case OP_DUP_ASTERISK:
1084 case OP_DUP_PLUS:
1085 node->next = idx;
1086 break;
1087 case CONCAT:
1088 if (parent->left == node)
1089 {
1090 if (parent->right->first == -1)
1091 calc_first (dfa, parent->right);
1092 node->next = parent->right->first;
1093 break;
1094 }
1095 /* else fall through */
1096 default:
1097 if (parent->next == -1)
1098 calc_next (dfa, parent);
1099 node->next = parent->next;
1100 break;
1101 }
1102 idx = node->node_idx;
1103 if (node->type == 0)
1104 dfa->nexts[idx] = node->next;
1105}
1106
1107/* Calculate "edest" for the node NODE. */
1108
1109static void
1110calc_epsdest (dfa, node)
1111 re_dfa_t *dfa;
1112 bin_tree_t *node;
1113{
1114 int idx;
1115 idx = node->node_idx;
1116 if (node->type == 0)
1117 {
1118 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1119 || dfa->nodes[idx].type == OP_DUP_PLUS
1120 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1121 {
1122 if (node->left->first == -1)
1123 calc_first (dfa, node->left);
1124 if (node->next == -1)
1125 calc_next (dfa, node);
1126 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1127 node->next);
1128 }
1129 else if (dfa->nodes[idx].type == OP_ALT)
1130 {
1131 int left, right;
1132 if (node->left != NULL)
1133 {
1134 if (node->left->first == -1)
1135 calc_first (dfa, node->left);
1136 left = node->left->first;
1137 }
1138 else
1139 {
1140 if (node->next == -1)
1141 calc_next (dfa, node);
1142 left = node->next;
1143 }
1144 if (node->right != NULL)
1145 {
1146 if (node->right->first == -1)
1147 calc_first (dfa, node->right);
1148 right = node->right->first;
1149 }
1150 else
1151 {
1152 if (node->next == -1)
1153 calc_next (dfa, node);
1154 right = node->next;
1155 }
1156 re_node_set_init_2 (dfa->edests + idx, left, right);
1157 }
81c64d40
UD
1158 else if (dfa->nodes[idx].type == ANCHOR
1159 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1160 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP)
3b0bdc72
UD
1161 re_node_set_init_1 (dfa->edests + idx, node->next);
1162 }
1163}
1164
a9388965
UD
1165/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1166 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1167 otherwise return the error code. */
1168
1169static reg_errcode_t
1170duplicate_node (new_idx, dfa, org_idx, constraint)
3b0bdc72 1171 re_dfa_t *dfa;
a9388965 1172 int *new_idx, org_idx;
3b0bdc72
UD
1173 unsigned int constraint;
1174{
1175 re_token_t dup;
1176 int dup_idx;
a9388965 1177 reg_errcode_t err;
3b0bdc72
UD
1178
1179 dup.type = OP_CONTEXT_NODE;
1180 if (dfa->nodes[org_idx].type == OP_CONTEXT_NODE)
1181 {
a9388965
UD
1182 /* If the node whose index is ORG_IDX is the same as the intended
1183 node, use it. */
3b0bdc72 1184 if (dfa->nodes[org_idx].constraint == constraint)
a9388965
UD
1185 {
1186 *new_idx = org_idx;
1187 return REG_NOERROR;
1188 }
3b0bdc72
UD
1189 dup.constraint = constraint |
1190 dfa->nodes[org_idx].constraint;
1191 }
1192 else
1193 dup.constraint = constraint;
1194
1195 /* In case that `entity' points OP_CONTEXT_NODE,
1196 we correct `entity' to real entity in calc_inveclosures(). */
1197 dup.opr.ctx_info = malloc (sizeof (*dup.opr.ctx_info));
a9388965 1198 dup_idx = re_dfa_add_node (dfa, dup, 1);
bc15410e 1199 if (BE (dup.opr.ctx_info == NULL || dup_idx == -1, 0))
a9388965 1200 return REG_ESPACE;
3b0bdc72
UD
1201 dup.opr.ctx_info->entity = org_idx;
1202 dup.opr.ctx_info->bkref_eclosure = NULL;
3b0bdc72 1203
a9388965 1204 dfa->nodes[dup_idx].duplicated = 1;
3b0bdc72
UD
1205 dfa->firsts[dup_idx] = dfa->firsts[org_idx];
1206 dfa->nexts[dup_idx] = dfa->nexts[org_idx];
a9388965 1207 err = re_node_set_init_copy (dfa->edests + dup_idx, dfa->edests + org_idx);
bc15410e 1208 if (BE (err != REG_NOERROR, 0))
a9388965 1209 return err;
3b0bdc72
UD
1210 /* Since we don't duplicate epsilon nodes, epsilon closure have
1211 only itself. */
a9388965 1212 err = re_node_set_init_1 (dfa->eclosures + dup_idx, dup_idx);
bc15410e 1213 if (BE (err != REG_NOERROR, 0))
a9388965
UD
1214 return err;
1215 err = re_node_set_init_1 (dfa->inveclosures + dup_idx, dup_idx);
bc15410e 1216 if (BE (err != REG_NOERROR, 0))
a9388965 1217 return err;
3b0bdc72
UD
1218 /* Then we must update inveclosure for this node.
1219 We process them at last part of calc_eclosure(),
1220 since we don't complete to calculate them here. */
1221
a9388965
UD
1222 *new_idx = dup_idx;
1223 return REG_NOERROR;
3b0bdc72
UD
1224}
1225
1226static void
1227calc_inveclosure (dfa)
1228 re_dfa_t *dfa;
1229{
1230 int src, idx, dest, entity;
1231 for (src = 0; src < dfa->nodes_len; ++src)
1232 {
1233 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1234 {
1235 dest = dfa->eclosures[src].elems[idx];
1236 re_node_set_insert (dfa->inveclosures + dest, src);
1237 }
1238
1239 entity = src;
1240 while (dfa->nodes[entity].type == OP_CONTEXT_NODE)
1241 {
1242 entity = dfa->nodes[entity].opr.ctx_info->entity;
1243 re_node_set_merge (dfa->inveclosures + src,
1244 dfa->inveclosures + entity);
1245 dfa->nodes[src].opr.ctx_info->entity = entity;
1246 }
1247 }
1248}
1249
1250/* Calculate "eclosure" for all the node in DFA. */
1251
1252static reg_errcode_t
1253calc_eclosure (dfa)
1254 re_dfa_t *dfa;
1255{
1256 int idx, node_idx, max, incomplete = 0;
1257#ifdef DEBUG
1258 assert (dfa->nodes_len > 0);
1259#endif
1260 /* For each nodes, calculate epsilon closure. */
1261 for (node_idx = 0, max = dfa->nodes_len; ; ++node_idx)
1262 {
a9388965 1263 reg_errcode_t err;
3b0bdc72
UD
1264 re_node_set eclosure_elem;
1265 if (node_idx == max)
1266 {
1267 if (!incomplete)
1268 break;
1269 incomplete = 0;
1270 node_idx = 0;
1271 }
1272
1273#ifdef DEBUG
1274 assert (dfa->nodes[node_idx].type != OP_CONTEXT_NODE);
1275 assert (dfa->eclosures[node_idx].nelem != -1);
1276#endif
1277 /* If we have already calculated, skip it. */
1278 if (dfa->eclosures[node_idx].nelem != 0)
1279 continue;
1280 /* Calculate epsilon closure of `node_idx'. */
a9388965 1281 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
bc15410e 1282 if (BE (err != REG_NOERROR, 0))
a9388965 1283 return err;
3b0bdc72
UD
1284
1285 if (dfa->eclosures[node_idx].nelem == 0)
1286 {
1287 incomplete = 1;
1288 re_node_set_free (&eclosure_elem);
1289 }
1290 }
1291
1292 /* for duplicated nodes. */
1293 for (idx = max; idx < dfa->nodes_len; ++idx)
1294 {
1295 int entity, i, constraint;
1296 re_node_set *bkref_eclosure;
1297 entity = dfa->nodes[idx].opr.ctx_info->entity;
1298 re_node_set_merge (dfa->inveclosures + idx, dfa->inveclosures + entity);
1299 if (dfa->nodes[entity].type != OP_BACK_REF)
1300 continue;
1301
1302 /* If the node is backreference, duplicate the epsilon closure of
1303 the next node. Since it may epsilon transit. */
1304 /* Note: duplicate_node() may realloc dfa->eclosures, etc. */
1305 bkref_eclosure = re_malloc (re_node_set, 1);
bc15410e 1306 if (BE (bkref_eclosure == NULL, 0))
3b0bdc72
UD
1307 return REG_ESPACE;
1308 re_node_set_init_empty (bkref_eclosure);
1309 constraint = dfa->nodes[idx].constraint;
1310 for (i = 0; i < dfa->eclosures[dfa->nexts[idx]].nelem; ++i)
1311 {
1312 int dest_node_idx = dfa->eclosures[dfa->nexts[idx]].elems[i];
1313 if (!IS_EPSILON_NODE (dfa->nodes[dest_node_idx].type))
a9388965
UD
1314 {
1315 reg_errcode_t err;
1316 err = duplicate_node (&dest_node_idx, dfa, dest_node_idx,
1317 constraint);
bc15410e 1318 if (BE (err != REG_NOERROR, 0))
a9388965
UD
1319 return err;
1320 }
3b0bdc72
UD
1321 re_node_set_insert (bkref_eclosure, dest_node_idx);
1322 }
1323 dfa->nodes[idx].opr.ctx_info->bkref_eclosure = bkref_eclosure;
1324 }
1325
1326 return REG_NOERROR;
1327}
1328
1329/* Calculate epsilon closure of NODE. */
1330
a9388965
UD
1331static reg_errcode_t
1332calc_eclosure_iter (new_set, dfa, node, root)
1333 re_node_set *new_set;
3b0bdc72
UD
1334 re_dfa_t *dfa;
1335 int node, root;
1336{
a9388965 1337 reg_errcode_t err;
3b0bdc72
UD
1338 unsigned int constraint;
1339 int i, max, incomplete = 0;
1340 re_node_set eclosure;
a9388965 1341 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
bc15410e 1342 if (BE (err != REG_NOERROR, 0))
a9388965 1343 return err;
3b0bdc72
UD
1344
1345 /* This indicates that we are calculating this node now.
1346 We reference this value to avoid infinite loop. */
1347 dfa->eclosures[node].nelem = -1;
1348
1349 constraint = ((dfa->nodes[node].type == ANCHOR)
1350 ? dfa->nodes[node].opr.ctx_type : 0);
1351
1352 /* Expand each epsilon destination nodes. */
1353 if (dfa->edests[node].nelem != 0)
1354 for (i = 0; i < dfa->edests[node].nelem; ++i)
1355 {
1356 re_node_set eclosure_elem;
1357 int edest = dfa->edests[node].elems[i];
1358 /* If calculating the epsilon closure of `edest' is in progress,
1359 return intermediate result. */
1360 if (dfa->eclosures[edest].nelem == -1)
1361 {
1362 incomplete = 1;
1363 continue;
1364 }
1365 /* If we haven't calculated the epsilon closure of `edest' yet,
1366 calculate now. Otherwise use calculated epsilon closure. */
1367 if (dfa->eclosures[edest].nelem == 0)
a9388965
UD
1368 {
1369 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
bc15410e 1370 if (BE (err != REG_NOERROR, 0))
a9388965
UD
1371 return err;
1372 }
3b0bdc72
UD
1373 else
1374 eclosure_elem = dfa->eclosures[edest];
1375 /* Merge the epsilon closure of `edest'. */
1376 re_node_set_merge (&eclosure, &eclosure_elem);
1377 /* If the epsilon closure of `edest' is incomplete,
1378 the epsilon closure of this node is also incomplete. */
1379 if (dfa->eclosures[edest].nelem == 0)
1380 {
1381 incomplete = 1;
1382 re_node_set_free (&eclosure_elem);
1383 }
1384 }
1385
1386 /* If the current node has constraints, duplicate all non-epsilon nodes.
1387 Since they must inherit the constraints. */
1388 if (constraint)
1389 for (i = 0, max = eclosure.nelem; i < max; ++i)
1390 {
1391 int dest = eclosure.elems[i];
1392 if (!IS_EPSILON_NODE (dfa->nodes[dest].type))
1393 {
a9388965
UD
1394 int dup_dest;
1395 reg_errcode_t err;
1396 err = duplicate_node (&dup_dest, dfa, dest, constraint);
bc15410e 1397 if (BE (err != REG_NOERROR, 0))
a9388965 1398 return err;
3b0bdc72
UD
1399 if (dest != dup_dest)
1400 {
1401 re_node_set_remove_at (&eclosure, i--);
1402 re_node_set_insert (&eclosure, dup_dest);
1403 --max;
1404 }
1405 }
1406 }
1407
1408 /* Epsilon closures include itself. */
1409 re_node_set_insert (&eclosure, node);
1410 if (incomplete && !root)
1411 dfa->eclosures[node].nelem = 0;
1412 else
1413 dfa->eclosures[node] = eclosure;
a9388965
UD
1414 *new_set = eclosure;
1415 return REG_NOERROR;
3b0bdc72
UD
1416}
1417\f
1418/* Functions for token which are used in the parser. */
1419
1420/* Fetch a token from INPUT.
1421 We must not use this function inside bracket expressions. */
1422
1423static re_token_t
1424fetch_token (input, syntax)
1425 re_string_t *input;
1426 reg_syntax_t syntax;
1427{
1428 re_token_t token;
1429 int consumed_byte;
1430 consumed_byte = peek_token (&token, input, syntax);
1431 re_string_skip_bytes (input, consumed_byte);
1432 return token;
1433}
1434
1435/* Peek a token from INPUT, and return the length of the token.
1436 We must not use this function inside bracket expressions. */
1437
1438static int
1439peek_token (token, input, syntax)
1440 re_token_t *token;
1441 re_string_t *input;
1442 reg_syntax_t syntax;
1443{
1444 unsigned char c;
1445
1446 if (re_string_eoi (input))
1447 {
1448 token->type = END_OF_RE;
1449 return 0;
1450 }
1451
1452 c = re_string_peek_byte (input, 0);
1453 token->opr.c = c;
1454
1455#ifdef RE_ENABLE_I18N
1456 token->mb_partial = 0;
1457 if (MB_CUR_MAX > 1 &&
1458 !re_string_first_byte (input, re_string_cur_idx (input)))
1459 {
1460 token->type = CHARACTER;
1461 token->mb_partial = 1;
1462 return 1;
1463 }
1464#endif
1465 if (c == '\\')
1466 {
1467 unsigned char c2;
1468 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1469 {
1470 token->type = BACK_SLASH;
1471 return 1;
1472 }
1473
1474 c2 = re_string_peek_byte_case (input, 1);
1475 token->opr.c = c2;
1476 token->type = CHARACTER;
1477 switch (c2)
1478 {
1479 case '|':
1480 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1481 token->type = OP_ALT;
1482 break;
1483 case '1': case '2': case '3': case '4': case '5':
1484 case '6': case '7': case '8': case '9':
1485 if (!(syntax & RE_NO_BK_REFS))
1486 {
1487 token->type = OP_BACK_REF;
1488 token->opr.idx = c2 - '0';
1489 }
1490 break;
1491 case '<':
1492 if (!(syntax & RE_NO_GNU_OPS))
1493 {
1494 token->type = ANCHOR;
1495 token->opr.idx = WORD_FIRST;
1496 }
1497 break;
1498 case '>':
1499 if (!(syntax & RE_NO_GNU_OPS))
1500 {
1501 token->type = ANCHOR;
1502 token->opr.idx = WORD_LAST;
1503 }
1504 break;
1505 case 'b':
1506 if (!(syntax & RE_NO_GNU_OPS))
1507 {
1508 token->type = ANCHOR;
1509 token->opr.idx = WORD_DELIM;
1510 }
1511 break;
1512 case 'B':
1513 if (!(syntax & RE_NO_GNU_OPS))
1514 {
1515 token->type = ANCHOR;
1516 token->opr.idx = INSIDE_WORD;
1517 }
1518 break;
1519 case 'w':
1520 if (!(syntax & RE_NO_GNU_OPS))
1521 token->type = OP_WORD;
1522 break;
1523 case 'W':
1524 if (!(syntax & RE_NO_GNU_OPS))
1525 token->type = OP_NOTWORD;
1526 break;
1527 case '`':
1528 if (!(syntax & RE_NO_GNU_OPS))
1529 {
1530 token->type = ANCHOR;
1531 token->opr.idx = BUF_FIRST;
1532 }
1533 break;
1534 case '\'':
1535 if (!(syntax & RE_NO_GNU_OPS))
1536 {
1537 token->type = ANCHOR;
1538 token->opr.idx = BUF_LAST;
1539 }
1540 break;
1541 case '(':
1542 if (!(syntax & RE_NO_BK_PARENS))
1543 token->type = OP_OPEN_SUBEXP;
1544 break;
1545 case ')':
1546 if (!(syntax & RE_NO_BK_PARENS))
1547 token->type = OP_CLOSE_SUBEXP;
1548 break;
1549 case '+':
1550 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1551 token->type = OP_DUP_PLUS;
1552 break;
1553 case '?':
1554 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1555 token->type = OP_DUP_QUESTION;
1556 break;
1557 case '{':
1558 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1559 token->type = OP_OPEN_DUP_NUM;
1560 break;
1561 case '}':
1562 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1563 token->type = OP_CLOSE_DUP_NUM;
1564 break;
1565 default:
1566 break;
1567 }
1568 return 2;
1569 }
1570
1571 token->type = CHARACTER;
1572 switch (c)
1573 {
1574 case '\n':
1575 if (syntax & RE_NEWLINE_ALT)
1576 token->type = OP_ALT;
1577 break;
1578 case '|':
1579 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1580 token->type = OP_ALT;
1581 break;
1582 case '*':
1583 token->type = OP_DUP_ASTERISK;
1584 break;
1585 case '+':
1586 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1587 token->type = OP_DUP_PLUS;
1588 break;
1589 case '?':
1590 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1591 token->type = OP_DUP_QUESTION;
1592 break;
1593 case '{':
1594 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1595 token->type = OP_OPEN_DUP_NUM;
1596 break;
1597 case '}':
1598 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1599 token->type = OP_CLOSE_DUP_NUM;
1600 break;
1601 case '(':
1602 if (syntax & RE_NO_BK_PARENS)
1603 token->type = OP_OPEN_SUBEXP;
1604 break;
1605 case ')':
1606 if (syntax & RE_NO_BK_PARENS)
1607 token->type = OP_CLOSE_SUBEXP;
1608 break;
1609 case '[':
1610 token->type = OP_OPEN_BRACKET;
1611 break;
1612 case '.':
1613 token->type = OP_PERIOD;
1614 break;
1615 case '^':
1616 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1617 re_string_cur_idx (input) != 0)
1618 {
1619 char prev = re_string_peek_byte (input, -1);
1620 if (prev != '|' && prev != '(' &&
1621 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1622 break;
1623 }
1624 token->type = ANCHOR;
1625 token->opr.idx = LINE_FIRST;
1626 break;
1627 case '$':
1628 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1629 re_string_cur_idx (input) + 1 != re_string_length (input))
1630 {
1631 re_token_t next;
1632 re_string_skip_bytes (input, 1);
1633 peek_token (&next, input, syntax);
1634 re_string_skip_bytes (input, -1);
1635 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1636 break;
1637 }
1638 token->type = ANCHOR;
1639 token->opr.idx = LINE_LAST;
1640 break;
1641 default:
1642 break;
1643 }
1644 return 1;
1645}
1646
1647/* Peek a token from INPUT, and return the length of the token.
1648 We must not use this function out of bracket expressions. */
1649
1650static int
1651peek_token_bracket (token, input, syntax)
1652 re_token_t *token;
1653 re_string_t *input;
1654 reg_syntax_t syntax;
1655{
1656 unsigned char c;
1657 if (re_string_eoi (input))
1658 {
1659 token->type = END_OF_RE;
1660 return 0;
1661 }
1662 c = re_string_peek_byte (input, 0);
1663 token->opr.c = c;
1664
1665#ifdef RE_ENABLE_I18N
1666 if (MB_CUR_MAX > 1 &&
1667 !re_string_first_byte (input, re_string_cur_idx (input)))
1668 {
1669 token->type = CHARACTER;
1670 return 1;
1671 }
1672#endif /* RE_ENABLE_I18N */
1673
1674 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1675 {
1676 /* In this case, '\' escape a character. */
1677 unsigned char c2;
1678 c2 = re_string_peek_byte (input, 1);
1679 token->opr.c = c2;
1680 token->type = CHARACTER;
1681 return 1;
1682 }
1683 if (c == '[') /* '[' is a special char in a bracket exps. */
1684 {
1685 unsigned char c2;
1686 int token_len;
1687 c2 = re_string_peek_byte (input, 1);
1688 token->opr.c = c2;
1689 token_len = 2;
1690 switch (c2)
1691 {
1692 case '.':
1693 token->type = OP_OPEN_COLL_ELEM;
1694 break;
1695 case '=':
1696 token->type = OP_OPEN_EQUIV_CLASS;
1697 break;
1698 case ':':
1699 if (syntax & RE_CHAR_CLASSES)
1700 {
1701 token->type = OP_OPEN_CHAR_CLASS;
1702 break;
1703 }
1704 /* else fall through. */
1705 default:
1706 token->type = CHARACTER;
1707 token->opr.c = c;
1708 token_len = 1;
1709 break;
1710 }
1711 return token_len;
1712 }
1713 switch (c)
1714 {
1715 case '-':
1716 token->type = OP_CHARSET_RANGE;
1717 break;
1718 case ']':
1719 token->type = OP_CLOSE_BRACKET;
1720 break;
1721 case '^':
1722 token->type = OP_NON_MATCH_LIST;
1723 break;
1724 default:
1725 token->type = CHARACTER;
1726 }
1727 return 1;
1728}
1729\f
1730/* Functions for parser. */
1731
1732/* Entry point of the parser.
1733 Parse the regular expression REGEXP and return the structure tree.
1734 If an error is occured, ERR is set by error code, and return NULL.
1735 This function build the following tree, from regular expression <reg_exp>:
1736 CAT
1737 / \
1738 / \
1739 <reg_exp> EOR
1740
1741 CAT means concatenation.
1742 EOR means end of regular expression. */
1743
1744static bin_tree_t *
1745parse (regexp, preg, syntax, err)
1746 re_string_t *regexp;
1747 regex_t *preg;
1748 reg_syntax_t syntax;
1749 reg_errcode_t *err;
1750{
1751 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1752 bin_tree_t *tree, *eor, *root;
1753 re_token_t current_token;
1754 int new_idx;
1755 current_token = fetch_token (regexp, syntax);
1756 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
bc15410e 1757 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1758 return NULL;
1759 new_idx = re_dfa_add_node (dfa, current_token, 0);
1760 eor = create_tree (NULL, NULL, 0, new_idx);
1761 if (tree != NULL)
1762 root = create_tree (tree, eor, CONCAT, 0);
1763 else
1764 root = eor;
bc15410e 1765 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
3b0bdc72
UD
1766 return *err = REG_ESPACE, NULL;
1767 return root;
1768}
1769
1770/* This function build the following tree, from regular expression
1771 <branch1>|<branch2>:
1772 ALT
1773 / \
1774 / \
1775 <branch1> <branch2>
1776
1777 ALT means alternative, which represents the operator `|'. */
1778
1779static bin_tree_t *
1780parse_reg_exp (regexp, preg, token, syntax, nest, err)
1781 re_string_t *regexp;
1782 regex_t *preg;
1783 re_token_t *token;
1784 reg_syntax_t syntax;
1785 int nest;
1786 reg_errcode_t *err;
1787{
1788 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1789 bin_tree_t *tree, *branch = NULL;
1790 int new_idx;
1791 tree = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 1792 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1793 return NULL;
1794
1795 while (token->type == OP_ALT)
1796 {
1797 re_token_t alt_token = *token;
1798 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1799 *token = fetch_token (regexp, syntax);
1800 if (token->type != OP_ALT && token->type != END_OF_RE
1801 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1802 {
1803 branch = parse_branch (regexp, preg, token, syntax, nest, err);
bc15410e 1804 if (BE (*err != REG_NOERROR && branch == NULL, 0))
3b0bdc72
UD
1805 {
1806 free_bin_tree (tree);
1807 return NULL;
1808 }
1809 }
1810 tree = create_tree (tree, branch, 0, new_idx);
bc15410e 1811 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
1812 return *err = REG_ESPACE, NULL;
1813 }
1814 return tree;
1815}
1816
1817/* This function build the following tree, from regular expression
1818 <exp1><exp2>:
1819 CAT
1820 / \
1821 / \
1822 <exp1> <exp2>
1823
1824 CAT means concatenation. */
1825
1826static bin_tree_t *
1827parse_branch (regexp, preg, token, syntax, nest, err)
1828 re_string_t *regexp;
1829 regex_t *preg;
1830 re_token_t *token;
1831 reg_syntax_t syntax;
1832 int nest;
1833 reg_errcode_t *err;
1834{
1835 bin_tree_t *tree, *exp;
1836 tree = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 1837 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1838 return NULL;
1839
1840 while (token->type != OP_ALT && token->type != END_OF_RE
1841 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1842 {
1843 exp = parse_expression (regexp, preg, token, syntax, nest, err);
bc15410e 1844 if (BE (*err != REG_NOERROR && exp == NULL, 0))
3b0bdc72
UD
1845 {
1846 free_bin_tree (tree);
1847 return NULL;
1848 }
1849 if (tree != NULL && exp != NULL)
1850 {
1851 tree = create_tree (tree, exp, CONCAT, 0);
1852 if (tree == NULL)
1853 return *err = REG_ESPACE, NULL;
1854 }
1855 else if (tree == NULL)
1856 tree = exp;
1857 /* Otherwise exp == NULL, we don't need to create new tree. */
1858 }
1859 return tree;
1860}
1861
1862/* This function build the following tree, from regular expression a*:
1863 *
1864 |
1865 a
1866*/
1867
1868static bin_tree_t *
1869parse_expression (regexp, preg, token, syntax, nest, err)
1870 re_string_t *regexp;
1871 regex_t *preg;
1872 re_token_t *token;
1873 reg_syntax_t syntax;
1874 int nest;
1875 reg_errcode_t *err;
1876{
1877 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1878 bin_tree_t *tree;
1879 int new_idx;
1880 switch (token->type)
1881 {
1882 case CHARACTER:
1883 new_idx = re_dfa_add_node (dfa, *token, 0);
1884 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 1885 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
1886 return *err = REG_ESPACE, NULL;
1887#ifdef RE_ENABLE_I18N
1888 if (MB_CUR_MAX > 1)
1889 {
1890 while (!re_string_eoi (regexp)
1891 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1892 {
1893 bin_tree_t *mbc_remain;
1894 *token = fetch_token (regexp, syntax);
1895 new_idx = re_dfa_add_node (dfa, *token, 0);
1896 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1897 tree = create_tree (tree, mbc_remain, CONCAT, 0);
bc15410e 1898 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
3b0bdc72
UD
1899 return *err = REG_ESPACE, NULL;
1900 }
1901 }
1902#endif
1903 break;
1904 case OP_OPEN_SUBEXP:
1905 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
bc15410e 1906 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1907 return NULL;
1908 break;
1909 case OP_OPEN_BRACKET:
1910 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
bc15410e 1911 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
1912 return NULL;
1913 break;
1914 case OP_BACK_REF:
bc15410e
UD
1915 if (BE (preg->re_nsub < token->opr.idx
1916 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
3b0bdc72
UD
1917 {
1918 *err = REG_ESUBREG;
1919 return NULL;
1920 }
1921 new_idx = re_dfa_add_node (dfa, *token, 0);
1922 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 1923 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
1924 return *err = REG_ESPACE, NULL;
1925 ++dfa->nbackref;
1926 dfa->has_mb_node = 1;
1927 break;
1928 case OP_DUP_ASTERISK:
1929 case OP_DUP_PLUS:
1930 case OP_DUP_QUESTION:
1931 case OP_OPEN_DUP_NUM:
1932 if (syntax & RE_CONTEXT_INVALID_OPS)
1933 return *err = REG_BADRPT, NULL;
1934 else if (syntax & RE_CONTEXT_INDEP_OPS)
1935 {
1936 *token = fetch_token (regexp, syntax);
1937 return parse_expression (regexp, preg, token, syntax, nest, err);
1938 }
1939 /* else fall through */
1940 case OP_CLOSE_SUBEXP:
1941 if ((token->type == OP_CLOSE_SUBEXP) &&
1942 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
1943 return *err = REG_ERPAREN, NULL;
1944 /* else fall through */
1945 case OP_CLOSE_DUP_NUM:
1946 /* We treat it as a normal character. */
1947
1948 /* Then we can these characters as normal characters. */
1949 token->type = CHARACTER;
1950 new_idx = re_dfa_add_node (dfa, *token, 0);
1951 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 1952 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
1953 return *err = REG_ESPACE, NULL;
1954 break;
1955 case ANCHOR:
1956 if (dfa->word_char == NULL)
a9388965
UD
1957 {
1958 *err = init_word_char (dfa);
bc15410e 1959 if (BE (*err != REG_NOERROR, 0))
a9388965
UD
1960 return NULL;
1961 }
3b0bdc72
UD
1962 if (token->opr.ctx_type == WORD_DELIM)
1963 {
1964 bin_tree_t *tree_first, *tree_last;
1965 int idx_first, idx_last;
1966 token->opr.ctx_type = WORD_FIRST;
1967 idx_first = re_dfa_add_node (dfa, *token, 0);
1968 tree_first = create_tree (NULL, NULL, 0, idx_first);
1969 token->opr.ctx_type = WORD_LAST;
1970 idx_last = re_dfa_add_node (dfa, *token, 0);
1971 tree_last = create_tree (NULL, NULL, 0, idx_last);
1972 token->type = OP_ALT;
1973 new_idx = re_dfa_add_node (dfa, *token, 0);
1974 tree = create_tree (tree_first, tree_last, 0, new_idx);
bc15410e
UD
1975 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
1976 || tree_first == NULL || tree_last == NULL
1977 || tree == NULL, 0))
3b0bdc72
UD
1978 return *err = REG_ESPACE, NULL;
1979 }
1980 else
1981 {
1982 new_idx = re_dfa_add_node (dfa, *token, 0);
1983 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 1984 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
1985 return *err = REG_ESPACE, NULL;
1986 }
1987 /* We must return here, since ANCHORs can't be followed
1988 by repetition operators.
1989 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
1990 it must not be "<ANCHOR(^)><REPEAT(*)>". */
1991 *token = fetch_token (regexp, syntax);
1992 return tree;
1993 case OP_PERIOD:
1994 new_idx = re_dfa_add_node (dfa, *token, 0);
1995 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 1996 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
1997 return *err = REG_ESPACE, NULL;
1998 if (MB_CUR_MAX > 1)
1999 dfa->has_mb_node = 1;
2000 break;
2001 case OP_WORD:
2002 tree = build_word_op (dfa, 0, err);
bc15410e 2003 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2004 return NULL;
2005 break;
2006 case OP_NOTWORD:
2007 tree = build_word_op (dfa, 1, err);
bc15410e 2008 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72
UD
2009 return NULL;
2010 break;
2011 case OP_ALT:
2012 case END_OF_RE:
2013 return NULL;
2014 case BACK_SLASH:
2015 *err = REG_EESCAPE;
2016 return NULL;
2017 default:
2018 /* Must not happen? */
2019#ifdef DEBUG
2020 assert (0);
2021#endif
2022 return NULL;
2023 }
2024 *token = fetch_token (regexp, syntax);
2025
2026 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2027 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2028 {
2029 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
bc15410e 2030 if (BE (*err != REG_NOERROR && tree == NULL, 0))
602c2f9d 2031 return NULL;
3b0bdc72
UD
2032 }
2033
2034 return tree;
2035}
2036
2037/* This function build the following tree, from regular expression
2038 (<reg_exp>):
2039 SUBEXP
2040 |
2041 <reg_exp>
2042*/
2043
2044static bin_tree_t *
2045parse_sub_exp (regexp, preg, token, syntax, nest, err)
2046 re_string_t *regexp;
2047 regex_t *preg;
2048 re_token_t *token;
2049 reg_syntax_t syntax;
2050 int nest;
2051 reg_errcode_t *err;
2052{
2053 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
81c64d40 2054 bin_tree_t *tree, *left_par, *right_par;
3b0bdc72 2055 size_t cur_nsub;
81c64d40 2056 int new_idx;
3b0bdc72
UD
2057 cur_nsub = preg->re_nsub++;
2058 if (dfa->subexps_alloc < preg->re_nsub)
2059 {
2060 re_subexp_t *new_array;
2061 dfa->subexps_alloc *= 2;
2062 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
bc15410e 2063 if (BE (new_array == NULL, 0))
3b0bdc72
UD
2064 {
2065 dfa->subexps_alloc /= 2;
2066 *err = REG_ESPACE;
2067 return NULL;
2068 }
2069 dfa->subexps = new_array;
2070 }
2071 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2072 dfa->subexps[cur_nsub].end = -1;
81c64d40
UD
2073
2074 new_idx = re_dfa_add_node (dfa, *token, 0);
2075 left_par = create_tree (NULL, NULL, 0, new_idx);
2076 if (BE (new_idx == -1 || left_par == NULL, 0))
2077 return *err = REG_ESPACE, NULL;
2078 dfa->nodes[new_idx].opr.idx = cur_nsub;
3b0bdc72
UD
2079 *token = fetch_token (regexp, syntax);
2080
2081 /* The subexpression may be a null string. */
2082 if (token->type == OP_CLOSE_SUBEXP)
81c64d40 2083 tree = NULL;
3b0bdc72
UD
2084 else
2085 {
2086 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
bc15410e 2087 if (BE (*err != REG_NOERROR && tree == NULL, 0))
3b0bdc72 2088 return NULL;
3b0bdc72 2089 }
81c64d40
UD
2090 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2091 {
2092 free_bin_tree (tree);
2093 *err = REG_BADPAT;
2094 return NULL;
2095 }
2096 new_idx = re_dfa_add_node (dfa, *token, 0);
2097 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2098 right_par = create_tree (NULL, NULL, 0, new_idx);
2099 tree = ((tree == NULL) ? right_par
2100 : create_tree (tree, right_par, CONCAT, 0));
2101 tree = create_tree (left_par, tree, CONCAT, 0);
2102 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2103 return *err = REG_ESPACE, NULL;
2104 dfa->nodes[new_idx].opr.idx = cur_nsub;
2105
3b0bdc72
UD
2106 return tree;
2107}
2108
2109/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2110
2111static bin_tree_t *
2112parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2113 bin_tree_t *dup_elem;
2114 re_string_t *regexp;
2115 re_dfa_t *dfa;
2116 re_token_t *token;
2117 reg_syntax_t syntax;
2118 reg_errcode_t *err;
2119{
2120 re_token_t dup_token;
2121 bin_tree_t *tree = dup_elem, *work_tree;
2122 int new_idx, start_idx = re_string_cur_idx (regexp);
2123 re_token_t start_token = *token;
2124 if (token->type == OP_OPEN_DUP_NUM)
2125 {
602c2f9d
UD
2126 int i;
2127 int end = 0;
2128 int start = fetch_number (regexp, token, syntax);
3b0bdc72
UD
2129 bin_tree_t *elem;
2130 if (start == -1)
3b0bdc72 2131 {
602c2f9d
UD
2132 if (token->type == CHARACTER && token->opr.c == ',')
2133 start = 0; /* We treat "{,m}" as "{0,m}". */
2134 else
3b0bdc72 2135 {
602c2f9d 2136 *err = REG_BADBR; /* <re>{} is invalid. */
3b0bdc72
UD
2137 return NULL;
2138 }
3b0bdc72 2139 }
602c2f9d
UD
2140 if (BE (start != -2, 1))
2141 {
2142 /* We treat "{n}" as "{n,n}". */
2143 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2144 : ((token->type == CHARACTER && token->opr.c == ',')
2145 ? fetch_number (regexp, token, syntax) : -2));
2146 }
2147 if (BE (start == -2 || end == -2, 0))
3b0bdc72 2148 {
602c2f9d
UD
2149 /* Invalid sequence. */
2150 if (token->type == OP_CLOSE_DUP_NUM)
3b0bdc72 2151 goto parse_dup_op_invalid_interval;
602c2f9d
UD
2152 else
2153 goto parse_dup_op_ebrace;
2154 }
2155 if (BE (start == 0 && end == 0, 0))
2156 {
2157 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2158 *token = fetch_token (regexp, syntax);
2159 free_bin_tree (dup_elem);
2160 return NULL;
3b0bdc72 2161 }
602c2f9d 2162
3b0bdc72
UD
2163 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2164 elem = tree;
2165 for (i = 0; i < start; ++i)
2166 if (i != 0)
2167 {
2168 work_tree = duplicate_tree (elem, dfa);
2169 tree = create_tree (tree, work_tree, CONCAT, 0);
bc15410e 2170 if (BE (work_tree == NULL || tree == NULL, 0))
3b0bdc72
UD
2171 goto parse_dup_op_espace;
2172 }
2173
2174 if (end == -1)
2175 {
2176 /* We treat "<re>{0,}" as "<re>*". */
2177 dup_token.type = OP_DUP_ASTERISK;
2178 if (start > 0)
2179 {
2180 elem = duplicate_tree (elem, dfa);
2181 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2182 work_tree = create_tree (elem, NULL, 0, new_idx);
2183 tree = create_tree (tree, work_tree, CONCAT, 0);
bc15410e
UD
2184 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2185 || tree == NULL, 0))
3b0bdc72
UD
2186 goto parse_dup_op_espace;
2187 }
2188 else
2189 {
2190 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2191 tree = create_tree (elem, NULL, 0, new_idx);
bc15410e 2192 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
2193 goto parse_dup_op_espace;
2194 }
2195 }
2196 else if (end - start > 0)
2197 {
2198 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2199 dup_token.type = OP_DUP_QUESTION;
2200 if (start > 0)
2201 {
2202 elem = duplicate_tree (elem, dfa);
2203 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2204 elem = create_tree (elem, NULL, 0, new_idx);
2205 tree = create_tree (tree, elem, CONCAT, 0);
bc15410e 2206 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
2207 goto parse_dup_op_espace;
2208 }
2209 else
2210 {
2211 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2212 tree = elem = create_tree (elem, NULL, 0, new_idx);
bc15410e 2213 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
2214 goto parse_dup_op_espace;
2215 }
2216 for (i = 1; i < end - start; ++i)
2217 {
2218 work_tree = duplicate_tree (elem, dfa);
2219 tree = create_tree (tree, work_tree, CONCAT, 0);
bc15410e 2220 if (BE (work_tree == NULL || tree == NULL, 0))
3b0bdc72
UD
2221 return *err = REG_ESPACE, NULL;
2222 }
2223 }
2224 }
2225 else
2226 {
2227 new_idx = re_dfa_add_node (dfa, *token, 0);
2228 tree = create_tree (tree, NULL, 0, new_idx);
bc15410e 2229 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
2230 return *err = REG_ESPACE, NULL;
2231 }
2232 *token = fetch_token (regexp, syntax);
2233 return tree;
2234
2235 parse_dup_op_espace:
2236 free_bin_tree (tree);
2237 *err = REG_ESPACE;
2238 return NULL;
2239
602c2f9d 2240 parse_dup_op_ebrace:
bc15410e 2241 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
3b0bdc72
UD
2242 {
2243 *err = REG_EBRACE;
2244 return NULL;
2245 }
602c2f9d
UD
2246 goto parse_dup_op_rollback;
2247 parse_dup_op_invalid_interval:
2248 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2249 {
2250 *err = REG_BADBR;
2251 return NULL;
2252 }
2253 parse_dup_op_rollback:
3b0bdc72
UD
2254 re_string_set_index (regexp, start_idx);
2255 *token = start_token;
2256 token->type = CHARACTER;
2257 return dup_elem;
2258}
2259
2260/* Size of the names for collating symbol/equivalence_class/character_class.
2261 I'm not sure, but maybe enough. */
2262#define BRACKET_NAME_BUF_SIZE 32
2263
434d3784
UD
2264#ifndef _LIBC
2265 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2266 Build the range expression which starts from START_ELEM, and ends
2267 at END_ELEM. The result are written to MBCSET and SBCSET.
2268 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2269 mbcset->range_ends, is a pointer argument sinse we may
2270 update it. */
2271
2272static reg_errcode_t
c0a0f9a3
UD
2273# ifdef RE_ENABLE_I18N
2274build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
434d3784 2275 re_charset_t *mbcset;
434d3784 2276 int *range_alloc;
c0a0f9a3
UD
2277# else /* not RE_ENABLE_I18N */
2278build_range_exp (sbcset, start_elem, end_elem)
2279# endif /* not RE_ENABLE_I18N */
2280 re_bitset_ptr_t sbcset;
434d3784
UD
2281 bracket_elem_t *start_elem, *end_elem;
2282{
2283 unsigned int start_ch, end_ch;
2284 /* Equivalence Classes and Character Classes can't be a range start/end. */
2285 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2286 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2287 0))
2288 return REG_ERANGE;
2289
2290 /* We can handle no multi character collating elements without libc
2291 support. */
2292 if (BE ((start_elem->type == COLL_SYM && strlen (start_elem->opr.name) > 1)
2293 || (end_elem->type == COLL_SYM && strlen (end_elem->opr.name) > 1),
2294 0))
2295 return REG_ECOLLATE;
2296
2297# ifdef RE_ENABLE_I18N
2298 {
2299 wchar_t wc, start_wc, end_wc;
2300 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2301
2302 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2303 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2304 : 0));
2305 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2306 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2307 : 0));
2308 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2309 ? __btowc (start_ch) : start_elem->opr.wch);
2310 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2311 ? __btowc (end_ch) : end_elem->opr.wch);
2312 cmp_buf[0] = start_wc;
2313 cmp_buf[4] = end_wc;
2314 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2315 return REG_ERANGE;
2316
2317 /* Check the space of the arrays. */
2318 if (*range_alloc == mbcset->nranges)
2319 {
2320 /* There are not enough space, need realloc. */
2321 wchar_t *new_array_start, *new_array_end;
2322 int new_nranges;
2323
2324 /* +1 in case of mbcset->nranges is 0. */
2325 new_nranges = 2 * mbcset->nranges + 1;
2326 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2327 are NULL if *range_alloc == 0. */
2328 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2329 new_nranges);
2330 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2331 new_nranges);
2332
2333 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2334 return REG_ESPACE;
2335
2336 mbcset->range_starts = new_array_start;
2337 mbcset->range_ends = new_array_end;
2338 *range_alloc = new_nranges;
2339 }
2340
2341 mbcset->range_starts[mbcset->nranges] = start_wc;
2342 mbcset->range_ends[mbcset->nranges++] = end_wc;
2343
2344 /* Build the table for single byte characters. */
2345 for (wc = 0; wc <= SBC_MAX; ++wc)
2346 {
2347 cmp_buf[2] = wc;
2348 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2349 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2350 bitset_set (sbcset, wc);
2351 }
2352 }
2353# else /* not RE_ENABLE_I18N */
2354 {
2355 unsigned int ch;
2356 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2357 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2358 : 0));
2359 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2360 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2361 : 0));
2362 if (start_ch > end_ch)
2363 return REG_ERANGE;
2364 /* Build the table for single byte characters. */
2365 for (ch = 0; ch <= SBC_MAX; ++ch)
2366 if (start_ch <= ch && ch <= end_ch)
2367 bitset_set (sbcset, ch);
2368 }
2369# endif /* not RE_ENABLE_I18N */
2370 return REG_NOERROR;
2371}
2372#endif /* not _LIBC */
2373
2374#ifndef _LIBC
2375/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2376 Build the collating element which is represented by NAME.
2377 The result are written to MBCSET and SBCSET.
2378 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2379 pointer argument since we may update it. */
2380
2381static reg_errcode_t
c0a0f9a3
UD
2382# ifdef RE_ENABLE_I18N
2383build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
434d3784 2384 re_charset_t *mbcset;
434d3784 2385 int *coll_sym_alloc;
c0a0f9a3
UD
2386# else /* not RE_ENABLE_I18N */
2387build_collating_symbol (sbcset, name)
2388# endif /* not RE_ENABLE_I18N */
2389 re_bitset_ptr_t sbcset;
434d3784
UD
2390 unsigned char *name;
2391{
2392 if (BE (strlen (name) != 1, 0))
2393 return REG_ECOLLATE;
2394 else
2395 {
2396 bitset_set (sbcset, name[0]);
2397 return REG_NOERROR;
2398 }
2399}
2400#endif /* not _LIBC */
2401
3b0bdc72
UD
2402/* This function parse bracket expression like "[abc]", "[a-c]",
2403 "[[.a-a.]]" etc. */
2404
2405static bin_tree_t *
2406parse_bracket_exp (regexp, dfa, token, syntax, err)
2407 re_string_t *regexp;
2408 re_dfa_t *dfa;
2409 re_token_t *token;
2410 reg_syntax_t syntax;
2411 reg_errcode_t *err;
2412{
2413#ifdef _LIBC
2414 const unsigned char *collseqmb, *collseqwc;
2415 uint32_t nrules;
2416 int32_t table_size;
2417 const int32_t *symb_table;
2418 const unsigned char *extra;
2419
434d3784 2420 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2421 Seek the collating symbol entry correspondings to NAME.
2422 Return the index of the symbol in the SYMB_TABLE. */
2423
2424 static inline int32_t
2425 seek_collating_symbol_entry (name, name_len)
2426 unsigned char *name;
2427 size_t name_len;
2428 {
2429 int32_t hash = elem_hash (name, name_len);
2430 int32_t elem = hash % table_size;
2431 int32_t second = hash % (table_size - 2);
2432 while (symb_table[2 * elem] != 0)
2433 {
2434 /* First compare the hashing value. */
2435 if (symb_table[2 * elem] == hash
2436 /* Compare the length of the name. */
2437 && name_len == extra[symb_table[2 * elem + 1]]
2438 /* Compare the name. */
2439 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2440 name_len) == 0)
2441 {
2442 /* Yep, this is the entry. */
2443 break;
2444 }
2445
2446 /* Next entry. */
2447 elem += second;
2448 }
2449 return elem;
2450 }
2451
434d3784 2452 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2453 Look up the collation sequence value of BR_ELEM.
2454 Return the value if succeeded, UINT_MAX otherwise. */
2455
2456 static inline unsigned int
2457 lookup_collation_sequence_value (br_elem)
2458 bracket_elem_t *br_elem;
2459 {
2460 if (br_elem->type == SB_CHAR)
2461 {
2462 /*
2463 if (MB_CUR_MAX == 1)
2464 */
2465 if (nrules == 0)
2466 return collseqmb[br_elem->opr.ch];
2467 else
2468 {
2469 wint_t wc = __btowc (br_elem->opr.ch);
2470 return collseq_table_lookup (collseqwc, wc);
2471 }
2472 }
2473 else if (br_elem->type == MB_CHAR)
2474 {
2475 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2476 }
2477 else if (br_elem->type == COLL_SYM)
2478 {
2479 if (nrules != 0)
2480 {
2481 int32_t elem, idx;
2482 elem = seek_collating_symbol_entry (br_elem->opr.name,
2483 strlen (br_elem->opr.name));
2484 if (symb_table[2 * elem] != 0)
2485 {
2486 /* We found the entry. */
2487 idx = symb_table[2 * elem + 1];
2488 /* Skip the name of collating element name. */
2489 idx += 1 + extra[idx];
2490 /* Skip the byte sequence of the collating element. */
2491 idx += 1 + extra[idx];
2492 /* Adjust for the alignment. */
2493 idx = (idx + 3) & ~3;
2494 /* Skip the multibyte collation sequence value. */
2495 idx += sizeof (unsigned int);
2496 /* Skip the wide char sequence of the collating element. */
2497 idx += sizeof (unsigned int) *
2498 (1 + *(unsigned int *) (extra + idx));
2499 /* Return the collation sequence value. */
2500 return *(unsigned int *) (extra + idx);
2501 }
2502 else if (symb_table[2 * elem] == 0 &&
2503 strlen (br_elem->opr.name) == 1)
2504 {
2505 /* No valid character. Match it as a single byte
2506 character. */
2507 return collseqmb[br_elem->opr.name[0]];
2508 }
2509 }
2510 else if (strlen (br_elem->opr.name) == 1)
2511 return collseqmb[br_elem->opr.name[0]];
2512 }
2513 return UINT_MAX;
2514 }
2515
434d3784 2516 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2517 Build the range expression which starts from START_ELEM, and ends
2518 at END_ELEM. The result are written to MBCSET and SBCSET.
2519 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2520 mbcset->range_ends, is a pointer argument sinse we may
2521 update it. */
2522
2523 static inline reg_errcode_t
c0a0f9a3
UD
2524# ifdef RE_ENABLE_I18N
2525 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
3b0bdc72 2526 re_charset_t *mbcset;
3b0bdc72 2527 int *range_alloc;
c0a0f9a3
UD
2528# else /* not RE_ENABLE_I18N */
2529 build_range_exp (sbcset, start_elem, end_elem)
2530# endif /* not RE_ENABLE_I18N */
2531 re_bitset_ptr_t sbcset;
3b0bdc72
UD
2532 bracket_elem_t *start_elem, *end_elem;
2533 {
2534 unsigned int ch;
2535 uint32_t start_collseq;
2536 uint32_t end_collseq;
2537
c0a0f9a3 2538# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2539 /* Check the space of the arrays. */
2540 if (*range_alloc == mbcset->nranges)
2541 {
2542 /* There are not enough space, need realloc. */
2543 uint32_t *new_array_start;
2544 uint32_t *new_array_end;
2545 int new_nranges;
2546
a9388965
UD
2547 /* +1 in case of mbcset->nranges is 0. */
2548 new_nranges = 2 * mbcset->nranges + 1;
2549 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2550 are NULL if *range_alloc == 0. */
2551 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2552 new_nranges);
2553 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2554 new_nranges);
2555
bc15410e 2556 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3b0bdc72
UD
2557 return REG_ESPACE;
2558
2559 mbcset->range_starts = new_array_start;
2560 mbcset->range_ends = new_array_end;
2561 *range_alloc = new_nranges;
2562 }
c0a0f9a3 2563# endif /* RE_ENABLE_I18N */
3b0bdc72 2564
434d3784
UD
2565 /* Equivalence Classes and Character Classes can't be a range
2566 start/end. */
bc15410e
UD
2567 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2568 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2569 0))
3b0bdc72
UD
2570 return REG_ERANGE;
2571
2572 start_collseq = lookup_collation_sequence_value (start_elem);
2573 end_collseq = lookup_collation_sequence_value (end_elem);
2574 /* Check start/end collation sequence values. */
bc15410e 2575 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
3b0bdc72 2576 return REG_ECOLLATE;
bc15410e 2577 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
3b0bdc72
UD
2578 return REG_ERANGE;
2579
c0a0f9a3 2580# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2581 /* Got valid collation sequence values, add them as a new entry. */
2582 mbcset->range_starts[mbcset->nranges] = start_collseq;
2583 mbcset->range_ends[mbcset->nranges++] = end_collseq;
c0a0f9a3 2584# endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2585
2586 /* Build the table for single byte characters. */
2587 for (ch = 0; ch <= SBC_MAX; ch++)
2588 {
2589 uint32_t ch_collseq;
2590 /*
2591 if (MB_CUR_MAX == 1)
2592 */
2593 if (nrules == 0)
2594 ch_collseq = collseqmb[ch];
2595 else
2596 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2597 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2598 bitset_set (sbcset, ch);
2599 }
2600 return REG_NOERROR;
2601 }
3b0bdc72 2602
434d3784 2603 /* Local function for parse_bracket_exp used in _LIBC environement.
3b0bdc72
UD
2604 Build the collating element which is represented by NAME.
2605 The result are written to MBCSET and SBCSET.
2606 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2607 pointer argument sinse we may update it. */
2608
2609 static inline reg_errcode_t
c0a0f9a3
UD
2610# ifdef RE_ENABLE_I18N
2611 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3b0bdc72 2612 re_charset_t *mbcset;
3b0bdc72 2613 int *coll_sym_alloc;
c0a0f9a3
UD
2614# else /* not RE_ENABLE_I18N */
2615 build_collating_symbol (sbcset, name)
2616# endif /* not RE_ENABLE_I18N */
2617 re_bitset_ptr_t sbcset;
3b0bdc72
UD
2618 unsigned char *name;
2619 {
3b0bdc72
UD
2620 int32_t elem, idx;
2621 if (nrules != 0)
2622 {
2623 elem = seek_collating_symbol_entry (name, strlen (name));
2624 if (symb_table[2 * elem] != 0)
2625 {
2626 /* We found the entry. */
2627 idx = symb_table[2 * elem + 1];
2628 /* Skip the name of collating element name. */
2629 idx += 1 + extra[idx];
2630 }
2631 else if (symb_table[2 * elem] == 0 && strlen (name) == 1)
2632 {
2633 /* No valid character, treat it as a normal
2634 character. */
2635 bitset_set (sbcset, name[0]);
2636 return REG_NOERROR;
2637 }
2638 else
2639 return REG_ECOLLATE;
2640
c0a0f9a3 2641# ifdef RE_ENABLE_I18N
3b0bdc72
UD
2642 /* Got valid collation sequence, add it as a new entry. */
2643 /* Check the space of the arrays. */
a9388965
UD
2644 if (*coll_sym_alloc == mbcset->ncoll_syms)
2645 {
2646 /* Not enough, realloc it. */
2647 /* +1 in case of mbcset->ncoll_syms is 0. */
2648 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2649 /* Use realloc since mbcset->coll_syms is NULL
2650 if *alloc == 0. */
2651 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2652 *coll_sym_alloc);
bc15410e 2653 if (BE (mbcset->coll_syms == NULL, 0))
a9388965
UD
2654 return REG_ESPACE;
2655 }
3b0bdc72 2656 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
c0a0f9a3 2657# endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2658 return REG_NOERROR;
2659 }
2660 else
3b0bdc72 2661 {
bc15410e 2662 if (BE (strlen (name) != 1, 0))
3b0bdc72
UD
2663 return REG_ECOLLATE;
2664 else
2665 {
2666 bitset_set (sbcset, name[0]);
2667 return REG_NOERROR;
2668 }
2669 }
2670 }
434d3784
UD
2671#endif
2672
3b0bdc72
UD
2673 re_token_t br_token;
2674 re_bitset_ptr_t sbcset;
c0a0f9a3 2675#ifdef RE_ENABLE_I18N
3b0bdc72 2676 re_charset_t *mbcset;
3b0bdc72
UD
2677 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2678 int equiv_class_alloc = 0, char_class_alloc = 0;
c0a0f9a3
UD
2679#else /* not RE_ENABLE_I18N */
2680 int non_match = 0;
2681#endif /* not RE_ENABLE_I18N */
2682 bin_tree_t *work_tree;
2683 int token_len, new_idx;
3b0bdc72
UD
2684#ifdef _LIBC
2685 collseqmb = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2686 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2687 if (nrules)
2688 {
2689 /*
2690 if (MB_CUR_MAX > 1)
2691 */
2692 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2693 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2694 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2695 _NL_COLLATE_SYMB_TABLEMB);
2696 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2697 _NL_COLLATE_SYMB_EXTRAMB);
2698 }
2699#endif
2700 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 2701#ifdef RE_ENABLE_I18N
3b0bdc72 2702 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
2703#endif /* RE_ENABLE_I18N */
2704#ifdef RE_ENABLE_I18N
bc15410e 2705 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
2706#else
2707 if (BE (sbcset == NULL, 0))
2708#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2709 {
2710 *err = REG_ESPACE;
2711 return NULL;
2712 }
2713
2714 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2715 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2716 {
3b0bdc72 2717 *err = REG_BADPAT;
434d3784 2718 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2719 }
2720 if (token->type == OP_NON_MATCH_LIST)
2721 {
c0a0f9a3 2722#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2723 int i;
2724 mbcset->non_match = 1;
c0a0f9a3
UD
2725#else /* not RE_ENABLE_I18N */
2726 non_match = 1;
2727#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
2728 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2729 bitset_set (sbcset, '\0');
2730 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2731 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2732 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2733 {
3b0bdc72 2734 *err = REG_BADPAT;
434d3784 2735 goto parse_bracket_exp_free_return;
3b0bdc72 2736 }
c0a0f9a3 2737#ifdef RE_ENABLE_I18N
3b0bdc72
UD
2738 if (MB_CUR_MAX > 1)
2739 for (i = 0; i < SBC_MAX; ++i)
2740 if (__btowc (i) == WEOF)
2741 bitset_set (sbcset, i);
c0a0f9a3 2742#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2743 }
2744
2745 /* We treat the first ']' as a normal character. */
2746 if (token->type == OP_CLOSE_BRACKET)
2747 token->type = CHARACTER;
2748
2749 while (1)
2750 {
2751 bracket_elem_t start_elem, end_elem;
2752 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2753 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2754 reg_errcode_t ret;
2755 int token_len2 = 0, is_range_exp = 0;
2756 re_token_t token2;
2757
2758 start_elem.opr.name = start_name_buf;
2759 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2760 syntax);
bc15410e 2761 if (BE (ret != REG_NOERROR, 0))
602c2f9d 2762 {
602c2f9d 2763 *err = ret;
434d3784 2764 goto parse_bracket_exp_free_return;
602c2f9d 2765 }
3b0bdc72
UD
2766
2767 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2768 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2769 {
3b0bdc72 2770 *err = REG_BADPAT;
434d3784 2771 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2772 }
2773 if (token->type == OP_CHARSET_RANGE)
2774 {
2775 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2776 token_len2 = peek_token_bracket (&token2, regexp, syntax);
bc15410e 2777 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2778 {
3b0bdc72 2779 *err = REG_BADPAT;
434d3784 2780 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2781 }
2782 if (token2.type == OP_CLOSE_BRACKET)
2783 {
2784 /* We treat the last '-' as a normal character. */
2785 re_string_skip_bytes (regexp, -token_len);
2786 token->type = CHARACTER;
2787 }
2788 else
2789 is_range_exp = 1;
2790 }
2791
2792 if (is_range_exp == 1)
2793 {
2794 end_elem.opr.name = end_name_buf;
2795 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2796 dfa, syntax);
bc15410e 2797 if (BE (ret != REG_NOERROR, 0))
602c2f9d 2798 {
602c2f9d 2799 *err = ret;
434d3784 2800 goto parse_bracket_exp_free_return;
602c2f9d 2801 }
3b0bdc72
UD
2802
2803 token_len = peek_token_bracket (token, regexp, syntax);
bc15410e 2804 if (BE (token->type == END_OF_RE, 0))
3b0bdc72 2805 {
3b0bdc72 2806 *err = REG_BADPAT;
434d3784 2807 goto parse_bracket_exp_free_return;
3b0bdc72 2808 }
c0a0f9a3
UD
2809 *err = build_range_exp (sbcset,
2810#ifdef RE_ENABLE_I18N
2811 mbcset, &range_alloc,
2812#endif /* RE_ENABLE_I18N */
2813 &start_elem, &end_elem);
bc15410e 2814 if (BE (*err != REG_NOERROR, 0))
434d3784 2815 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2816 }
2817 else
2818 {
2819 switch (start_elem.type)
2820 {
2821 case SB_CHAR:
2822 bitset_set (sbcset, start_elem.opr.ch);
2823 break;
c0a0f9a3 2824#ifdef RE_ENABLE_I18N
3b0bdc72 2825 case MB_CHAR:
a9388965
UD
2826 /* Check whether the array has enough space. */
2827 if (mbchar_alloc == mbcset->nmbchars)
2828 {
2829 /* Not enough, realloc it. */
2830 /* +1 in case of mbcset->nmbchars is 0. */
2831 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2832 /* Use realloc since array is NULL if *alloc == 0. */
2833 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2834 mbchar_alloc);
bc15410e 2835 if (BE (mbcset->mbchars == NULL, 0))
a9388965
UD
2836 goto parse_bracket_exp_espace;
2837 }
3b0bdc72
UD
2838 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2839 break;
c0a0f9a3 2840#endif /* RE_ENABLE_I18N */
3b0bdc72 2841 case EQUIV_CLASS:
c0a0f9a3
UD
2842 *err = build_equiv_class (sbcset,
2843#ifdef RE_ENABLE_I18N
2844 mbcset, &equiv_class_alloc,
2845#endif /* RE_ENABLE_I18N */
3b0bdc72 2846 start_elem.opr.name);
bc15410e 2847 if (BE (*err != REG_NOERROR, 0))
434d3784 2848 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2849 break;
2850 case COLL_SYM:
c0a0f9a3
UD
2851 *err = build_collating_symbol (sbcset,
2852#ifdef RE_ENABLE_I18N
2853 mbcset, &coll_sym_alloc,
2854#endif /* RE_ENABLE_I18N */
3b0bdc72 2855 start_elem.opr.name);
bc15410e 2856 if (BE (*err != REG_NOERROR, 0))
434d3784 2857 goto parse_bracket_exp_free_return;
3b0bdc72
UD
2858 break;
2859 case CHAR_CLASS:
c0a0f9a3
UD
2860 ret = build_charclass (sbcset,
2861#ifdef RE_ENABLE_I18N
2862 mbcset, &char_class_alloc,
2863#endif /* RE_ENABLE_I18N */
602c2f9d 2864 start_elem.opr.name, syntax);
bc15410e 2865 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
2866 goto parse_bracket_exp_espace;
2867 break;
2868 default:
2869 assert (0);
2870 break;
2871 }
2872 }
2873 if (token->type == OP_CLOSE_BRACKET)
2874 break;
2875 }
2876
2877 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2878
2879 /* If it is non-matching list. */
c0a0f9a3 2880#ifdef RE_ENABLE_I18N
3b0bdc72 2881 if (mbcset->non_match)
c0a0f9a3
UD
2882#else /* not RE_ENABLE_I18N */
2883 if (non_match)
2884#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
2885 bitset_not (sbcset);
2886
2887 /* Build a tree for simple bracket. */
2888 br_token.type = SIMPLE_BRACKET;
2889 br_token.opr.sbcset = sbcset;
2890 new_idx = re_dfa_add_node (dfa, br_token, 0);
2891 work_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2892 if (BE (new_idx == -1 || work_tree == NULL, 0))
3b0bdc72
UD
2893 goto parse_bracket_exp_espace;
2894
c0a0f9a3 2895#ifdef RE_ENABLE_I18N
3b0bdc72 2896 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
434d3784
UD
2897 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2898 || mbcset->non_match)))
3b0bdc72
UD
2899 {
2900 re_token_t alt_token;
2901 bin_tree_t *mbc_tree;
2902 /* Build a tree for complex bracket. */
2903 br_token.type = COMPLEX_BRACKET;
2904 br_token.opr.mbcset = mbcset;
2905 dfa->has_mb_node = 1;
2906 new_idx = re_dfa_add_node (dfa, br_token, 0);
2907 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 2908 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3b0bdc72
UD
2909 goto parse_bracket_exp_espace;
2910 /* Then join them by ALT node. */
2911 alt_token.type = OP_ALT;
2912 new_idx = re_dfa_add_node (dfa, alt_token, 0);
2913 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
bc15410e 2914 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3b0bdc72
UD
2915 return work_tree;
2916 }
2917 else
2918 {
2919 free_charset (mbcset);
2920 return work_tree;
2921 }
c0a0f9a3
UD
2922#else /* not RE_ENABLE_I18N */
2923 return work_tree;
2924#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
2925
2926 parse_bracket_exp_espace:
3b0bdc72 2927 *err = REG_ESPACE;
434d3784
UD
2928 parse_bracket_exp_free_return:
2929 re_free (sbcset);
c0a0f9a3 2930#ifdef RE_ENABLE_I18N
434d3784 2931 free_charset (mbcset);
c0a0f9a3 2932#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
2933 return NULL;
2934}
2935
434d3784
UD
2936/* Parse an element in the bracket expression. */
2937
3b0bdc72
UD
2938static reg_errcode_t
2939parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
2940 bracket_elem_t *elem;
2941 re_string_t *regexp;
2942 re_token_t *token;
2943 int token_len;
2944 re_dfa_t *dfa;
2945 reg_syntax_t syntax;
2946{
2947#ifdef RE_ENABLE_I18N
2948 int cur_char_size;
2949 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
2950 if (cur_char_size > 1)
2951 {
2952 elem->type = MB_CHAR;
2953 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
2954 re_string_skip_bytes (regexp, cur_char_size);
2955 return REG_NOERROR;
2956 }
2957#endif /* RE_ENABLE_I18N */
2958 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2959 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
2960 || token->type == OP_OPEN_EQUIV_CLASS)
2961 return parse_bracket_symbol (elem, regexp, token);
2962 elem->type = SB_CHAR;
2963 elem->opr.ch = token->opr.c;
2964 return REG_NOERROR;
2965}
2966
434d3784
UD
2967/* Parse a bracket symbol in the bracket expression. Bracket symbols are
2968 such as [:<character_class>:], [.<collating_element>.], and
2969 [=<equivalent_class>=]. */
2970
3b0bdc72
UD
2971static reg_errcode_t
2972parse_bracket_symbol (elem, regexp, token)
2973 bracket_elem_t *elem;
2974 re_string_t *regexp;
2975 re_token_t *token;
2976{
2977 unsigned char ch, delim = token->opr.c;
2978 int i = 0;
602c2f9d 2979 for (;; ++i)
3b0bdc72 2980 {
602c2f9d
UD
2981 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
2982 return REG_EBRACK;
3b0bdc72
UD
2983 if (token->type == OP_OPEN_CHAR_CLASS)
2984 ch = re_string_fetch_byte_case (regexp);
2985 else
2986 ch = re_string_fetch_byte (regexp);
2987 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
2988 break;
2989 elem->opr.name[i] = ch;
2990 }
2991 re_string_skip_bytes (regexp, 1);
2992 elem->opr.name[i] = '\0';
2993 switch (token->type)
2994 {
2995 case OP_OPEN_COLL_ELEM:
2996 elem->type = COLL_SYM;
2997 break;
2998 case OP_OPEN_EQUIV_CLASS:
2999 elem->type = EQUIV_CLASS;
3000 break;
3001 case OP_OPEN_CHAR_CLASS:
3002 elem->type = CHAR_CLASS;
3003 break;
3004 default:
3005 break;
3006 }
3007 return REG_NOERROR;
3008}
3009
3010 /* Helper function for parse_bracket_exp.
3011 Build the equivalence class which is represented by NAME.
3012 The result are written to MBCSET and SBCSET.
3013 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3014 is a pointer argument sinse we may update it. */
3015
3016static reg_errcode_t
c0a0f9a3
UD
3017#ifdef RE_ENABLE_I18N
3018build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3b0bdc72 3019 re_charset_t *mbcset;
3b0bdc72 3020 int *equiv_class_alloc;
c0a0f9a3
UD
3021#else /* not RE_ENABLE_I18N */
3022build_equiv_class (sbcset, name)
3023#endif /* not RE_ENABLE_I18N */
3024 re_bitset_ptr_t sbcset;
3b0bdc72
UD
3025 const unsigned char *name;
3026{
c0a0f9a3 3027#if defined _LIBC && defined RE_ENABLE_I18N
3b0bdc72
UD
3028 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3029 if (nrules != 0)
3030 {
3031 const int32_t *table, *indirect;
3032 const unsigned char *weights, *extra, *cp;
3033 unsigned char char_buf[2];
3034 int32_t idx1, idx2;
3035 unsigned int ch;
3036 size_t len;
3037 /* This #include defines a local function! */
3038# include <locale/weight.h>
3039 /* Calculate the index for equivalence class. */
3040 cp = name;
3041 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3042 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3043 _NL_COLLATE_WEIGHTMB);
3044 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3045 _NL_COLLATE_EXTRAMB);
3046 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3047 _NL_COLLATE_INDIRECTMB);
3048 idx1 = findidx (&cp);
bc15410e 3049 if (BE (idx1 == 0 || cp < name + strlen (name), 0))
3b0bdc72
UD
3050 /* This isn't a valid character. */
3051 return REG_ECOLLATE;
3052
3053 /* Build single byte matcing table for this equivalence class. */
3054 char_buf[1] = '\0';
3055 len = weights[idx1];
3056 for (ch = 0; ch < SBC_MAX; ++ch)
3057 {
3058 char_buf[0] = ch;
3059 cp = char_buf;
3060 idx2 = findidx (&cp);
3061/*
3062 idx2 = table[ch];
3063*/
3064 if (idx2 == 0)
3065 /* This isn't a valid character. */
3066 continue;
3067 if (len == weights[idx2])
3068 {
3069 int cnt = 0;
3070 while (cnt <= len &&
3071 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3072 ++cnt;
3073
3074 if (cnt > len)
3075 bitset_set (sbcset, ch);
3076 }
3077 }
a9388965
UD
3078 /* Check whether the array has enough space. */
3079 if (*equiv_class_alloc == mbcset->nequiv_classes)
3080 {
3081 /* Not enough, realloc it. */
3082 /* +1 in case of mbcset->nequiv_classes is 0. */
3083 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3084 /* Use realloc since the array is NULL if *alloc == 0. */
3085 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3086 *equiv_class_alloc);
bc15410e 3087 if (BE (mbcset->equiv_classes == NULL, 0))
a9388965
UD
3088 return REG_ESPACE;
3089 }
3b0bdc72
UD
3090 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3091 }
3092 else
c0a0f9a3 3093#endif /* _LIBC && RE_ENABLE_I18N */
3b0bdc72 3094 {
bc15410e 3095 if (BE (strlen (name) != 1, 0))
3b0bdc72
UD
3096 return REG_ECOLLATE;
3097 bitset_set (sbcset, name[0]);
3098 }
3099 return REG_NOERROR;
3100}
3101
3102 /* Helper function for parse_bracket_exp.
3103 Build the character class which is represented by NAME.
3104 The result are written to MBCSET and SBCSET.
3105 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3106 is a pointer argument sinse we may update it. */
3107
3108static reg_errcode_t
c0a0f9a3
UD
3109#ifdef RE_ENABLE_I18N
3110build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3b0bdc72 3111 re_charset_t *mbcset;
3b0bdc72 3112 int *char_class_alloc;
c0a0f9a3
UD
3113#else /* not RE_ENABLE_I18N */
3114build_charclass (sbcset, class_name, syntax)
3115#endif /* not RE_ENABLE_I18N */
3116 re_bitset_ptr_t sbcset;
602c2f9d
UD
3117 const unsigned char *class_name;
3118 reg_syntax_t syntax;
3b0bdc72
UD
3119{
3120 int i;
602c2f9d 3121 const unsigned char *name = class_name;
c0a0f9a3
UD
3122
3123 /* In case of REG_ICASE "upper" and "lower" match the both of
3124 upper and lower cases. */
3125 if ((syntax & RE_ICASE)
3126 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3127 name = "alpha";
3128
3129#ifdef RE_ENABLE_I18N
3b0bdc72 3130 /* Check the space of the arrays. */
a9388965
UD
3131 if (*char_class_alloc == mbcset->nchar_classes)
3132 {
3133 /* Not enough, realloc it. */
3134 /* +1 in case of mbcset->nchar_classes is 0. */
3135 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3136 /* Use realloc since array is NULL if *alloc == 0. */
3137 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3138 *char_class_alloc);
bc15410e 3139 if (BE (mbcset->char_classes == NULL, 0))
a9388965
UD
3140 return REG_ESPACE;
3141 }
3b0bdc72 3142 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
c0a0f9a3 3143#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3144
3145#define BUILD_CHARCLASS_LOOP(ctype_func)\
3146 for (i = 0; i < SBC_MAX; ++i) \
3147 { \
3148 if (ctype_func (i)) \
3149 bitset_set (sbcset, i); \
3150 }
3151
3152 if (strcmp (name, "alnum") == 0)
3153 BUILD_CHARCLASS_LOOP (isalnum)
3154 else if (strcmp (name, "cntrl") == 0)
3155 BUILD_CHARCLASS_LOOP (iscntrl)
3156 else if (strcmp (name, "lower") == 0)
3157 BUILD_CHARCLASS_LOOP (islower)
3158 else if (strcmp (name, "space") == 0)
3159 BUILD_CHARCLASS_LOOP (isspace)
3160 else if (strcmp (name, "alpha") == 0)
3161 BUILD_CHARCLASS_LOOP (isalpha)
3162 else if (strcmp (name, "digit") == 0)
3163 BUILD_CHARCLASS_LOOP (isdigit)
3164 else if (strcmp (name, "print") == 0)
3165 BUILD_CHARCLASS_LOOP (isprint)
3166 else if (strcmp (name, "upper") == 0)
3167 BUILD_CHARCLASS_LOOP (isupper)
3168 else if (strcmp (name, "blank") == 0)
3169 BUILD_CHARCLASS_LOOP (isblank)
3170 else if (strcmp (name, "graph") == 0)
3171 BUILD_CHARCLASS_LOOP (isgraph)
3172 else if (strcmp (name, "punct") == 0)
3173 BUILD_CHARCLASS_LOOP (ispunct)
3174 else if (strcmp (name, "xdigit") == 0)
3175 BUILD_CHARCLASS_LOOP (isxdigit)
3176 else
3177 return REG_ECTYPE;
3178
3179 return REG_NOERROR;
3180}
3181
3182static bin_tree_t *
3183build_word_op (dfa, not, err)
3184 re_dfa_t *dfa;
3185 int not;
3186 reg_errcode_t *err;
3187{
3188 re_bitset_ptr_t sbcset;
c0a0f9a3 3189#ifdef RE_ENABLE_I18N
3b0bdc72 3190 re_charset_t *mbcset;
c0a0f9a3
UD
3191 int alloc = 0;
3192#else /* not RE_ENABLE_I18N */
3193 int non_match = 0;
3194#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3195 reg_errcode_t ret;
3196 re_token_t br_token;
3197 bin_tree_t *tree;
c0a0f9a3 3198 int new_idx;
3b0bdc72
UD
3199
3200 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
c0a0f9a3 3201#ifdef RE_ENABLE_I18N
3b0bdc72 3202 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
c0a0f9a3
UD
3203#endif /* RE_ENABLE_I18N */
3204
3205#ifdef RE_ENABLE_I18N
bc15410e 3206 if (BE (sbcset == NULL || mbcset == NULL, 0))
c0a0f9a3
UD
3207#else /* not RE_ENABLE_I18N */
3208 if (BE (sbcset == NULL, 0))
3209#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3210 {
3211 *err = REG_ESPACE;
3212 return NULL;
3213 }
3214
3215 if (not)
3216 {
c0a0f9a3 3217#ifdef RE_ENABLE_I18N
3b0bdc72 3218 int i;
3b0bdc72
UD
3219 /*
3220 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3221 bitset_set(cset->sbcset, '\0');
3222 */
c0a0f9a3 3223 mbcset->non_match = 1;
3b0bdc72
UD
3224 if (MB_CUR_MAX > 1)
3225 for (i = 0; i < SBC_MAX; ++i)
3226 if (__btowc (i) == WEOF)
3227 bitset_set (sbcset, i);
c0a0f9a3
UD
3228#else /* not RE_ENABLE_I18N */
3229 non_match = 1;
3230#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3231 }
3232
602c2f9d 3233 /* We don't care the syntax in this case. */
c0a0f9a3
UD
3234 ret = build_charclass (sbcset,
3235#ifdef RE_ENABLE_I18N
3236 mbcset, &alloc,
3237#endif /* RE_ENABLE_I18N */
3238 "alpha", 0);
3239
bc15410e 3240 if (BE (ret != REG_NOERROR, 0))
3b0bdc72
UD
3241 {
3242 re_free (sbcset);
c0a0f9a3 3243#ifdef RE_ENABLE_I18N
3b0bdc72 3244 free_charset (mbcset);
c0a0f9a3 3245#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3246 *err = REG_ESPACE;
3247 return NULL;
3248 }
434d3784
UD
3249 /* \w match '_' also. */
3250 bitset_set (sbcset, '_');
3b0bdc72
UD
3251
3252 /* If it is non-matching list. */
c0a0f9a3 3253#ifdef RE_ENABLE_I18N
3b0bdc72 3254 if (mbcset->non_match)
c0a0f9a3
UD
3255#else /* not RE_ENABLE_I18N */
3256 if (non_match)
3257#endif /* not RE_ENABLE_I18N */
3b0bdc72
UD
3258 bitset_not (sbcset);
3259
3260 /* Build a tree for simple bracket. */
3261 br_token.type = SIMPLE_BRACKET;
3262 br_token.opr.sbcset = sbcset;
3263 new_idx = re_dfa_add_node (dfa, br_token, 0);
3264 tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3265 if (BE (new_idx == -1 || tree == NULL, 0))
3b0bdc72
UD
3266 goto build_word_op_espace;
3267
c0a0f9a3 3268#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3269 if (MB_CUR_MAX > 1)
3270 {
3271 re_token_t alt_token;
3272 bin_tree_t *mbc_tree;
3273 /* Build a tree for complex bracket. */
3274 br_token.type = COMPLEX_BRACKET;
3275 br_token.opr.mbcset = mbcset;
3276 dfa->has_mb_node = 1;
3277 new_idx = re_dfa_add_node (dfa, br_token, 0);
3278 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
bc15410e 3279 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3b0bdc72
UD
3280 goto build_word_op_espace;
3281 /* Then join them by ALT node. */
3282 alt_token.type = OP_ALT;
3283 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3284 tree = create_tree (tree, mbc_tree, 0, new_idx);
bc15410e 3285 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3b0bdc72
UD
3286 return tree;
3287 }
3288 else
3289 {
3290 free_charset (mbcset);
3291 return tree;
3292 }
c0a0f9a3
UD
3293#else /* not RE_ENABLE_I18N */
3294 return tree;
3295#endif /* not RE_ENABLE_I18N */
3296
3b0bdc72
UD
3297 build_word_op_espace:
3298 re_free (sbcset);
c0a0f9a3 3299#ifdef RE_ENABLE_I18N
3b0bdc72 3300 free_charset (mbcset);
c0a0f9a3 3301#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3302 *err = REG_ESPACE;
3303 return NULL;
3304}
3305
3306/* This is intended for the expressions like "a{1,3}".
3307 Fetch a number from `input', and return the number.
3308 Return -1, if the number field is empty like "{,1}".
3309 Return -2, If an error is occured. */
3310
3311static int
3312fetch_number (input, token, syntax)
3313 re_string_t *input;
3314 re_token_t *token;
3315 reg_syntax_t syntax;
3316{
3317 int num = -1;
3318 unsigned char c;
3319 while (1)
3320 {
3321 *token = fetch_token (input, syntax);
3322 c = token->opr.c;
602c2f9d
UD
3323 if (BE (token->type == END_OF_RE, 0))
3324 return -2;
3b0bdc72
UD
3325 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3326 break;
602c2f9d
UD
3327 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3328 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3329 num = (num > RE_DUP_MAX) ? -2 : num;
3b0bdc72 3330 }
3b0bdc72
UD
3331 return num;
3332}
3333\f
c0a0f9a3 3334#ifdef RE_ENABLE_I18N
3b0bdc72
UD
3335static void
3336free_charset (re_charset_t *cset)
3337{
3338 re_free (cset->mbchars);
c0a0f9a3 3339# ifdef _LIBC
3b0bdc72
UD
3340 re_free (cset->coll_syms);
3341 re_free (cset->equiv_classes);
3342 re_free (cset->range_starts);
3343 re_free (cset->range_ends);
c0a0f9a3 3344# endif
3b0bdc72
UD
3345 re_free (cset->char_classes);
3346 re_free (cset);
3347}
c0a0f9a3 3348#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
3349\f
3350/* Functions for binary tree operation. */
3351
3352/* Create a node of tree.
3353 Note: This function automatically free left and right if malloc fails. */
3354
3355static bin_tree_t *
3356create_tree (left, right, type, index)
3357 bin_tree_t *left;
3358 bin_tree_t *right;
3359 re_token_type_t type;
3360 int index;
3361{
3362 bin_tree_t *tree;
3363 tree = re_malloc (bin_tree_t, 1);
bc15410e 3364 if (BE (tree == NULL, 0))
3b0bdc72
UD
3365 {
3366 free_bin_tree (left);
3367 free_bin_tree (right);
3368 return NULL;
3369 }
3370 tree->parent = NULL;
3371 tree->left = left;
3372 tree->right = right;
3373 tree->type = type;
3374 tree->node_idx = index;
3375 tree->first = -1;
3376 tree->next = -1;
3377 re_node_set_init_empty (&tree->eclosure);
3378
3379 if (left != NULL)
3380 left->parent = tree;
3381 if (right != NULL)
3382 right->parent = tree;
3383 return tree;
3384}
3385
3386/* Free the sub tree pointed by TREE. */
3387
3388static void
3389free_bin_tree (tree)
3390 bin_tree_t *tree;
3391{
3392 if (tree == NULL)
3393 return;
3394 /*re_node_set_free (&tree->eclosure);*/
3395 free_bin_tree (tree->left);
3396 free_bin_tree (tree->right);
3397 re_free (tree);
3398}
3399
3400/* Duplicate the node SRC, and return new node. */
3401
3402static bin_tree_t *
3403duplicate_tree (src, dfa)
3404 const bin_tree_t *src;
3405 re_dfa_t *dfa;
3406{
3407 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3408 int new_node_idx;
3409 /* Since node indies must be according to Post-order of the tree,
3410 we must duplicate the left at first. */
3411 if (src->left != NULL)
3412 {
3413 left = duplicate_tree (src->left, dfa);
3414 if (left == NULL)
3415 return NULL;
3416 }
3417
3418 /* Secondaly, duplicate the right. */
3419 if (src->right != NULL)
3420 {
3421 right = duplicate_tree (src->right, dfa);
3422 if (right == NULL)
3423 {
3424 free_bin_tree (left);
3425 return NULL;
3426 }
3427 }
3428
3429 /* At last, duplicate itself. */
3430 if (src->type == NON_TYPE)
3431 {
3432 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3433 dfa->nodes[new_node_idx].duplicated = 1;
bc15410e 3434 if (BE (new_node_idx == -1, 0))
3b0bdc72
UD
3435 {
3436 free_bin_tree (left);
3437 free_bin_tree (right);
3438 return NULL;
3439 }
3440 }
3441 else
3442 new_node_idx = src->type;
3443
3444 new_tree = create_tree (left, right, src->type, new_node_idx);
bc15410e 3445 if (BE (new_tree == NULL, 0))
3b0bdc72
UD
3446 {
3447 free_bin_tree (left);
3448 free_bin_tree (right);
3449 }
3450 return new_tree;
3451}