1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static reg_errcode_t
init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
37 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
38 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
39 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
40 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
41 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
42 int top_clone_node
, int root_node
,
43 unsigned int constraint
);
44 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
45 unsigned int constraint
);
46 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
47 unsigned int constraint
);
48 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
49 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
51 static void calc_inveclosure (re_dfa_t
*dfa
);
52 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
54 static re_token_t
fetch_token (re_string_t
*input
, reg_syntax_t syntax
);
55 static int peek_token (re_token_t
*token
, re_string_t
*input
,
57 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
59 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
60 reg_syntax_t syntax
, reg_errcode_t
*err
);
61 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
62 re_token_t
*token
, reg_syntax_t syntax
,
63 int nest
, reg_errcode_t
*err
);
64 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
65 re_token_t
*token
, reg_syntax_t syntax
,
66 int nest
, reg_errcode_t
*err
);
67 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
68 re_token_t
*token
, reg_syntax_t syntax
,
69 int nest
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
74 re_dfa_t
*dfa
, re_token_t
*token
,
75 reg_syntax_t syntax
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
77 re_token_t
*token
, reg_syntax_t syntax
,
79 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
81 re_token_t
*token
, int token_len
,
85 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
89 # ifdef RE_ENABLE_I18N
90 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
91 re_charset_t
*mbcset
, int *range_alloc
,
92 bracket_elem_t
*start_elem
,
93 bracket_elem_t
*end_elem
);
94 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
97 const unsigned char *name
);
98 # else /* not RE_ENABLE_I18N */
99 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
100 bracket_elem_t
*start_elem
,
101 bracket_elem_t
*end_elem
);
102 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
103 const unsigned char *name
);
104 # endif /* not RE_ENABLE_I18N */
105 #endif /* not _LIBC */
106 #ifdef RE_ENABLE_I18N
107 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
108 re_charset_t
*mbcset
,
109 int *equiv_class_alloc
,
110 const unsigned char *name
);
111 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
112 re_bitset_ptr_t sbcset
,
113 re_charset_t
*mbcset
,
114 int *char_class_alloc
,
115 const unsigned char *class_name
,
116 reg_syntax_t syntax
);
117 #else /* not RE_ENABLE_I18N */
118 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
119 const unsigned char *name
);
120 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
121 re_bitset_ptr_t sbcset
,
122 const unsigned char *class_name
,
123 reg_syntax_t syntax
);
124 #endif /* not RE_ENABLE_I18N */
125 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
126 const unsigned char *class_name
,
127 const unsigned char *extra
, int not,
129 static void free_bin_tree (bin_tree_t
*tree
);
130 static bin_tree_t
*create_tree (bin_tree_t
*left
, bin_tree_t
*right
,
131 re_token_type_t type
, int index
);
132 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
134 /* This table gives an error message for each of the error codes listed
135 in regex.h. Obviously the order here has to be same as there.
136 POSIX doesn't require that we do anything for REG_NOERROR,
137 but why not be nice? */
139 const char __re_error_msgid
[] attribute_hidden
=
141 #define REG_NOERROR_IDX 0
142 gettext_noop ("Success") /* REG_NOERROR */
144 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
145 gettext_noop ("No match") /* REG_NOMATCH */
147 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
148 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
150 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
151 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
153 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
154 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
156 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
157 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
159 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
160 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
162 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
163 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
165 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
166 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
168 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
169 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
171 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
172 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
174 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
175 gettext_noop ("Invalid range end") /* REG_ERANGE */
177 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
178 gettext_noop ("Memory exhausted") /* REG_ESPACE */
180 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
181 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
183 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
184 gettext_noop ("Premature end of regular expression") /* REG_EEND */
186 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
187 gettext_noop ("Regular expression too big") /* REG_ESIZE */
189 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
190 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
193 const size_t __re_error_msgid_idx
[] attribute_hidden
=
214 /* Entry points for GNU code. */
216 /* re_compile_pattern is the GNU regular expression compiler: it
217 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
218 Returns 0 if the pattern was valid, otherwise an error string.
220 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
221 are set in BUFP on entry. */
224 re_compile_pattern (pattern
, length
, bufp
)
227 struct re_pattern_buffer
*bufp
;
231 /* And GNU code determines whether or not to get register information
232 by passing null for the REGS argument to re_match, etc., not by
236 /* Match anchors at newline. */
237 bufp
->newline_anchor
= 1;
239 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
243 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
246 weak_alias (__re_compile_pattern
, re_compile_pattern
)
249 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
250 also be assigned to arbitrarily: each pattern buffer stores its own
251 syntax, so it can be changed between regex compilations. */
252 /* This has no initializer because initialized variables in Emacs
253 become read-only after dumping. */
254 reg_syntax_t re_syntax_options
;
257 /* Specify the precise syntax of regexps for compilation. This provides
258 for compatibility for various utilities which historically have
259 different, incompatible syntaxes.
261 The argument SYNTAX is a bit mask comprised of the various bits
262 defined in regex.h. We return the old syntax. */
265 re_set_syntax (syntax
)
268 reg_syntax_t ret
= re_syntax_options
;
270 re_syntax_options
= syntax
;
274 weak_alias (__re_set_syntax
, re_set_syntax
)
278 re_compile_fastmap (bufp
)
279 struct re_pattern_buffer
*bufp
;
281 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
282 char *fastmap
= bufp
->fastmap
;
284 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
285 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
286 if (dfa
->init_state
!= dfa
->init_state_word
)
287 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
288 if (dfa
->init_state
!= dfa
->init_state_nl
)
289 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
290 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
291 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
292 bufp
->fastmap_accurate
= 1;
296 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
300 __attribute ((always_inline
))
301 re_set_fastmap (char *fastmap
, int icase
, int ch
)
305 fastmap
[tolower (ch
)] = 1;
308 /* Helper function for re_compile_fastmap.
309 Compile fastmap for the initial_state INIT_STATE. */
312 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
314 const re_dfastate_t
*init_state
;
317 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
319 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
320 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
322 int node
= init_state
->nodes
.elems
[node_cnt
];
323 re_token_type_t type
= dfa
->nodes
[node
].type
;
325 if (type
== CHARACTER
)
327 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
328 #ifdef RE_ENABLE_I18N
329 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
331 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
336 *p
++ = dfa
->nodes
[node
].opr
.c
;
337 while (++node
< dfa
->nodes_len
338 && dfa
->nodes
[node
].type
== CHARACTER
339 && dfa
->nodes
[node
].mb_partial
)
340 *p
++ = dfa
->nodes
[node
].opr
.c
;
341 memset (&state
, 0, sizeof (state
));
342 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
344 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
345 re_set_fastmap (fastmap
, 0, buf
[0]);
349 else if (type
== SIMPLE_BRACKET
)
352 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
353 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
354 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
355 re_set_fastmap (fastmap
, icase
, ch
);
357 #ifdef RE_ENABLE_I18N
358 else if (type
== COMPLEX_BRACKET
)
361 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
362 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
363 || cset
->nranges
|| cset
->nchar_classes
)
366 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
368 /* In this case we want to catch the bytes which are
369 the first byte of any collation elements.
370 e.g. In da_DK, we want to catch 'a' since "aa"
371 is a valid collation element, and don't catch
372 'b' since 'b' is the only collation element
373 which starts from 'b'. */
375 const int32_t *table
= (const int32_t *)
376 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
377 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
378 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
380 re_set_fastmap (fastmap
, icase
, ch
);
383 if (dfa
->mb_cur_max
> 1)
384 for (i
= 0; i
< SBC_MAX
; ++i
)
385 if (__btowc (i
) == WEOF
)
386 re_set_fastmap (fastmap
, icase
, i
);
387 # endif /* not _LIBC */
389 for (i
= 0; i
< cset
->nmbchars
; ++i
)
393 memset (&state
, '\0', sizeof (state
));
394 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
395 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
396 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
398 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
399 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
403 #endif /* RE_ENABLE_I18N */
404 else if (type
== END_OF_RE
|| type
== OP_PERIOD
405 || type
== OP_UTF8_PERIOD
)
407 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
408 if (type
== END_OF_RE
)
409 bufp
->can_be_null
= 1;
415 /* Entry point for POSIX code. */
416 /* regcomp takes a regular expression as a string and compiles it.
418 PREG is a regex_t *. We do not expect any fields to be initialized,
419 since POSIX says we shouldn't. Thus, we set
421 `buffer' to the compiled pattern;
422 `used' to the length of the compiled pattern;
423 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
424 REG_EXTENDED bit in CFLAGS is set; otherwise, to
425 RE_SYNTAX_POSIX_BASIC;
426 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
427 `fastmap' to an allocated space for the fastmap;
428 `fastmap_accurate' to zero;
429 `re_nsub' to the number of subexpressions in PATTERN.
431 PATTERN is the address of the pattern string.
433 CFLAGS is a series of bits which affect compilation.
435 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
436 use POSIX basic syntax.
438 If REG_NEWLINE is set, then . and [^...] don't match newline.
439 Also, regexec will try a match beginning after every newline.
441 If REG_ICASE is set, then we considers upper- and lowercase
442 versions of letters to be equivalent when matching.
444 If REG_NOSUB is set, then when PREG is passed to regexec, that
445 routine will report only success or failure, and nothing about the
448 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
449 the return codes and their meanings.) */
452 regcomp (preg
, pattern
, cflags
)
453 regex_t
*__restrict preg
;
454 const char *__restrict pattern
;
458 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
459 : RE_SYNTAX_POSIX_BASIC
);
465 /* Try to allocate space for the fastmap. */
466 preg
->fastmap
= re_malloc (char, SBC_MAX
);
467 if (BE (preg
->fastmap
== NULL
, 0))
470 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
472 /* If REG_NEWLINE is set, newlines are treated differently. */
473 if (cflags
& REG_NEWLINE
)
474 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
475 syntax
&= ~RE_DOT_NEWLINE
;
476 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
477 /* It also changes the matching behavior. */
478 preg
->newline_anchor
= 1;
481 preg
->newline_anchor
= 0;
482 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
483 preg
->translate
= NULL
;
485 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
487 /* POSIX doesn't distinguish between an unmatched open-group and an
488 unmatched close-group: both are REG_EPAREN. */
489 if (ret
== REG_ERPAREN
)
492 /* We have already checked preg->fastmap != NULL. */
493 if (BE (ret
== REG_NOERROR
, 1))
494 /* Compute the fastmap now, since regexec cannot modify the pattern
495 buffer. This function nevers fails in this implementation. */
496 (void) re_compile_fastmap (preg
);
499 /* Some error occurred while compiling the expression. */
500 re_free (preg
->fastmap
);
501 preg
->fastmap
= NULL
;
507 weak_alias (__regcomp
, regcomp
)
510 /* Returns a message corresponding to an error code, ERRCODE, returned
511 from either regcomp or regexec. We don't use PREG here. */
514 regerror (errcode
, preg
, errbuf
, errbuf_size
)
524 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
525 / sizeof (__re_error_msgid_idx
[0])), 0))
526 /* Only error codes returned by the rest of the code should be passed
527 to this routine. If we are given anything else, or if other regex
528 code generates an invalid error code, then the program has a bug.
529 Dump core so we can fix it. */
532 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
534 msg_size
= strlen (msg
) + 1; /* Includes the null. */
536 if (BE (errbuf_size
!= 0, 1))
538 if (BE (msg_size
> errbuf_size
, 0))
540 #if defined HAVE_MEMPCPY || defined _LIBC
541 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
543 memcpy (errbuf
, msg
, errbuf_size
- 1);
544 errbuf
[errbuf_size
- 1] = 0;
548 memcpy (errbuf
, msg
, msg_size
);
554 weak_alias (__regerror
, regerror
)
559 free_dfa_content (re_dfa_t
*dfa
)
563 re_free (dfa
->subexps
);
565 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
567 re_token_t
*node
= dfa
->nodes
+ i
;
568 #ifdef RE_ENABLE_I18N
569 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
570 free_charset (node
->opr
.mbcset
);
572 #endif /* RE_ENABLE_I18N */
573 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
574 re_free (node
->opr
.sbcset
);
576 re_free (dfa
->nexts
);
577 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
579 if (dfa
->eclosures
!= NULL
)
580 re_node_set_free (dfa
->eclosures
+ i
);
581 if (dfa
->inveclosures
!= NULL
)
582 re_node_set_free (dfa
->inveclosures
+ i
);
583 if (dfa
->edests
!= NULL
)
584 re_node_set_free (dfa
->edests
+ i
);
586 re_free (dfa
->edests
);
587 re_free (dfa
->eclosures
);
588 re_free (dfa
->inveclosures
);
589 re_free (dfa
->nodes
);
591 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
593 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
594 for (j
= 0; j
< entry
->num
; ++j
)
596 re_dfastate_t
*state
= entry
->array
[j
];
599 re_free (entry
->array
);
601 re_free (dfa
->state_table
);
603 if (dfa
->word_char
!= NULL
)
604 re_free (dfa
->word_char
);
606 re_free (dfa
->re_str
);
613 /* Free dynamically allocated space used by PREG. */
619 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
620 if (BE (dfa
!= NULL
, 1))
621 free_dfa_content (dfa
);
623 re_free (preg
->fastmap
);
626 weak_alias (__regfree
, regfree
)
629 /* Entry points compatible with 4.2 BSD regex library. We don't define
630 them unless specifically requested. */
632 #if defined _REGEX_RE_COMP || defined _LIBC
634 /* BSD has one and only one pattern buffer. */
635 static struct re_pattern_buffer re_comp_buf
;
639 /* Make these definitions weak in libc, so POSIX programs can redefine
640 these names if they don't use our functions, and still use
641 regcomp/regexec above without link errors. */
652 if (!re_comp_buf
.buffer
)
653 return gettext ("No previous regular expression");
657 if (re_comp_buf
.buffer
)
659 fastmap
= re_comp_buf
.fastmap
;
660 re_comp_buf
.fastmap
= NULL
;
661 __regfree (&re_comp_buf
);
662 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
663 re_comp_buf
.fastmap
= fastmap
;
666 if (re_comp_buf
.fastmap
== NULL
)
668 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
669 if (re_comp_buf
.fastmap
== NULL
)
670 return (char *) gettext (__re_error_msgid
671 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
674 /* Since `re_exec' always passes NULL for the `regs' argument, we
675 don't need to initialize the pattern buffer fields which affect it. */
677 /* Match anchors at newlines. */
678 re_comp_buf
.newline_anchor
= 1;
680 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
685 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
686 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
690 libc_freeres_fn (free_mem
)
692 __regfree (&re_comp_buf
);
696 #endif /* _REGEX_RE_COMP */
698 /* Internal entry point.
699 Compile the regular expression PATTERN, whose length is LENGTH.
700 SYNTAX indicate regular expression's syntax. */
703 re_compile_internal (preg
, pattern
, length
, syntax
)
705 const char * pattern
;
709 reg_errcode_t err
= REG_NOERROR
;
713 /* Initialize the pattern buffer. */
714 preg
->fastmap_accurate
= 0;
715 preg
->syntax
= syntax
;
716 preg
->not_bol
= preg
->not_eol
= 0;
719 preg
->can_be_null
= 0;
720 preg
->regs_allocated
= REGS_UNALLOCATED
;
722 /* Initialize the dfa. */
723 dfa
= (re_dfa_t
*) preg
->buffer
;
724 if (preg
->allocated
< sizeof (re_dfa_t
))
726 /* If zero allocated, but buffer is non-null, try to realloc
727 enough space. This loses if buffer's address is bogus, but
728 that is the user's responsibility. If ->buffer is NULL this
729 is a simple allocation. */
730 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
733 preg
->allocated
= sizeof (re_dfa_t
);
735 preg
->buffer
= (unsigned char *) dfa
;
736 preg
->used
= sizeof (re_dfa_t
);
738 err
= init_dfa (dfa
, length
);
739 if (BE (err
!= REG_NOERROR
, 0))
747 dfa
->re_str
= re_malloc (char, length
+ 1);
748 strncpy (dfa
->re_str
, pattern
, length
+ 1);
751 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
752 syntax
& RE_ICASE
, dfa
);
753 if (BE (err
!= REG_NOERROR
, 0))
761 /* Parse the regular expression, and build a structure tree. */
763 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
764 if (BE (dfa
->str_tree
== NULL
, 0))
765 goto re_compile_internal_free_return
;
767 #ifdef RE_ENABLE_I18N
768 /* If possible, do searching in single byte encoding to speed things up. */
769 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
773 /* Analyze the tree and collect information which is necessary to
776 if (BE (err
!= REG_NOERROR
, 0))
777 goto re_compile_internal_free_return
;
779 /* Then create the initial state of the dfa. */
780 err
= create_initial_state (dfa
);
782 /* Release work areas. */
783 free_workarea_compile (preg
);
784 re_string_destruct (®exp
);
786 if (BE (err
!= REG_NOERROR
, 0))
788 re_compile_internal_free_return
:
789 free_dfa_content (dfa
);
797 /* Initialize DFA. We use the length of the regular expression PAT_LEN
798 as the initial length of some arrays. */
801 init_dfa (dfa
, pat_len
)
807 memset (dfa
, '\0', sizeof (re_dfa_t
));
809 dfa
->nodes_alloc
= pat_len
+ 1;
810 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
812 dfa
->states_alloc
= pat_len
+ 1;
814 /* table_size = 2 ^ ceil(log pat_len) */
815 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
816 if (table_size
> pat_len
)
819 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
820 dfa
->state_hash_mask
= table_size
- 1;
822 dfa
->subexps_alloc
= 1;
823 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
824 dfa
->word_char
= NULL
;
826 dfa
->mb_cur_max
= MB_CUR_MAX
;
828 if (dfa
->mb_cur_max
> 1
829 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
831 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
835 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
836 || dfa
->subexps
== NULL
, 0))
838 /* We don't bother to free anything which was allocated. Very
839 soon the process will go down anyway. */
841 dfa
->state_table
= NULL
;
848 /* Initialize WORD_CHAR table, which indicate which character is
849 "word". In this case "word" means that it is the word construction
850 character used by some operators like "\<", "\>", etc. */
857 dfa
->word_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
858 if (BE (dfa
->word_char
== NULL
, 0))
860 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
861 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
862 if (isalnum (ch
) || ch
== '_')
863 dfa
->word_char
[i
] |= 1 << j
;
867 /* Free the work area which are only used while compiling. */
870 free_workarea_compile (preg
)
873 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
874 free_bin_tree (dfa
->str_tree
);
875 dfa
->str_tree
= NULL
;
876 re_free (dfa
->org_indices
);
877 dfa
->org_indices
= NULL
;
880 /* Create initial states for all contexts. */
883 create_initial_state (dfa
)
888 re_node_set init_nodes
;
890 /* Initial states have the epsilon closure of the node which is
891 the first node of the regular expression. */
892 first
= dfa
->str_tree
->first
;
893 dfa
->init_node
= first
;
894 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
895 if (BE (err
!= REG_NOERROR
, 0))
898 /* The back-references which are in initial states can epsilon transit,
899 since in this case all of the subexpressions can be null.
900 Then we add epsilon closures of the nodes which are the next nodes of
901 the back-references. */
902 if (dfa
->nbackref
> 0)
903 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
905 int node_idx
= init_nodes
.elems
[i
];
906 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
909 if (type
!= OP_BACK_REF
)
911 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
913 re_token_t
*clexp_node
;
914 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
915 if (clexp_node
->type
== OP_CLOSE_SUBEXP
916 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[node_idx
].opr
.idx
)
919 if (clexp_idx
== init_nodes
.nelem
)
922 if (type
== OP_BACK_REF
)
924 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
925 if (!re_node_set_contains (&init_nodes
, dest_idx
))
927 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
933 /* It must be the first time to invoke acquire_state. */
934 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
935 /* We don't check ERR here, since the initial state must not be NULL. */
936 if (BE (dfa
->init_state
== NULL
, 0))
938 if (dfa
->init_state
->has_constraint
)
940 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
942 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
944 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
948 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
949 || dfa
->init_state_begbuf
== NULL
, 0))
953 dfa
->init_state_word
= dfa
->init_state_nl
954 = dfa
->init_state_begbuf
= dfa
->init_state
;
956 re_node_set_free (&init_nodes
);
960 #ifdef RE_ENABLE_I18N
961 /* If it is possible to do searching in single byte encoding instead of UTF-8
962 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
963 DFA nodes where needed. */
969 int node
, i
, mb_chars
= 0, has_period
= 0;
971 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
972 switch (dfa
->nodes
[node
].type
)
975 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
979 switch (dfa
->nodes
[node
].opr
.idx
)
987 /* Word anchors etc. cannot be handled. */
997 case OP_DUP_ASTERISK
:
998 case OP_DUP_QUESTION
:
1000 case OP_OPEN_SUBEXP
:
1001 case OP_CLOSE_SUBEXP
:
1003 case SIMPLE_BRACKET
:
1004 /* Just double check. */
1005 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1006 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1013 if (mb_chars
|| has_period
)
1014 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1016 if (dfa
->nodes
[node
].type
== CHARACTER
1017 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1018 dfa
->nodes
[node
].mb_partial
= 0;
1019 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1020 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1023 /* The search can be in single byte locale. */
1024 dfa
->mb_cur_max
= 1;
1026 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1030 /* Analyze the structure tree, and calculate "first", "next", "edest",
1031 "eclosure", and "inveclosure". */
1033 static reg_errcode_t
1040 /* Allocate arrays. */
1041 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1042 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1043 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1044 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1045 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1046 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1047 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
1049 /* Initialize them. */
1050 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
1053 re_node_set_init_empty (dfa
->edests
+ i
);
1054 re_node_set_init_empty (dfa
->eclosures
+ i
);
1055 re_node_set_init_empty (dfa
->inveclosures
+ i
);
1058 ret
= analyze_tree (dfa
, dfa
->str_tree
);
1059 if (BE (ret
== REG_NOERROR
, 1))
1061 ret
= calc_eclosure (dfa
);
1062 if (ret
== REG_NOERROR
)
1063 calc_inveclosure (dfa
);
1068 /* Helper functions for analyze.
1069 This function calculate "first", "next", and "edest" for the subtree
1070 whose root is NODE. */
1072 static reg_errcode_t
1073 analyze_tree (dfa
, node
)
1078 if (node
->first
== -1)
1079 calc_first (dfa
, node
);
1080 if (node
->next
== -1)
1081 calc_next (dfa
, node
);
1082 if (node
->eclosure
.nelem
== 0)
1083 calc_epsdest (dfa
, node
);
1084 /* Calculate "first" etc. for the left child. */
1085 if (node
->left
!= NULL
)
1087 ret
= analyze_tree (dfa
, node
->left
);
1088 if (BE (ret
!= REG_NOERROR
, 0))
1091 /* Calculate "first" etc. for the right child. */
1092 if (node
->right
!= NULL
)
1094 ret
= analyze_tree (dfa
, node
->right
);
1095 if (BE (ret
!= REG_NOERROR
, 0))
1101 /* Calculate "first" for the node NODE. */
1103 calc_first (dfa
, node
)
1108 idx
= node
->node_idx
;
1109 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1114 case OP_OPEN_BRACKET
:
1115 case OP_CLOSE_BRACKET
:
1116 case OP_OPEN_DUP_NUM
:
1117 case OP_CLOSE_DUP_NUM
:
1118 case OP_NON_MATCH_LIST
:
1119 case OP_OPEN_COLL_ELEM
:
1120 case OP_CLOSE_COLL_ELEM
:
1121 case OP_OPEN_EQUIV_CLASS
:
1122 case OP_CLOSE_EQUIV_CLASS
:
1123 case OP_OPEN_CHAR_CLASS
:
1124 case OP_CLOSE_CHAR_CLASS
:
1125 /* These must not be appeared here. */
1131 case OP_DUP_ASTERISK
:
1132 case OP_DUP_QUESTION
:
1133 #ifdef RE_ENABLE_I18N
1134 case OP_UTF8_PERIOD
:
1135 case COMPLEX_BRACKET
:
1136 #endif /* RE_ENABLE_I18N */
1137 case SIMPLE_BRACKET
:
1140 case OP_OPEN_SUBEXP
:
1141 case OP_CLOSE_SUBEXP
:
1146 assert (node
->left
!= NULL
);
1148 if (node
->left
->first
== -1)
1149 calc_first (dfa
, node
->left
);
1150 node
->first
= node
->left
->first
;
1155 /* else fall through */
1158 assert (node
->left
!= NULL
);
1160 if (node
->left
->first
== -1)
1161 calc_first (dfa
, node
->left
);
1162 node
->first
= node
->left
->first
;
1167 /* Calculate "next" for the node NODE. */
1170 calc_next (dfa
, node
)
1175 bin_tree_t
*parent
= node
->parent
;
1179 idx
= node
->node_idx
;
1180 if (node
->type
== 0)
1181 dfa
->nexts
[idx
] = node
->next
;
1185 idx
= parent
->node_idx
;
1186 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1190 case OP_DUP_ASTERISK
:
1195 if (parent
->left
== node
)
1197 if (parent
->right
->first
== -1)
1198 calc_first (dfa
, parent
->right
);
1199 node
->next
= parent
->right
->first
;
1202 /* else fall through */
1204 if (parent
->next
== -1)
1205 calc_next (dfa
, parent
);
1206 node
->next
= parent
->next
;
1209 idx
= node
->node_idx
;
1210 if (node
->type
== 0)
1211 dfa
->nexts
[idx
] = node
->next
;
1214 /* Calculate "edest" for the node NODE. */
1217 calc_epsdest (dfa
, node
)
1222 idx
= node
->node_idx
;
1223 if (node
->type
== 0)
1225 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1226 || dfa
->nodes
[idx
].type
== OP_DUP_PLUS
1227 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1229 if (node
->left
->first
== -1)
1230 calc_first (dfa
, node
->left
);
1231 if (node
->next
== -1)
1232 calc_next (dfa
, node
);
1233 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1236 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1239 if (node
->left
!= NULL
)
1241 if (node
->left
->first
== -1)
1242 calc_first (dfa
, node
->left
);
1243 left
= node
->left
->first
;
1247 if (node
->next
== -1)
1248 calc_next (dfa
, node
);
1251 if (node
->right
!= NULL
)
1253 if (node
->right
->first
== -1)
1254 calc_first (dfa
, node
->right
);
1255 right
= node
->right
->first
;
1259 if (node
->next
== -1)
1260 calc_next (dfa
, node
);
1263 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1265 else if (dfa
->nodes
[idx
].type
== ANCHOR
1266 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1267 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1268 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1269 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1271 assert (!IS_EPSILON_NODE (dfa
->nodes
[idx
].type
));
1275 /* Duplicate the epsilon closure of the node ROOT_NODE.
1276 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1277 to their own constraint. */
1279 static reg_errcode_t
1280 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1283 int top_org_node
, top_clone_node
, root_node
;
1284 unsigned int init_constraint
;
1287 int org_node
, clone_node
, ret
;
1288 unsigned int constraint
= init_constraint
;
1289 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1291 int org_dest
, clone_dest
;
1292 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1294 /* If the back reference epsilon-transit, its destination must
1295 also have the constraint. Then duplicate the epsilon closure
1296 of the destination of the back reference, and store it in
1297 edests of the back reference. */
1298 org_dest
= dfa
->nexts
[org_node
];
1299 re_node_set_empty (dfa
->edests
+ clone_node
);
1300 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1301 if (BE (err
!= REG_NOERROR
, 0))
1303 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1304 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1305 if (BE (ret
< 0, 0))
1308 else if (dfa
->edests
[org_node
].nelem
== 0)
1310 /* In case of the node can't epsilon-transit, don't duplicate the
1311 destination and store the original destination as the
1312 destination of the node. */
1313 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1316 else if (dfa
->edests
[org_node
].nelem
== 1)
1318 /* In case of the node can epsilon-transit, and it has only one
1320 org_dest
= dfa
->edests
[org_node
].elems
[0];
1321 re_node_set_empty (dfa
->edests
+ clone_node
);
1322 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1324 /* In case of the node has another constraint, append it. */
1325 if (org_node
== root_node
&& clone_node
!= org_node
)
1327 /* ...but if the node is root_node itself, it means the
1328 epsilon closure have a loop, then tie it to the
1329 destination of the root_node. */
1330 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1332 if (BE (ret
< 0, 0))
1336 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1338 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1339 if (BE (err
!= REG_NOERROR
, 0))
1341 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1342 if (BE (ret
< 0, 0))
1345 else /* dfa->edests[org_node].nelem == 2 */
1347 /* In case of the node can epsilon-transit, and it has two
1348 destinations. E.g. '|', '*', '+', '?'. */
1349 org_dest
= dfa
->edests
[org_node
].elems
[0];
1350 re_node_set_empty (dfa
->edests
+ clone_node
);
1351 /* Search for a duplicated node which satisfies the constraint. */
1352 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1353 if (clone_dest
== -1)
1355 /* There are no such a duplicated node, create a new one. */
1356 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1357 if (BE (err
!= REG_NOERROR
, 0))
1359 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1360 if (BE (ret
< 0, 0))
1362 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1363 root_node
, constraint
);
1364 if (BE (err
!= REG_NOERROR
, 0))
1369 /* There are a duplicated node which satisfy the constraint,
1370 use it to avoid infinite loop. */
1371 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1372 if (BE (ret
< 0, 0))
1376 org_dest
= dfa
->edests
[org_node
].elems
[1];
1377 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1378 if (BE (err
!= REG_NOERROR
, 0))
1380 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1381 if (BE (ret
< 0, 0))
1384 org_node
= org_dest
;
1385 clone_node
= clone_dest
;
1390 /* Search for a node which is duplicated from the node ORG_NODE, and
1391 satisfies the constraint CONSTRAINT. */
1394 search_duplicated_node (dfa
, org_node
, constraint
)
1397 unsigned int constraint
;
1400 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1402 if (org_node
== dfa
->org_indices
[idx
]
1403 && constraint
== dfa
->nodes
[idx
].constraint
)
1404 return idx
; /* Found. */
1406 return -1; /* Not found. */
1409 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1410 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1411 otherwise return the error code. */
1413 static reg_errcode_t
1414 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1416 int *new_idx
, org_idx
;
1417 unsigned int constraint
;
1422 dup
= dfa
->nodes
[org_idx
];
1423 dup_idx
= re_dfa_add_node (dfa
, dup
, 1);
1424 if (BE (dup_idx
== -1, 0))
1426 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1427 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1428 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1429 dfa
->nodes
[dup_idx
].duplicated
= 1;
1430 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1431 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1432 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1434 /* Store the index of the original node. */
1435 dfa
->org_indices
[dup_idx
] = org_idx
;
1441 calc_inveclosure (dfa
)
1445 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1447 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1449 dest
= dfa
->eclosures
[src
].elems
[idx
];
1450 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1455 /* Calculate "eclosure" for all the node in DFA. */
1457 static reg_errcode_t
1461 int node_idx
, incomplete
;
1463 assert (dfa
->nodes_len
> 0);
1466 /* For each nodes, calculate epsilon closure. */
1467 for (node_idx
= 0; ; ++node_idx
)
1470 re_node_set eclosure_elem
;
1471 if (node_idx
== dfa
->nodes_len
)
1480 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1482 /* If we have already calculated, skip it. */
1483 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1485 /* Calculate epsilon closure of `node_idx'. */
1486 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1487 if (BE (err
!= REG_NOERROR
, 0))
1490 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1493 re_node_set_free (&eclosure_elem
);
1499 /* Calculate epsilon closure of NODE. */
1501 static reg_errcode_t
1502 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1503 re_node_set
*new_set
;
1508 unsigned int constraint
;
1510 re_node_set eclosure
;
1512 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1513 if (BE (err
!= REG_NOERROR
, 0))
1516 /* This indicates that we are calculating this node now.
1517 We reference this value to avoid infinite loop. */
1518 dfa
->eclosures
[node
].nelem
= -1;
1520 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1521 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1522 /* If the current node has constraints, duplicate all nodes.
1523 Since they must inherit the constraints. */
1524 if (constraint
&& !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1526 int org_node
, cur_node
;
1527 org_node
= cur_node
= node
;
1528 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1529 if (BE (err
!= REG_NOERROR
, 0))
1533 /* Expand each epsilon destination nodes. */
1534 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1535 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1537 re_node_set eclosure_elem
;
1538 int edest
= dfa
->edests
[node
].elems
[i
];
1539 /* If calculating the epsilon closure of `edest' is in progress,
1540 return intermediate result. */
1541 if (dfa
->eclosures
[edest
].nelem
== -1)
1546 /* If we haven't calculated the epsilon closure of `edest' yet,
1547 calculate now. Otherwise use calculated epsilon closure. */
1548 if (dfa
->eclosures
[edest
].nelem
== 0)
1550 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1551 if (BE (err
!= REG_NOERROR
, 0))
1555 eclosure_elem
= dfa
->eclosures
[edest
];
1556 /* Merge the epsilon closure of `edest'. */
1557 re_node_set_merge (&eclosure
, &eclosure_elem
);
1558 /* If the epsilon closure of `edest' is incomplete,
1559 the epsilon closure of this node is also incomplete. */
1560 if (dfa
->eclosures
[edest
].nelem
== 0)
1563 re_node_set_free (&eclosure_elem
);
1567 /* Epsilon closures include itself. */
1568 re_node_set_insert (&eclosure
, node
);
1569 if (incomplete
&& !root
)
1570 dfa
->eclosures
[node
].nelem
= 0;
1572 dfa
->eclosures
[node
] = eclosure
;
1573 *new_set
= eclosure
;
1577 /* Functions for token which are used in the parser. */
1579 /* Fetch a token from INPUT.
1580 We must not use this function inside bracket expressions. */
1583 fetch_token (input
, syntax
)
1585 reg_syntax_t syntax
;
1589 consumed_byte
= peek_token (&token
, input
, syntax
);
1590 re_string_skip_bytes (input
, consumed_byte
);
1594 /* Peek a token from INPUT, and return the length of the token.
1595 We must not use this function inside bracket expressions. */
1598 peek_token (token
, input
, syntax
)
1601 reg_syntax_t syntax
;
1605 if (re_string_eoi (input
))
1607 token
->type
= END_OF_RE
;
1611 c
= re_string_peek_byte (input
, 0);
1614 #ifdef RE_ENABLE_I18N
1615 token
->mb_partial
= 0;
1616 if (input
->mb_cur_max
> 1 &&
1617 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1619 token
->type
= CHARACTER
;
1620 token
->mb_partial
= 1;
1627 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1629 token
->type
= BACK_SLASH
;
1633 c2
= re_string_peek_byte_case (input
, 1);
1635 token
->type
= CHARACTER
;
1639 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1640 token
->type
= OP_ALT
;
1642 case '1': case '2': case '3': case '4': case '5':
1643 case '6': case '7': case '8': case '9':
1644 if (!(syntax
& RE_NO_BK_REFS
))
1646 token
->type
= OP_BACK_REF
;
1647 token
->opr
.idx
= c2
- '0';
1651 if (!(syntax
& RE_NO_GNU_OPS
))
1653 token
->type
= ANCHOR
;
1654 token
->opr
.idx
= WORD_FIRST
;
1658 if (!(syntax
& RE_NO_GNU_OPS
))
1660 token
->type
= ANCHOR
;
1661 token
->opr
.idx
= WORD_LAST
;
1665 if (!(syntax
& RE_NO_GNU_OPS
))
1667 token
->type
= ANCHOR
;
1668 token
->opr
.idx
= WORD_DELIM
;
1672 if (!(syntax
& RE_NO_GNU_OPS
))
1674 token
->type
= ANCHOR
;
1675 token
->opr
.idx
= INSIDE_WORD
;
1679 if (!(syntax
& RE_NO_GNU_OPS
))
1680 token
->type
= OP_WORD
;
1683 if (!(syntax
& RE_NO_GNU_OPS
))
1684 token
->type
= OP_NOTWORD
;
1687 if (!(syntax
& RE_NO_GNU_OPS
))
1688 token
->type
= OP_SPACE
;
1691 if (!(syntax
& RE_NO_GNU_OPS
))
1692 token
->type
= OP_NOTSPACE
;
1695 if (!(syntax
& RE_NO_GNU_OPS
))
1697 token
->type
= ANCHOR
;
1698 token
->opr
.idx
= BUF_FIRST
;
1702 if (!(syntax
& RE_NO_GNU_OPS
))
1704 token
->type
= ANCHOR
;
1705 token
->opr
.idx
= BUF_LAST
;
1709 if (!(syntax
& RE_NO_BK_PARENS
))
1710 token
->type
= OP_OPEN_SUBEXP
;
1713 if (!(syntax
& RE_NO_BK_PARENS
))
1714 token
->type
= OP_CLOSE_SUBEXP
;
1717 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1718 token
->type
= OP_DUP_PLUS
;
1721 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1722 token
->type
= OP_DUP_QUESTION
;
1725 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1726 token
->type
= OP_OPEN_DUP_NUM
;
1729 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1730 token
->type
= OP_CLOSE_DUP_NUM
;
1738 token
->type
= CHARACTER
;
1742 if (syntax
& RE_NEWLINE_ALT
)
1743 token
->type
= OP_ALT
;
1746 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1747 token
->type
= OP_ALT
;
1750 token
->type
= OP_DUP_ASTERISK
;
1753 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1754 token
->type
= OP_DUP_PLUS
;
1757 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1758 token
->type
= OP_DUP_QUESTION
;
1761 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1762 token
->type
= OP_OPEN_DUP_NUM
;
1765 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1766 token
->type
= OP_CLOSE_DUP_NUM
;
1769 if (syntax
& RE_NO_BK_PARENS
)
1770 token
->type
= OP_OPEN_SUBEXP
;
1773 if (syntax
& RE_NO_BK_PARENS
)
1774 token
->type
= OP_CLOSE_SUBEXP
;
1777 token
->type
= OP_OPEN_BRACKET
;
1780 token
->type
= OP_PERIOD
;
1783 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1784 re_string_cur_idx (input
) != 0)
1786 char prev
= re_string_peek_byte (input
, -1);
1787 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1790 token
->type
= ANCHOR
;
1791 token
->opr
.idx
= LINE_FIRST
;
1794 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1795 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1798 re_string_skip_bytes (input
, 1);
1799 peek_token (&next
, input
, syntax
);
1800 re_string_skip_bytes (input
, -1);
1801 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1804 token
->type
= ANCHOR
;
1805 token
->opr
.idx
= LINE_LAST
;
1813 /* Peek a token from INPUT, and return the length of the token.
1814 We must not use this function out of bracket expressions. */
1817 peek_token_bracket (token
, input
, syntax
)
1820 reg_syntax_t syntax
;
1823 if (re_string_eoi (input
))
1825 token
->type
= END_OF_RE
;
1828 c
= re_string_peek_byte (input
, 0);
1831 #ifdef RE_ENABLE_I18N
1832 if (input
->mb_cur_max
> 1 &&
1833 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1835 token
->type
= CHARACTER
;
1838 #endif /* RE_ENABLE_I18N */
1840 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1842 /* In this case, '\' escape a character. */
1844 re_string_skip_bytes (input
, 1);
1845 c2
= re_string_peek_byte (input
, 0);
1847 token
->type
= CHARACTER
;
1850 if (c
== '[') /* '[' is a special char in a bracket exps. */
1854 c2
= re_string_peek_byte (input
, 1);
1860 token
->type
= OP_OPEN_COLL_ELEM
;
1863 token
->type
= OP_OPEN_EQUIV_CLASS
;
1866 if (syntax
& RE_CHAR_CLASSES
)
1868 token
->type
= OP_OPEN_CHAR_CLASS
;
1871 /* else fall through. */
1873 token
->type
= CHARACTER
;
1883 token
->type
= OP_CHARSET_RANGE
;
1886 token
->type
= OP_CLOSE_BRACKET
;
1889 token
->type
= OP_NON_MATCH_LIST
;
1892 token
->type
= CHARACTER
;
1897 /* Functions for parser. */
1899 /* Entry point of the parser.
1900 Parse the regular expression REGEXP and return the structure tree.
1901 If an error is occured, ERR is set by error code, and return NULL.
1902 This function build the following tree, from regular expression <reg_exp>:
1908 CAT means concatenation.
1909 EOR means end of regular expression. */
1912 parse (regexp
, preg
, syntax
, err
)
1913 re_string_t
*regexp
;
1915 reg_syntax_t syntax
;
1918 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1919 bin_tree_t
*tree
, *eor
, *root
;
1920 re_token_t current_token
;
1922 current_token
= fetch_token (regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
1923 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1924 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1926 new_idx
= re_dfa_add_node (dfa
, current_token
, 0);
1927 eor
= create_tree (NULL
, NULL
, 0, new_idx
);
1929 root
= create_tree (tree
, eor
, CONCAT
, 0);
1932 if (BE (new_idx
== -1 || eor
== NULL
|| root
== NULL
, 0))
1940 /* This function build the following tree, from regular expression
1941 <branch1>|<branch2>:
1947 ALT means alternative, which represents the operator `|'. */
1950 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
1951 re_string_t
*regexp
;
1954 reg_syntax_t syntax
;
1958 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1959 bin_tree_t
*tree
, *branch
= NULL
;
1961 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1962 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1965 while (token
->type
== OP_ALT
)
1967 re_token_t alt_token
= *token
;
1968 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
1969 *token
= fetch_token (regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
1970 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
1971 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
1973 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
1974 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
1976 free_bin_tree (tree
);
1982 tree
= create_tree (tree
, branch
, 0, new_idx
);
1983 if (BE (new_idx
== -1 || tree
== NULL
, 0))
1988 dfa
->has_plural_match
= 1;
1993 /* This function build the following tree, from regular expression
2000 CAT means concatenation. */
2003 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2004 re_string_t
*regexp
;
2007 reg_syntax_t syntax
;
2011 bin_tree_t
*tree
, *exp
;
2012 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2013 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2016 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2017 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2019 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2020 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2022 free_bin_tree (tree
);
2025 if (tree
!= NULL
&& exp
!= NULL
)
2027 tree
= create_tree (tree
, exp
, CONCAT
, 0);
2034 else if (tree
== NULL
)
2036 /* Otherwise exp == NULL, we don't need to create new tree. */
2041 /* This function build the following tree, from regular expression a*:
2048 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2049 re_string_t
*regexp
;
2052 reg_syntax_t syntax
;
2056 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2059 switch (token
->type
)
2062 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2063 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2064 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2069 #ifdef RE_ENABLE_I18N
2070 if (dfa
->mb_cur_max
> 1)
2072 while (!re_string_eoi (regexp
)
2073 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2075 bin_tree_t
*mbc_remain
;
2076 *token
= fetch_token (regexp
, syntax
);
2077 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2078 mbc_remain
= create_tree (NULL
, NULL
, 0, new_idx
);
2079 tree
= create_tree (tree
, mbc_remain
, CONCAT
, 0);
2080 if (BE (new_idx
== -1 || mbc_remain
== NULL
|| tree
== NULL
, 0))
2089 case OP_OPEN_SUBEXP
:
2090 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2091 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2094 case OP_OPEN_BRACKET
:
2095 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2096 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2100 if (BE (preg
->re_nsub
< token
->opr
.idx
2101 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
2106 dfa
->used_bkref_map
|= 1 << (token
->opr
.idx
- 1);
2107 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2108 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2109 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2115 dfa
->has_mb_node
= 1;
2117 case OP_OPEN_DUP_NUM
:
2118 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2124 case OP_DUP_ASTERISK
:
2126 case OP_DUP_QUESTION
:
2127 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2132 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2134 *token
= fetch_token (regexp
, syntax
);
2135 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2137 /* else fall through */
2138 case OP_CLOSE_SUBEXP
:
2139 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2140 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2145 /* else fall through */
2146 case OP_CLOSE_DUP_NUM
:
2147 /* We treat it as a normal character. */
2149 /* Then we can these characters as normal characters. */
2150 token
->type
= CHARACTER
;
2151 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2152 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2153 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2160 if (dfa
->word_char
== NULL
)
2162 *err
= init_word_char (dfa
);
2163 if (BE (*err
!= REG_NOERROR
, 0))
2166 if (token
->opr
.ctx_type
== WORD_DELIM
)
2168 bin_tree_t
*tree_first
, *tree_last
;
2169 int idx_first
, idx_last
;
2170 token
->opr
.ctx_type
= WORD_FIRST
;
2171 idx_first
= re_dfa_add_node (dfa
, *token
, 0);
2172 tree_first
= create_tree (NULL
, NULL
, 0, idx_first
);
2173 token
->opr
.ctx_type
= WORD_LAST
;
2174 idx_last
= re_dfa_add_node (dfa
, *token
, 0);
2175 tree_last
= create_tree (NULL
, NULL
, 0, idx_last
);
2176 token
->type
= OP_ALT
;
2177 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2178 tree
= create_tree (tree_first
, tree_last
, 0, new_idx
);
2179 if (BE (idx_first
== -1 || idx_last
== -1 || new_idx
== -1
2180 || tree_first
== NULL
|| tree_last
== NULL
2181 || tree
== NULL
, 0))
2189 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2190 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2191 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2197 /* We must return here, since ANCHORs can't be followed
2198 by repetition operators.
2199 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2200 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2201 *token
= fetch_token (regexp
, syntax
);
2204 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2205 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
2206 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2211 if (dfa
->mb_cur_max
> 1)
2212 dfa
->has_mb_node
= 1;
2215 tree
= build_charclass_op (dfa
, regexp
->trans
, "alnum", "_", 0, err
);
2216 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2220 tree
= build_charclass_op (dfa
, regexp
->trans
, "alnum", "_", 1, err
);
2221 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2225 tree
= build_charclass_op (dfa
, regexp
->trans
, "space", "", 0, err
);
2226 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2230 tree
= build_charclass_op (dfa
, regexp
->trans
, "space", "", 1, err
);
2231 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2241 /* Must not happen? */
2247 *token
= fetch_token (regexp
, syntax
);
2249 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2250 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2252 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2253 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2255 /* In BRE consecutive duplications are not allowed. */
2256 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2257 && (token
->type
== OP_DUP_ASTERISK
2258 || token
->type
== OP_OPEN_DUP_NUM
))
2263 dfa
->has_plural_match
= 1;
2269 /* This function build the following tree, from regular expression
2277 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2278 re_string_t
*regexp
;
2281 reg_syntax_t syntax
;
2285 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2286 bin_tree_t
*tree
, *left_par
, *right_par
;
2289 cur_nsub
= preg
->re_nsub
++;
2290 if (dfa
->subexps_alloc
< preg
->re_nsub
)
2292 re_subexp_t
*new_array
;
2293 dfa
->subexps_alloc
*= 2;
2294 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2295 if (BE (new_array
== NULL
, 0))
2297 dfa
->subexps_alloc
/= 2;
2301 dfa
->subexps
= new_array
;
2303 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2304 dfa
->subexps
[cur_nsub
].end
= -1;
2306 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2307 left_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2308 if (BE (new_idx
== -1 || left_par
== NULL
, 0))
2313 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2314 *token
= fetch_token (regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2316 /* The subexpression may be a null string. */
2317 if (token
->type
== OP_CLOSE_SUBEXP
)
2321 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2322 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2325 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2327 free_bin_tree (tree
);
2331 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2332 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2333 right_par
= create_tree (NULL
, NULL
, 0, new_idx
);
2334 tree
= ((tree
== NULL
) ? right_par
2335 : create_tree (tree
, right_par
, CONCAT
, 0));
2336 tree
= create_tree (left_par
, tree
, CONCAT
, 0);
2337 if (BE (new_idx
== -1 || right_par
== NULL
|| tree
== NULL
, 0))
2342 dfa
->nodes
[new_idx
].opr
.idx
= cur_nsub
;
2347 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2350 parse_dup_op (dup_elem
, regexp
, dfa
, token
, syntax
, err
)
2351 bin_tree_t
*dup_elem
;
2352 re_string_t
*regexp
;
2355 reg_syntax_t syntax
;
2358 re_token_t dup_token
;
2359 bin_tree_t
*tree
= dup_elem
, *work_tree
;
2360 int new_idx
, start_idx
= re_string_cur_idx (regexp
);
2361 re_token_t start_token
= *token
;
2362 if (token
->type
== OP_OPEN_DUP_NUM
)
2366 int start
= fetch_number (regexp
, token
, syntax
);
2370 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2371 start
= 0; /* We treat "{,m}" as "{0,m}". */
2374 *err
= REG_BADBR
; /* <re>{} is invalid. */
2378 if (BE (start
!= -2, 1))
2380 /* We treat "{n}" as "{n,n}". */
2381 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2382 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2383 ? fetch_number (regexp
, token
, syntax
) : -2));
2385 if (BE (start
== -2 || end
== -2, 0))
2387 /* Invalid sequence. */
2388 if (token
->type
== OP_CLOSE_DUP_NUM
)
2389 goto parse_dup_op_invalid_interval
;
2391 goto parse_dup_op_ebrace
;
2393 if (BE (start
== 0 && end
== 0, 0))
2395 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2396 *token
= fetch_token (regexp
, syntax
);
2397 free_bin_tree (dup_elem
);
2401 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2403 for (i
= 0; i
< start
; ++i
)
2406 work_tree
= duplicate_tree (elem
, dfa
);
2407 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2408 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2409 goto parse_dup_op_espace
;
2414 /* We treat "<re>{0,}" as "<re>*". */
2415 dup_token
.type
= OP_DUP_ASTERISK
;
2418 elem
= duplicate_tree (elem
, dfa
);
2419 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2420 work_tree
= create_tree (elem
, NULL
, 0, new_idx
);
2421 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2422 if (BE (elem
== NULL
|| new_idx
== -1 || work_tree
== NULL
2423 || tree
== NULL
, 0))
2424 goto parse_dup_op_espace
;
2428 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2429 tree
= create_tree (elem
, NULL
, 0, new_idx
);
2430 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2431 goto parse_dup_op_espace
;
2434 else if (BE (start
> end
, 0))
2436 /* First number greater than first. */
2440 else if (end
- start
> 0)
2442 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2443 dup_token
.type
= OP_DUP_QUESTION
;
2446 elem
= duplicate_tree (elem
, dfa
);
2447 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2448 elem
= create_tree (elem
, NULL
, 0, new_idx
);
2449 tree
= create_tree (tree
, elem
, CONCAT
, 0);
2450 if (BE (elem
== NULL
|| new_idx
== -1 || tree
== NULL
, 0))
2451 goto parse_dup_op_espace
;
2455 new_idx
= re_dfa_add_node (dfa
, dup_token
, 0);
2456 tree
= elem
= create_tree (elem
, NULL
, 0, new_idx
);
2457 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2458 goto parse_dup_op_espace
;
2460 for (i
= 1; i
< end
- start
; ++i
)
2462 work_tree
= duplicate_tree (elem
, dfa
);
2463 tree
= create_tree (tree
, work_tree
, CONCAT
, 0);
2464 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2474 new_idx
= re_dfa_add_node (dfa
, *token
, 0);
2475 tree
= create_tree (tree
, NULL
, 0, new_idx
);
2476 if (BE (new_idx
== -1 || tree
== NULL
, 0))
2482 *token
= fetch_token (regexp
, syntax
);
2485 parse_dup_op_espace
:
2486 free_bin_tree (tree
);
2490 parse_dup_op_ebrace
:
2491 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2496 goto parse_dup_op_rollback
;
2497 parse_dup_op_invalid_interval
:
2498 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2503 parse_dup_op_rollback
:
2504 re_string_set_index (regexp
, start_idx
);
2505 *token
= start_token
;
2506 token
->type
= CHARACTER
;
2510 /* Size of the names for collating symbol/equivalence_class/character_class.
2511 I'm not sure, but maybe enough. */
2512 #define BRACKET_NAME_BUF_SIZE 32
2515 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2516 Build the range expression which starts from START_ELEM, and ends
2517 at END_ELEM. The result are written to MBCSET and SBCSET.
2518 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2519 mbcset->range_ends, is a pointer argument sinse we may
2522 static reg_errcode_t
2523 # ifdef RE_ENABLE_I18N
2524 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2525 re_charset_t
*mbcset
;
2527 # else /* not RE_ENABLE_I18N */
2528 build_range_exp (sbcset
, start_elem
, end_elem
)
2529 # endif /* not RE_ENABLE_I18N */
2530 re_bitset_ptr_t sbcset
;
2531 bracket_elem_t
*start_elem
, *end_elem
;
2533 unsigned int start_ch
, end_ch
;
2534 /* Equivalence Classes and Character Classes can't be a range start/end. */
2535 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2536 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2540 /* We can handle no multi character collating elements without libc
2542 if (BE ((start_elem
->type
== COLL_SYM
2543 && strlen ((char *) start_elem
->opr
.name
) > 1)
2544 || (end_elem
->type
== COLL_SYM
2545 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2546 return REG_ECOLLATE
;
2548 # ifdef RE_ENABLE_I18N
2550 wchar_t wc
, start_wc
, end_wc
;
2551 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2553 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2554 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2556 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2557 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2559 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2560 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2561 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2562 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2563 cmp_buf
[0] = start_wc
;
2564 cmp_buf
[4] = end_wc
;
2565 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2568 /* Check the space of the arrays. */
2569 if (*range_alloc
== mbcset
->nranges
)
2571 /* There are not enough space, need realloc. */
2572 wchar_t *new_array_start
, *new_array_end
;
2575 /* +1 in case of mbcset->nranges is 0. */
2576 new_nranges
= 2 * mbcset
->nranges
+ 1;
2577 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2578 are NULL if *range_alloc == 0. */
2579 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2581 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2584 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2587 mbcset
->range_starts
= new_array_start
;
2588 mbcset
->range_ends
= new_array_end
;
2589 *range_alloc
= new_nranges
;
2592 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2593 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2595 /* Build the table for single byte characters. */
2596 for (wc
= 0; wc
<= SBC_MAX
; ++wc
)
2599 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2600 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2601 bitset_set (sbcset
, wc
);
2604 # else /* not RE_ENABLE_I18N */
2607 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2608 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2610 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2611 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2613 if (start_ch
> end_ch
)
2615 /* Build the table for single byte characters. */
2616 for (ch
= 0; ch
<= SBC_MAX
; ++ch
)
2617 if (start_ch
<= ch
&& ch
<= end_ch
)
2618 bitset_set (sbcset
, ch
);
2620 # endif /* not RE_ENABLE_I18N */
2623 #endif /* not _LIBC */
2626 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2627 Build the collating element which is represented by NAME.
2628 The result are written to MBCSET and SBCSET.
2629 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2630 pointer argument since we may update it. */
2632 static reg_errcode_t
2633 # ifdef RE_ENABLE_I18N
2634 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2635 re_charset_t
*mbcset
;
2636 int *coll_sym_alloc
;
2637 # else /* not RE_ENABLE_I18N */
2638 build_collating_symbol (sbcset
, name
)
2639 # endif /* not RE_ENABLE_I18N */
2640 re_bitset_ptr_t sbcset
;
2641 const unsigned char *name
;
2643 size_t name_len
= strlen ((const char *) name
);
2644 if (BE (name_len
!= 1, 0))
2645 return REG_ECOLLATE
;
2648 bitset_set (sbcset
, name
[0]);
2652 #endif /* not _LIBC */
2654 /* This function parse bracket expression like "[abc]", "[a-c]",
2658 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2659 re_string_t
*regexp
;
2662 reg_syntax_t syntax
;
2666 const unsigned char *collseqmb
;
2667 const char *collseqwc
;
2670 const int32_t *symb_table
;
2671 const unsigned char *extra
;
2673 /* Local function for parse_bracket_exp used in _LIBC environement.
2674 Seek the collating symbol entry correspondings to NAME.
2675 Return the index of the symbol in the SYMB_TABLE. */
2677 static inline int32_t
2678 __attribute ((always_inline
))
2679 seek_collating_symbol_entry (name
, name_len
)
2680 const unsigned char *name
;
2683 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2684 int32_t elem
= hash
% table_size
;
2685 int32_t second
= hash
% (table_size
- 2);
2686 while (symb_table
[2 * elem
] != 0)
2688 /* First compare the hashing value. */
2689 if (symb_table
[2 * elem
] == hash
2690 /* Compare the length of the name. */
2691 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2692 /* Compare the name. */
2693 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2696 /* Yep, this is the entry. */
2706 /* Local function for parse_bracket_exp used in _LIBC environement.
2707 Look up the collation sequence value of BR_ELEM.
2708 Return the value if succeeded, UINT_MAX otherwise. */
2710 static inline unsigned int
2711 __attribute ((always_inline
))
2712 lookup_collation_sequence_value (br_elem
)
2713 bracket_elem_t
*br_elem
;
2715 if (br_elem
->type
== SB_CHAR
)
2718 if (MB_CUR_MAX == 1)
2721 return collseqmb
[br_elem
->opr
.ch
];
2724 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2725 return __collseq_table_lookup (collseqwc
, wc
);
2728 else if (br_elem
->type
== MB_CHAR
)
2730 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2732 else if (br_elem
->type
== COLL_SYM
)
2734 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2738 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2740 if (symb_table
[2 * elem
] != 0)
2742 /* We found the entry. */
2743 idx
= symb_table
[2 * elem
+ 1];
2744 /* Skip the name of collating element name. */
2745 idx
+= 1 + extra
[idx
];
2746 /* Skip the byte sequence of the collating element. */
2747 idx
+= 1 + extra
[idx
];
2748 /* Adjust for the alignment. */
2749 idx
= (idx
+ 3) & ~3;
2750 /* Skip the multibyte collation sequence value. */
2751 idx
+= sizeof (unsigned int);
2752 /* Skip the wide char sequence of the collating element. */
2753 idx
+= sizeof (unsigned int) *
2754 (1 + *(unsigned int *) (extra
+ idx
));
2755 /* Return the collation sequence value. */
2756 return *(unsigned int *) (extra
+ idx
);
2758 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2760 /* No valid character. Match it as a single byte
2762 return collseqmb
[br_elem
->opr
.name
[0]];
2765 else if (sym_name_len
== 1)
2766 return collseqmb
[br_elem
->opr
.name
[0]];
2771 /* Local function for parse_bracket_exp used in _LIBC environement.
2772 Build the range expression which starts from START_ELEM, and ends
2773 at END_ELEM. The result are written to MBCSET and SBCSET.
2774 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2775 mbcset->range_ends, is a pointer argument sinse we may
2778 static inline reg_errcode_t
2779 __attribute ((always_inline
))
2780 # ifdef RE_ENABLE_I18N
2781 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2782 re_charset_t
*mbcset
;
2784 # else /* not RE_ENABLE_I18N */
2785 build_range_exp (sbcset
, start_elem
, end_elem
)
2786 # endif /* not RE_ENABLE_I18N */
2787 re_bitset_ptr_t sbcset
;
2788 bracket_elem_t
*start_elem
, *end_elem
;
2791 uint32_t start_collseq
;
2792 uint32_t end_collseq
;
2794 # ifdef RE_ENABLE_I18N
2795 /* Check the space of the arrays. */
2796 if (*range_alloc
== mbcset
->nranges
)
2798 /* There are not enough space, need realloc. */
2799 uint32_t *new_array_start
;
2800 uint32_t *new_array_end
;
2803 /* +1 in case of mbcset->nranges is 0. */
2804 new_nranges
= 2 * mbcset
->nranges
+ 1;
2805 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2806 are NULL if *range_alloc == 0. */
2807 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2809 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2812 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2815 mbcset
->range_starts
= new_array_start
;
2816 mbcset
->range_ends
= new_array_end
;
2817 *range_alloc
= new_nranges
;
2819 # endif /* RE_ENABLE_I18N */
2821 /* Equivalence Classes and Character Classes can't be a range
2823 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2824 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2828 start_collseq
= lookup_collation_sequence_value (start_elem
);
2829 end_collseq
= lookup_collation_sequence_value (end_elem
);
2830 /* Check start/end collation sequence values. */
2831 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2832 return REG_ECOLLATE
;
2833 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2836 # ifdef RE_ENABLE_I18N
2837 /* Got valid collation sequence values, add them as a new entry. */
2838 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2839 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2840 # endif /* RE_ENABLE_I18N */
2842 /* Build the table for single byte characters. */
2843 for (ch
= 0; ch
<= SBC_MAX
; ch
++)
2845 uint32_t ch_collseq
;
2847 if (MB_CUR_MAX == 1)
2850 ch_collseq
= collseqmb
[ch
];
2852 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2853 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2854 bitset_set (sbcset
, ch
);
2859 /* Local function for parse_bracket_exp used in _LIBC environement.
2860 Build the collating element which is represented by NAME.
2861 The result are written to MBCSET and SBCSET.
2862 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2863 pointer argument sinse we may update it. */
2865 static inline reg_errcode_t
2866 __attribute ((always_inline
))
2867 # ifdef RE_ENABLE_I18N
2868 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2869 re_charset_t
*mbcset
;
2870 int *coll_sym_alloc
;
2871 # else /* not RE_ENABLE_I18N */
2872 build_collating_symbol (sbcset
, name
)
2873 # endif /* not RE_ENABLE_I18N */
2874 re_bitset_ptr_t sbcset
;
2875 const unsigned char *name
;
2878 size_t name_len
= strlen ((const char *) name
);
2881 elem
= seek_collating_symbol_entry (name
, name_len
);
2882 if (symb_table
[2 * elem
] != 0)
2884 /* We found the entry. */
2885 idx
= symb_table
[2 * elem
+ 1];
2886 /* Skip the name of collating element name. */
2887 idx
+= 1 + extra
[idx
];
2889 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2891 /* No valid character, treat it as a normal
2893 bitset_set (sbcset
, name
[0]);
2897 return REG_ECOLLATE
;
2899 # ifdef RE_ENABLE_I18N
2900 /* Got valid collation sequence, add it as a new entry. */
2901 /* Check the space of the arrays. */
2902 if (*coll_sym_alloc
== mbcset
->ncoll_syms
)
2904 /* Not enough, realloc it. */
2905 /* +1 in case of mbcset->ncoll_syms is 0. */
2906 *coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2907 /* Use realloc since mbcset->coll_syms is NULL
2909 mbcset
->coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2911 if (BE (mbcset
->coll_syms
== NULL
, 0))
2914 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2915 # endif /* RE_ENABLE_I18N */
2920 if (BE (name_len
!= 1, 0))
2921 return REG_ECOLLATE
;
2924 bitset_set (sbcset
, name
[0]);
2931 re_token_t br_token
;
2932 re_bitset_ptr_t sbcset
;
2933 #ifdef RE_ENABLE_I18N
2934 re_charset_t
*mbcset
;
2935 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2936 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2937 #else /* not RE_ENABLE_I18N */
2939 #endif /* not RE_ENABLE_I18N */
2940 bin_tree_t
*work_tree
;
2941 int token_len
, new_idx
;
2943 collseqmb
= (const unsigned char *)
2944 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2945 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2951 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2952 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2953 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2954 _NL_COLLATE_SYMB_TABLEMB
);
2955 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
2956 _NL_COLLATE_SYMB_EXTRAMB
);
2959 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
2960 #ifdef RE_ENABLE_I18N
2961 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
2962 #endif /* RE_ENABLE_I18N */
2963 #ifdef RE_ENABLE_I18N
2964 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
2966 if (BE (sbcset
== NULL
, 0))
2967 #endif /* RE_ENABLE_I18N */
2973 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2974 if (BE (token
->type
== END_OF_RE
, 0))
2977 goto parse_bracket_exp_free_return
;
2979 if (token
->type
== OP_NON_MATCH_LIST
)
2981 #ifdef RE_ENABLE_I18N
2983 mbcset
->non_match
= 1;
2984 #else /* not RE_ENABLE_I18N */
2986 #endif /* not RE_ENABLE_I18N */
2987 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
2988 bitset_set (sbcset
, '\0');
2989 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
2990 token_len
= peek_token_bracket (token
, regexp
, syntax
);
2991 if (BE (token
->type
== END_OF_RE
, 0))
2994 goto parse_bracket_exp_free_return
;
2996 #ifdef RE_ENABLE_I18N
2997 if (dfa
->mb_cur_max
> 1)
2998 for (i
= 0; i
< SBC_MAX
; ++i
)
2999 if (__btowc (i
) == WEOF
)
3000 bitset_set (sbcset
, i
);
3001 #endif /* RE_ENABLE_I18N */
3004 /* We treat the first ']' as a normal character. */
3005 if (token
->type
== OP_CLOSE_BRACKET
)
3006 token
->type
= CHARACTER
;
3008 int first_round
= 1;
3011 bracket_elem_t start_elem
, end_elem
;
3012 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3013 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3015 int token_len2
= 0, is_range_exp
= 0;
3018 start_elem
.opr
.name
= start_name_buf
;
3019 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3020 syntax
, first_round
);
3021 if (BE (ret
!= REG_NOERROR
, 0))
3024 goto parse_bracket_exp_free_return
;
3028 /* Get information about the next token. We need it in any case. */
3029 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3031 /* Do not check for ranges if we know they are not allowed. */
3032 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3034 if (BE (token
->type
== END_OF_RE
, 0))
3037 goto parse_bracket_exp_free_return
;
3039 if (token
->type
== OP_CHARSET_RANGE
)
3041 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3042 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3043 if (BE (token2
.type
== END_OF_RE
, 0))
3046 goto parse_bracket_exp_free_return
;
3048 if (token2
.type
== OP_CLOSE_BRACKET
)
3050 /* We treat the last '-' as a normal character. */
3051 re_string_skip_bytes (regexp
, -token_len
);
3052 token
->type
= CHARACTER
;
3059 if (is_range_exp
== 1)
3061 end_elem
.opr
.name
= end_name_buf
;
3062 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3064 if (BE (ret
!= REG_NOERROR
, 0))
3067 goto parse_bracket_exp_free_return
;
3070 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3072 *err
= build_range_exp (sbcset
,
3073 #ifdef RE_ENABLE_I18N
3074 mbcset
, &range_alloc
,
3075 #endif /* RE_ENABLE_I18N */
3076 &start_elem
, &end_elem
);
3077 if (BE (*err
!= REG_NOERROR
, 0))
3078 goto parse_bracket_exp_free_return
;
3082 switch (start_elem
.type
)
3085 bitset_set (sbcset
, start_elem
.opr
.ch
);
3087 #ifdef RE_ENABLE_I18N
3089 /* Check whether the array has enough space. */
3090 if (mbchar_alloc
== mbcset
->nmbchars
)
3092 /* Not enough, realloc it. */
3093 /* +1 in case of mbcset->nmbchars is 0. */
3094 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3095 /* Use realloc since array is NULL if *alloc == 0. */
3096 mbcset
->mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3098 if (BE (mbcset
->mbchars
== NULL
, 0))
3099 goto parse_bracket_exp_espace
;
3101 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3103 #endif /* RE_ENABLE_I18N */
3105 *err
= build_equiv_class (sbcset
,
3106 #ifdef RE_ENABLE_I18N
3107 mbcset
, &equiv_class_alloc
,
3108 #endif /* RE_ENABLE_I18N */
3109 start_elem
.opr
.name
);
3110 if (BE (*err
!= REG_NOERROR
, 0))
3111 goto parse_bracket_exp_free_return
;
3114 *err
= build_collating_symbol (sbcset
,
3115 #ifdef RE_ENABLE_I18N
3116 mbcset
, &coll_sym_alloc
,
3117 #endif /* RE_ENABLE_I18N */
3118 start_elem
.opr
.name
);
3119 if (BE (*err
!= REG_NOERROR
, 0))
3120 goto parse_bracket_exp_free_return
;
3123 *err
= build_charclass (regexp
->trans
, sbcset
,
3124 #ifdef RE_ENABLE_I18N
3125 mbcset
, &char_class_alloc
,
3126 #endif /* RE_ENABLE_I18N */
3127 start_elem
.opr
.name
, syntax
);
3128 if (BE (*err
!= REG_NOERROR
, 0))
3129 goto parse_bracket_exp_free_return
;
3136 if (BE (token
->type
== END_OF_RE
, 0))
3139 goto parse_bracket_exp_free_return
;
3141 if (token
->type
== OP_CLOSE_BRACKET
)
3145 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3147 /* If it is non-matching list. */
3148 #ifdef RE_ENABLE_I18N
3149 if (mbcset
->non_match
)
3150 #else /* not RE_ENABLE_I18N */
3152 #endif /* not RE_ENABLE_I18N */
3153 bitset_not (sbcset
);
3155 /* Build a tree for simple bracket. */
3156 br_token
.type
= SIMPLE_BRACKET
;
3157 br_token
.opr
.sbcset
= sbcset
;
3158 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3159 work_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3160 if (BE (new_idx
== -1 || work_tree
== NULL
, 0))
3161 goto parse_bracket_exp_espace
;
3163 #ifdef RE_ENABLE_I18N
3164 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3165 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3166 || mbcset
->non_match
)))
3168 re_token_t alt_token
;
3169 bin_tree_t
*mbc_tree
;
3171 /* Build a tree for complex bracket. */
3172 dfa
->has_mb_node
= 1;
3173 dfa
->has_plural_match
= 1;
3174 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3175 if (sbcset
[sbc_idx
])
3177 /* If there are no bits set in sbcset, there is no point
3178 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3179 if (sbc_idx
== BITSET_UINTS
)
3182 dfa
->nodes
[new_idx
].type
= COMPLEX_BRACKET
;
3183 dfa
->nodes
[new_idx
].opr
.mbcset
= mbcset
;
3186 br_token
.type
= COMPLEX_BRACKET
;
3187 br_token
.opr
.mbcset
= mbcset
;
3188 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3189 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3190 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3191 goto parse_bracket_exp_espace
;
3192 /* Then join them by ALT node. */
3193 alt_token
.type
= OP_ALT
;
3194 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3195 work_tree
= create_tree (work_tree
, mbc_tree
, 0, new_idx
);
3196 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3201 free_charset (mbcset
);
3204 #else /* not RE_ENABLE_I18N */
3206 #endif /* not RE_ENABLE_I18N */
3208 parse_bracket_exp_espace
:
3210 parse_bracket_exp_free_return
:
3212 #ifdef RE_ENABLE_I18N
3213 free_charset (mbcset
);
3214 #endif /* RE_ENABLE_I18N */
3218 /* Parse an element in the bracket expression. */
3220 static reg_errcode_t
3221 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3223 bracket_elem_t
*elem
;
3224 re_string_t
*regexp
;
3228 reg_syntax_t syntax
;
3231 #ifdef RE_ENABLE_I18N
3233 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3234 if (cur_char_size
> 1)
3236 elem
->type
= MB_CHAR
;
3237 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3238 re_string_skip_bytes (regexp
, cur_char_size
);
3241 #endif /* RE_ENABLE_I18N */
3242 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3243 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3244 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3245 return parse_bracket_symbol (elem
, regexp
, token
);
3246 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3248 /* A '-' must only appear as anything but a range indicator before
3249 the closing bracket. Everything else is an error. */
3251 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3252 if (token2
.type
!= OP_CLOSE_BRACKET
)
3253 /* The actual error value is not standardized since this whole
3254 case is undefined. But ERANGE makes good sense. */
3257 elem
->type
= SB_CHAR
;
3258 elem
->opr
.ch
= token
->opr
.c
;
3262 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3263 such as [:<character_class>:], [.<collating_element>.], and
3264 [=<equivalent_class>=]. */
3266 static reg_errcode_t
3267 parse_bracket_symbol (elem
, regexp
, token
)
3268 bracket_elem_t
*elem
;
3269 re_string_t
*regexp
;
3272 unsigned char ch
, delim
= token
->opr
.c
;
3276 if (re_string_eoi(regexp
) || i
>= BRACKET_NAME_BUF_SIZE
)
3278 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3279 ch
= re_string_fetch_byte_case (regexp
);
3281 ch
= re_string_fetch_byte (regexp
);
3282 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3284 elem
->opr
.name
[i
] = ch
;
3286 re_string_skip_bytes (regexp
, 1);
3287 elem
->opr
.name
[i
] = '\0';
3288 switch (token
->type
)
3290 case OP_OPEN_COLL_ELEM
:
3291 elem
->type
= COLL_SYM
;
3293 case OP_OPEN_EQUIV_CLASS
:
3294 elem
->type
= EQUIV_CLASS
;
3296 case OP_OPEN_CHAR_CLASS
:
3297 elem
->type
= CHAR_CLASS
;
3305 /* Helper function for parse_bracket_exp.
3306 Build the equivalence class which is represented by NAME.
3307 The result are written to MBCSET and SBCSET.
3308 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3309 is a pointer argument sinse we may update it. */
3311 static reg_errcode_t
3312 #ifdef RE_ENABLE_I18N
3313 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3314 re_charset_t
*mbcset
;
3315 int *equiv_class_alloc
;
3316 #else /* not RE_ENABLE_I18N */
3317 build_equiv_class (sbcset
, name
)
3318 #endif /* not RE_ENABLE_I18N */
3319 re_bitset_ptr_t sbcset
;
3320 const unsigned char *name
;
3322 #if defined _LIBC && defined RE_ENABLE_I18N
3323 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3326 const int32_t *table
, *indirect
;
3327 const unsigned char *weights
, *extra
, *cp
;
3328 unsigned char char_buf
[2];
3332 /* This #include defines a local function! */
3333 # include <locale/weight.h>
3334 /* Calculate the index for equivalence class. */
3336 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3337 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3338 _NL_COLLATE_WEIGHTMB
);
3339 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3340 _NL_COLLATE_EXTRAMB
);
3341 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3342 _NL_COLLATE_INDIRECTMB
);
3343 idx1
= findidx (&cp
);
3344 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3345 /* This isn't a valid character. */
3346 return REG_ECOLLATE
;
3348 /* Build single byte matcing table for this equivalence class. */
3349 char_buf
[1] = (unsigned char) '\0';
3350 len
= weights
[idx1
];
3351 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3355 idx2
= findidx (&cp
);
3360 /* This isn't a valid character. */
3362 if (len
== weights
[idx2
])
3365 while (cnt
<= len
&&
3366 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3370 bitset_set (sbcset
, ch
);
3373 /* Check whether the array has enough space. */
3374 if (*equiv_class_alloc
== mbcset
->nequiv_classes
)
3376 /* Not enough, realloc it. */
3377 /* +1 in case of mbcset->nequiv_classes is 0. */
3378 *equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3379 /* Use realloc since the array is NULL if *alloc == 0. */
3380 mbcset
->equiv_classes
= re_realloc (mbcset
->equiv_classes
, int32_t,
3381 *equiv_class_alloc
);
3382 if (BE (mbcset
->equiv_classes
== NULL
, 0))
3385 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3388 #endif /* _LIBC && RE_ENABLE_I18N */
3390 if (BE (strlen ((const char *) name
) != 1, 0))
3391 return REG_ECOLLATE
;
3392 bitset_set (sbcset
, *name
);
3397 /* Helper function for parse_bracket_exp.
3398 Build the character class which is represented by NAME.
3399 The result are written to MBCSET and SBCSET.
3400 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3401 is a pointer argument sinse we may update it. */
3403 static reg_errcode_t
3404 #ifdef RE_ENABLE_I18N
3405 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3406 re_charset_t
*mbcset
;
3407 int *char_class_alloc
;
3408 #else /* not RE_ENABLE_I18N */
3409 build_charclass (trans
, sbcset
, class_name
, syntax
)
3410 #endif /* not RE_ENABLE_I18N */
3411 RE_TRANSLATE_TYPE trans
;
3412 re_bitset_ptr_t sbcset
;
3413 const unsigned char *class_name
;
3414 reg_syntax_t syntax
;
3417 const char *name
= (const char *) class_name
;
3419 /* In case of REG_ICASE "upper" and "lower" match the both of
3420 upper and lower cases. */
3421 if ((syntax
& RE_ICASE
)
3422 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3425 #ifdef RE_ENABLE_I18N
3426 /* Check the space of the arrays. */
3427 if (*char_class_alloc
== mbcset
->nchar_classes
)
3429 /* Not enough, realloc it. */
3430 /* +1 in case of mbcset->nchar_classes is 0. */
3431 *char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3432 /* Use realloc since array is NULL if *alloc == 0. */
3433 mbcset
->char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3435 if (BE (mbcset
->char_classes
== NULL
, 0))
3438 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3439 #endif /* RE_ENABLE_I18N */
3441 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3442 for (i = 0; i < SBC_MAX; ++i) \
3444 if (ctype_func (i)) \
3446 int ch = trans ? trans[i] : i; \
3447 bitset_set (sbcset, ch); \
3451 if (strcmp (name
, "alnum") == 0)
3452 BUILD_CHARCLASS_LOOP (isalnum
)
3453 else if (strcmp (name
, "cntrl") == 0)
3454 BUILD_CHARCLASS_LOOP (iscntrl
)
3455 else if (strcmp (name
, "lower") == 0)
3456 BUILD_CHARCLASS_LOOP (islower
)
3457 else if (strcmp (name
, "space") == 0)
3458 BUILD_CHARCLASS_LOOP (isspace
)
3459 else if (strcmp (name
, "alpha") == 0)
3460 BUILD_CHARCLASS_LOOP (isalpha
)
3461 else if (strcmp (name
, "digit") == 0)
3462 BUILD_CHARCLASS_LOOP (isdigit
)
3463 else if (strcmp (name
, "print") == 0)
3464 BUILD_CHARCLASS_LOOP (isprint
)
3465 else if (strcmp (name
, "upper") == 0)
3466 BUILD_CHARCLASS_LOOP (isupper
)
3467 else if (strcmp (name
, "blank") == 0)
3468 BUILD_CHARCLASS_LOOP (isblank
)
3469 else if (strcmp (name
, "graph") == 0)
3470 BUILD_CHARCLASS_LOOP (isgraph
)
3471 else if (strcmp (name
, "punct") == 0)
3472 BUILD_CHARCLASS_LOOP (ispunct
)
3473 else if (strcmp (name
, "xdigit") == 0)
3474 BUILD_CHARCLASS_LOOP (isxdigit
)
3482 build_charclass_op (dfa
, trans
, class_name
, extra
, not, err
)
3484 RE_TRANSLATE_TYPE trans
;
3485 const unsigned char *class_name
;
3486 const unsigned char *extra
;
3490 re_bitset_ptr_t sbcset
;
3491 #ifdef RE_ENABLE_I18N
3492 re_charset_t
*mbcset
;
3494 #else /* not RE_ENABLE_I18N */
3496 #endif /* not RE_ENABLE_I18N */
3498 re_token_t br_token
;
3502 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3503 #ifdef RE_ENABLE_I18N
3504 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3505 #endif /* RE_ENABLE_I18N */
3507 #ifdef RE_ENABLE_I18N
3508 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3509 #else /* not RE_ENABLE_I18N */
3510 if (BE (sbcset
== NULL
, 0))
3511 #endif /* not RE_ENABLE_I18N */
3519 #ifdef RE_ENABLE_I18N
3522 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3523 bitset_set(cset->sbcset, '\0');
3525 mbcset
->non_match
= 1;
3526 if (dfa
->mb_cur_max
> 1)
3527 for (i
= 0; i
< SBC_MAX
; ++i
)
3528 if (__btowc (i
) == WEOF
)
3529 bitset_set (sbcset
, i
);
3530 #else /* not RE_ENABLE_I18N */
3532 #endif /* not RE_ENABLE_I18N */
3535 /* We don't care the syntax in this case. */
3536 ret
= build_charclass (trans
, sbcset
,
3537 #ifdef RE_ENABLE_I18N
3539 #endif /* RE_ENABLE_I18N */
3542 if (BE (ret
!= REG_NOERROR
, 0))
3545 #ifdef RE_ENABLE_I18N
3546 free_charset (mbcset
);
3547 #endif /* RE_ENABLE_I18N */
3551 /* \w match '_' also. */
3552 for (; *extra
; extra
++)
3553 bitset_set (sbcset
, *extra
);
3555 /* If it is non-matching list. */
3556 #ifdef RE_ENABLE_I18N
3557 if (mbcset
->non_match
)
3558 #else /* not RE_ENABLE_I18N */
3560 #endif /* not RE_ENABLE_I18N */
3561 bitset_not (sbcset
);
3563 /* Build a tree for simple bracket. */
3564 br_token
.type
= SIMPLE_BRACKET
;
3565 br_token
.opr
.sbcset
= sbcset
;
3566 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3567 tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3568 if (BE (new_idx
== -1 || tree
== NULL
, 0))
3569 goto build_word_op_espace
;
3571 #ifdef RE_ENABLE_I18N
3572 if (dfa
->mb_cur_max
> 1)
3574 re_token_t alt_token
;
3575 bin_tree_t
*mbc_tree
;
3576 /* Build a tree for complex bracket. */
3577 br_token
.type
= COMPLEX_BRACKET
;
3578 br_token
.opr
.mbcset
= mbcset
;
3579 dfa
->has_mb_node
= 1;
3580 new_idx
= re_dfa_add_node (dfa
, br_token
, 0);
3581 mbc_tree
= create_tree (NULL
, NULL
, 0, new_idx
);
3582 if (BE (new_idx
== -1 || mbc_tree
== NULL
, 0))
3583 goto build_word_op_espace
;
3584 /* Then join them by ALT node. */
3585 alt_token
.type
= OP_ALT
;
3586 new_idx
= re_dfa_add_node (dfa
, alt_token
, 0);
3587 tree
= create_tree (tree
, mbc_tree
, 0, new_idx
);
3588 if (BE (new_idx
!= -1 && mbc_tree
!= NULL
, 1))
3593 free_charset (mbcset
);
3596 #else /* not RE_ENABLE_I18N */
3598 #endif /* not RE_ENABLE_I18N */
3600 build_word_op_espace
:
3602 #ifdef RE_ENABLE_I18N
3603 free_charset (mbcset
);
3604 #endif /* RE_ENABLE_I18N */
3609 /* This is intended for the expressions like "a{1,3}".
3610 Fetch a number from `input', and return the number.
3611 Return -1, if the number field is empty like "{,1}".
3612 Return -2, If an error is occured. */
3615 fetch_number (input
, token
, syntax
)
3618 reg_syntax_t syntax
;
3624 *token
= fetch_token (input
, syntax
);
3626 if (BE (token
->type
== END_OF_RE
, 0))
3628 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3630 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3631 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3632 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3637 #ifdef RE_ENABLE_I18N
3639 free_charset (re_charset_t
*cset
)
3641 re_free (cset
->mbchars
);
3643 re_free (cset
->coll_syms
);
3644 re_free (cset
->equiv_classes
);
3645 re_free (cset
->range_starts
);
3646 re_free (cset
->range_ends
);
3648 re_free (cset
->char_classes
);
3651 #endif /* RE_ENABLE_I18N */
3653 /* Functions for binary tree operation. */
3655 /* Create a node of tree.
3656 Note: This function automatically free left and right if malloc fails. */
3659 create_tree (left
, right
, type
, index
)
3662 re_token_type_t type
;
3666 tree
= re_malloc (bin_tree_t
, 1);
3667 if (BE (tree
== NULL
, 0))
3669 free_bin_tree (left
);
3670 free_bin_tree (right
);
3673 tree
->parent
= NULL
;
3675 tree
->right
= right
;
3677 tree
->node_idx
= index
;
3680 re_node_set_init_empty (&tree
->eclosure
);
3683 left
->parent
= tree
;
3685 right
->parent
= tree
;
3689 /* Free the sub tree pointed by TREE. */
3692 free_bin_tree (tree
)
3697 /*re_node_set_free (&tree->eclosure);*/
3698 free_bin_tree (tree
->left
);
3699 free_bin_tree (tree
->right
);
3703 /* Duplicate the node SRC, and return new node. */
3706 duplicate_tree (src
, dfa
)
3707 const bin_tree_t
*src
;
3710 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3712 /* Since node indies must be according to Post-order of the tree,
3713 we must duplicate the left at first. */
3714 if (src
->left
!= NULL
)
3716 left
= duplicate_tree (src
->left
, dfa
);
3721 /* Secondaly, duplicate the right. */
3722 if (src
->right
!= NULL
)
3724 right
= duplicate_tree (src
->right
, dfa
);
3727 free_bin_tree (left
);
3732 /* At last, duplicate itself. */
3733 if (src
->type
== NON_TYPE
)
3735 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3736 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3737 if (BE (new_node_idx
== -1, 0))
3739 free_bin_tree (left
);
3740 free_bin_tree (right
);
3745 new_node_idx
= src
->type
;
3747 new_tree
= create_tree (left
, right
, src
->type
, new_node_idx
);
3748 if (BE (new_tree
== NULL
, 0))
3750 free_bin_tree (left
);
3751 free_bin_tree (right
);