1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (regex_t
*preg
);
37 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
38 static reg_errcode_t
preorder (bin_tree_t
*root
,
39 reg_errcode_t (fn (void *, bin_tree_t
*)),
41 static reg_errcode_t
postorder (bin_tree_t
*root
,
42 reg_errcode_t (fn (void *, bin_tree_t
*)),
44 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
45 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
46 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
48 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
49 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
51 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
52 int top_clone_node
, int root_node
,
53 unsigned int constraint
);
54 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
55 unsigned int constraint
);
56 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
57 unsigned int constraint
);
58 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
59 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
61 static void calc_inveclosure (re_dfa_t
*dfa
);
62 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
64 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
66 static int peek_token (re_token_t
*token
, re_string_t
*input
,
68 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
70 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
71 reg_syntax_t syntax
, reg_errcode_t
*err
);
72 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
73 re_token_t
*token
, reg_syntax_t syntax
,
74 int nest
, reg_errcode_t
*err
);
75 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
76 re_token_t
*token
, reg_syntax_t syntax
,
77 int nest
, reg_errcode_t
*err
);
78 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
79 re_token_t
*token
, reg_syntax_t syntax
,
80 int nest
, reg_errcode_t
*err
);
81 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
82 re_token_t
*token
, reg_syntax_t syntax
,
83 int nest
, reg_errcode_t
*err
);
84 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
85 re_dfa_t
*dfa
, re_token_t
*token
,
86 reg_syntax_t syntax
, reg_errcode_t
*err
);
87 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
88 re_token_t
*token
, reg_syntax_t syntax
,
90 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
92 re_token_t
*token
, int token_len
,
96 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
100 # ifdef RE_ENABLE_I18N
101 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
102 re_charset_t
*mbcset
, int *range_alloc
,
103 bracket_elem_t
*start_elem
,
104 bracket_elem_t
*end_elem
);
105 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
106 re_charset_t
*mbcset
,
108 const unsigned char *name
);
109 # else /* not RE_ENABLE_I18N */
110 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
111 bracket_elem_t
*start_elem
,
112 bracket_elem_t
*end_elem
);
113 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
114 const unsigned char *name
);
115 # endif /* not RE_ENABLE_I18N */
116 #endif /* not _LIBC */
117 #ifdef RE_ENABLE_I18N
118 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
119 re_charset_t
*mbcset
,
120 int *equiv_class_alloc
,
121 const unsigned char *name
);
122 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
123 re_bitset_ptr_t sbcset
,
124 re_charset_t
*mbcset
,
125 int *char_class_alloc
,
126 const unsigned char *class_name
,
127 reg_syntax_t syntax
);
128 #else /* not RE_ENABLE_I18N */
129 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
130 const unsigned char *name
);
131 static reg_errcode_t
build_charclass (unsigned RE_TRANSLATE_TYPE trans
,
132 re_bitset_ptr_t sbcset
,
133 const unsigned char *class_name
,
134 reg_syntax_t syntax
);
135 #endif /* not RE_ENABLE_I18N */
136 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
137 unsigned RE_TRANSLATE_TYPE trans
,
138 const unsigned char *class_name
,
139 const unsigned char *extra
,
140 int non_match
, reg_errcode_t
*err
);
141 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
142 bin_tree_t
*left
, bin_tree_t
*right
,
143 re_token_type_t type
);
144 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
145 bin_tree_t
*left
, bin_tree_t
*right
,
146 const re_token_t
*token
);
147 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
148 static void free_token (re_token_t
*node
);
149 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
150 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
152 /* This table gives an error message for each of the error codes listed
153 in regex.h. Obviously the order here has to be same as there.
154 POSIX doesn't require that we do anything for REG_NOERROR,
155 but why not be nice? */
157 const char __re_error_msgid
[] attribute_hidden
=
159 #define REG_NOERROR_IDX 0
160 gettext_noop ("Success") /* REG_NOERROR */
162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163 gettext_noop ("No match") /* REG_NOMATCH */
165 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
166 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
171 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
175 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
178 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
180 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
181 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
183 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
186 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
189 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
190 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
192 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193 gettext_noop ("Invalid range end") /* REG_ERANGE */
195 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
196 gettext_noop ("Memory exhausted") /* REG_ESPACE */
198 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
199 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
201 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202 gettext_noop ("Premature end of regular expression") /* REG_EEND */
204 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
205 gettext_noop ("Regular expression too big") /* REG_ESIZE */
207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
208 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
211 const size_t __re_error_msgid_idx
[] attribute_hidden
=
232 /* Entry points for GNU code. */
234 /* re_compile_pattern is the GNU regular expression compiler: it
235 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236 Returns 0 if the pattern was valid, otherwise an error string.
238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239 are set in BUFP on entry. */
242 re_compile_pattern (pattern
, length
, bufp
)
245 struct re_pattern_buffer
*bufp
;
249 /* And GNU code determines whether or not to get register information
250 by passing null for the REGS argument to re_match, etc., not by
251 setting no_sub, unless RE_NO_SUB is set. */
252 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
254 /* Match anchors at newline. */
255 bufp
->newline_anchor
= 1;
257 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
261 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
264 weak_alias (__re_compile_pattern
, re_compile_pattern
)
267 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
268 also be assigned to arbitrarily: each pattern buffer stores its own
269 syntax, so it can be changed between regex compilations. */
270 /* This has no initializer because initialized variables in Emacs
271 become read-only after dumping. */
272 reg_syntax_t re_syntax_options
;
275 /* Specify the precise syntax of regexps for compilation. This provides
276 for compatibility for various utilities which historically have
277 different, incompatible syntaxes.
279 The argument SYNTAX is a bit mask comprised of the various bits
280 defined in regex.h. We return the old syntax. */
283 re_set_syntax (syntax
)
286 reg_syntax_t ret
= re_syntax_options
;
288 re_syntax_options
= syntax
;
292 weak_alias (__re_set_syntax
, re_set_syntax
)
296 re_compile_fastmap (bufp
)
297 struct re_pattern_buffer
*bufp
;
299 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
300 char *fastmap
= bufp
->fastmap
;
302 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
303 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
304 if (dfa
->init_state
!= dfa
->init_state_word
)
305 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
306 if (dfa
->init_state
!= dfa
->init_state_nl
)
307 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
308 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
309 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
310 bufp
->fastmap_accurate
= 1;
314 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
318 __attribute ((always_inline
))
319 re_set_fastmap (char *fastmap
, int icase
, int ch
)
323 fastmap
[tolower (ch
)] = 1;
326 /* Helper function for re_compile_fastmap.
327 Compile fastmap for the initial_state INIT_STATE. */
330 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
332 const re_dfastate_t
*init_state
;
335 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
337 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
338 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
340 int node
= init_state
->nodes
.elems
[node_cnt
];
341 re_token_type_t type
= dfa
->nodes
[node
].type
;
343 if (type
== CHARACTER
)
345 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
346 #ifdef RE_ENABLE_I18N
347 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
349 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
354 *p
++ = dfa
->nodes
[node
].opr
.c
;
355 while (++node
< dfa
->nodes_len
356 && dfa
->nodes
[node
].type
== CHARACTER
357 && dfa
->nodes
[node
].mb_partial
)
358 *p
++ = dfa
->nodes
[node
].opr
.c
;
359 memset (&state
, 0, sizeof (state
));
360 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
362 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
363 re_set_fastmap (fastmap
, 0, buf
[0]);
367 else if (type
== SIMPLE_BRACKET
)
370 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
371 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
372 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
373 re_set_fastmap (fastmap
, icase
, ch
);
375 #ifdef RE_ENABLE_I18N
376 else if (type
== COMPLEX_BRACKET
)
379 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
380 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
381 || cset
->nranges
|| cset
->nchar_classes
)
384 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
393 const int32_t *table
= (const int32_t *)
394 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
395 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
396 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
398 re_set_fastmap (fastmap
, icase
, ch
);
401 if (dfa
->mb_cur_max
> 1)
402 for (i
= 0; i
< SBC_MAX
; ++i
)
403 if (__btowc (i
) == WEOF
)
404 re_set_fastmap (fastmap
, icase
, i
);
405 # endif /* not _LIBC */
407 for (i
= 0; i
< cset
->nmbchars
; ++i
)
411 memset (&state
, '\0', sizeof (state
));
412 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
413 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
414 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
416 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
417 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
421 #endif /* RE_ENABLE_I18N */
422 else if (type
== OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type
== OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type
== END_OF_RE
)
428 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
429 if (type
== END_OF_RE
)
430 bufp
->can_be_null
= 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg
, pattern
, cflags
)
474 regex_t
*__restrict preg
;
475 const char *__restrict pattern
;
479 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC
);
486 /* Try to allocate space for the fastmap. */
487 preg
->fastmap
= re_malloc (char, SBC_MAX
);
488 if (BE (preg
->fastmap
== NULL
, 0))
491 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags
& REG_NEWLINE
)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax
&= ~RE_DOT_NEWLINE
;
497 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
498 /* It also changes the matching behavior. */
499 preg
->newline_anchor
= 1;
502 preg
->newline_anchor
= 0;
503 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
504 preg
->translate
= NULL
;
506 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret
== REG_ERPAREN
)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret
== REG_NOERROR
, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg
);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg
->fastmap
);
522 preg
->fastmap
= NULL
;
528 weak_alias (__regcomp
, regcomp
)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode
, preg
, errbuf
, errbuf_size
)
545 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
546 / sizeof (__re_error_msgid_idx
[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
553 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
555 msg_size
= strlen (msg
) + 1; /* Includes the null. */
557 if (BE (errbuf_size
!= 0, 1))
559 if (BE (msg_size
> errbuf_size
, 0))
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
564 memcpy (errbuf
, msg
, errbuf_size
- 1);
565 errbuf
[errbuf_size
- 1] = 0;
569 memcpy (errbuf
, msg
, msg_size
);
575 weak_alias (__regerror
, regerror
)
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset utf8_sb_map
=
586 /* Set the first 128 bits. */
587 # if UINT_MAX == 0xffffffff
588 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
590 # error "Add case for new unsigned int size"
597 free_dfa_content (re_dfa_t
*dfa
)
602 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
603 free_token (dfa
->nodes
+ i
);
604 re_free (dfa
->nexts
);
605 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
607 if (dfa
->eclosures
!= NULL
)
608 re_node_set_free (dfa
->eclosures
+ i
);
609 if (dfa
->inveclosures
!= NULL
)
610 re_node_set_free (dfa
->inveclosures
+ i
);
611 if (dfa
->edests
!= NULL
)
612 re_node_set_free (dfa
->edests
+ i
);
614 re_free (dfa
->edests
);
615 re_free (dfa
->eclosures
);
616 re_free (dfa
->inveclosures
);
617 re_free (dfa
->nodes
);
619 if (dfa
->state_table
)
620 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
622 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
623 for (j
= 0; j
< entry
->num
; ++j
)
625 re_dfastate_t
*state
= entry
->array
[j
];
628 re_free (entry
->array
);
630 re_free (dfa
->state_table
);
631 #ifdef RE_ENABLE_I18N
632 if (dfa
->sb_char
!= utf8_sb_map
)
633 re_free (dfa
->sb_char
);
635 re_free (dfa
->subexp_map
);
637 re_free (dfa
->re_str
);
644 /* Free dynamically allocated space used by PREG. */
650 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
651 if (BE (dfa
!= NULL
, 1))
652 free_dfa_content (dfa
);
656 re_free (preg
->fastmap
);
657 preg
->fastmap
= NULL
;
659 re_free (preg
->translate
);
660 preg
->translate
= NULL
;
663 weak_alias (__regfree
, regfree
)
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf
;
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
689 if (!re_comp_buf
.buffer
)
690 return gettext ("No previous regular expression");
694 if (re_comp_buf
.buffer
)
696 fastmap
= re_comp_buf
.fastmap
;
697 re_comp_buf
.fastmap
= NULL
;
698 __regfree (&re_comp_buf
);
699 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
700 re_comp_buf
.fastmap
= fastmap
;
703 if (re_comp_buf
.fastmap
== NULL
)
705 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
706 if (re_comp_buf
.fastmap
== NULL
)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf
.newline_anchor
= 1;
717 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
727 libc_freeres_fn (free_mem
)
729 __regfree (&re_comp_buf
);
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
740 re_compile_internal (preg
, pattern
, length
, syntax
)
742 const char * pattern
;
746 reg_errcode_t err
= REG_NOERROR
;
750 /* Initialize the pattern buffer. */
751 preg
->fastmap_accurate
= 0;
752 preg
->syntax
= syntax
;
753 preg
->not_bol
= preg
->not_eol
= 0;
756 preg
->can_be_null
= 0;
757 preg
->regs_allocated
= REGS_UNALLOCATED
;
759 /* Initialize the dfa. */
760 dfa
= (re_dfa_t
*) preg
->buffer
;
761 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
770 preg
->allocated
= sizeof (re_dfa_t
);
771 preg
->buffer
= (unsigned char *) dfa
;
773 preg
->used
= sizeof (re_dfa_t
);
775 err
= init_dfa (dfa
, length
);
776 if (BE (err
!= REG_NOERROR
, 0))
778 free_dfa_content (dfa
);
784 dfa
->re_str
= re_malloc (char, length
+ 1);
785 strncpy (dfa
->re_str
, pattern
, length
+ 1);
788 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
789 syntax
& RE_ICASE
, dfa
);
790 if (BE (err
!= REG_NOERROR
, 0))
792 re_compile_internal_free_return
:
793 free_workarea_compile (preg
);
794 re_string_destruct (®exp
);
795 free_dfa_content (dfa
);
801 /* Parse the regular expression, and build a structure tree. */
803 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
804 if (BE (dfa
->str_tree
== NULL
, 0))
805 goto re_compile_internal_free_return
;
807 /* Analyze the tree and create the nfa. */
808 err
= analyze (preg
);
809 if (BE (err
!= REG_NOERROR
, 0))
810 goto re_compile_internal_free_return
;
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
818 /* Then create the initial state of the dfa. */
819 err
= create_initial_state (dfa
);
821 /* Release work areas. */
822 free_workarea_compile (preg
);
823 re_string_destruct (®exp
);
825 if (BE (err
!= REG_NOERROR
, 0))
827 free_dfa_content (dfa
);
835 /* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
839 init_dfa (dfa
, pat_len
)
848 memset (dfa
, '\0', sizeof (re_dfa_t
));
850 /* Force allocation of str_tree_storage the first time. */
851 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
853 dfa
->nodes_alloc
= pat_len
+ 1;
854 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
856 dfa
->states_alloc
= pat_len
+ 1;
858 /* table_size = 2 ^ ceil(log pat_len) */
859 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
860 if (table_size
> pat_len
)
863 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
864 dfa
->state_hash_mask
= table_size
- 1;
866 dfa
->mb_cur_max
= MB_CUR_MAX
;
868 if (dfa
->mb_cur_max
== 6
869 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
871 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
874 # ifdef HAVE_LANGINFO_CODESET
875 codeset_name
= nl_langinfo (CODESET
);
877 codeset_name
= getenv ("LC_ALL");
878 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
879 codeset_name
= getenv ("LC_CTYPE");
880 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
881 codeset_name
= getenv ("LANG");
882 if (codeset_name
== NULL
)
884 else if (strchr (codeset_name
, '.') != NULL
)
885 codeset_name
= strchr (codeset_name
, '.') + 1;
888 if (strcasecmp (codeset_name
, "UTF-8") == 0
889 || strcasecmp (codeset_name
, "UTF8") == 0)
892 /* We check exhaustively in the loop below if this charset is a
893 superset of ASCII. */
894 dfa
->map_notascii
= 0;
897 #ifdef RE_ENABLE_I18N
898 if (dfa
->mb_cur_max
> 1)
901 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
906 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
907 if (BE (dfa
->sb_char
== NULL
, 0))
910 /* Clear all bits by, then set those corresponding to single
912 bitset_empty (dfa
->sb_char
);
914 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
915 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
917 wchar_t wch
= __btowc (ch
);
919 dfa
->sb_char
[i
] |= 1 << j
;
921 if (isascii (ch
) && wch
!= (wchar_t) ch
)
922 dfa
->map_notascii
= 1;
929 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
934 /* Initialize WORD_CHAR table, which indicate which character is
935 "word". In this case "word" means that it is the word construction
936 character used by some operators like "\<", "\>", etc. */
943 dfa
->word_ops_used
= 1;
944 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
945 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
946 if (isalnum (ch
) || ch
== '_')
947 dfa
->word_char
[i
] |= 1 << j
;
950 /* Free the work area which are only used while compiling. */
953 free_workarea_compile (preg
)
956 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
957 bin_tree_storage_t
*storage
, *next
;
958 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
960 next
= storage
->next
;
963 dfa
->str_tree_storage
= NULL
;
964 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
965 dfa
->str_tree
= NULL
;
966 re_free (dfa
->org_indices
);
967 dfa
->org_indices
= NULL
;
970 /* Create initial states for all contexts. */
973 create_initial_state (dfa
)
978 re_node_set init_nodes
;
980 /* Initial states have the epsilon closure of the node which is
981 the first node of the regular expression. */
982 first
= dfa
->str_tree
->first
->node_idx
;
983 dfa
->init_node
= first
;
984 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
985 if (BE (err
!= REG_NOERROR
, 0))
988 /* The back-references which are in initial states can epsilon transit,
989 since in this case all of the subexpressions can be null.
990 Then we add epsilon closures of the nodes which are the next nodes of
991 the back-references. */
992 if (dfa
->nbackref
> 0)
993 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
995 int node_idx
= init_nodes
.elems
[i
];
996 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
999 if (type
!= OP_BACK_REF
)
1001 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1003 re_token_t
*clexp_node
;
1004 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1005 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1006 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1009 if (clexp_idx
== init_nodes
.nelem
)
1012 if (type
== OP_BACK_REF
)
1014 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1015 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1017 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (BE (dfa
->init_state
== NULL
, 0))
1028 if (dfa
->init_state
->has_constraint
)
1030 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1032 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1034 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1038 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1039 || dfa
->init_state_begbuf
== NULL
, 0))
1043 dfa
->init_state_word
= dfa
->init_state_nl
1044 = dfa
->init_state_begbuf
= dfa
->init_state
;
1046 re_node_set_free (&init_nodes
);
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053 DFA nodes where needed. */
1059 int node
, i
, mb_chars
= 0, has_period
= 0;
1061 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1062 switch (dfa
->nodes
[node
].type
)
1065 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1069 switch (dfa
->nodes
[node
].opr
.idx
)
1077 /* Word anchors etc. cannot be handled. */
1087 case OP_DUP_ASTERISK
:
1088 case OP_OPEN_SUBEXP
:
1089 case OP_CLOSE_SUBEXP
:
1091 case COMPLEX_BRACKET
:
1093 case SIMPLE_BRACKET
:
1094 /* Just double check. */
1095 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1096 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1103 if (mb_chars
|| has_period
)
1104 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1106 if (dfa
->nodes
[node
].type
== CHARACTER
1107 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1108 dfa
->nodes
[node
].mb_partial
= 0;
1109 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1110 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1113 /* The search can be in single byte locale. */
1114 dfa
->mb_cur_max
= 1;
1116 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121 "eclosure", and "inveclosure". */
1123 static reg_errcode_t
1127 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1130 /* Allocate arrays. */
1131 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1132 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1133 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1134 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1135 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1136 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1137 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
1140 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1141 if (dfa
->subexp_map
!= NULL
)
1144 for (i
= 0; i
< preg
->re_nsub
; i
++)
1145 dfa
->subexp_map
[i
] = i
;
1146 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1147 for (i
= 0; i
< preg
->re_nsub
; i
++)
1148 if (dfa
->subexp_map
[i
] != i
)
1150 if (i
== preg
->re_nsub
)
1152 free (dfa
->subexp_map
);
1153 dfa
->subexp_map
= NULL
;
1157 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1158 if (BE (ret
!= REG_NOERROR
, 0))
1160 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1161 if (BE (ret
!= REG_NOERROR
, 0))
1163 preorder (dfa
->str_tree
, calc_next
, dfa
);
1164 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1165 if (BE (ret
!= REG_NOERROR
, 0))
1167 ret
= calc_eclosure (dfa
);
1168 if (BE (ret
!= REG_NOERROR
, 0))
1170 calc_inveclosure (dfa
);
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175 implement parse tree visits. Instead, we use parent pointers and
1176 some hairy code in these two functions. */
1177 static reg_errcode_t
1178 postorder (root
, fn
, extra
)
1180 reg_errcode_t (fn (void *, bin_tree_t
*));
1183 bin_tree_t
*node
, *prev
;
1185 for (node
= root
; ; )
1187 /* Descend down the tree, preferably to the left (or to the right
1188 if that's the only child). */
1189 while (node
->left
|| node
->right
)
1197 reg_errcode_t err
= fn (extra
, node
);
1198 if (BE (err
!= REG_NOERROR
, 0))
1200 if (node
->parent
== NULL
)
1203 node
= node
->parent
;
1205 /* Go up while we have a node that is reached from the right. */
1206 while (node
->right
== prev
|| node
->right
== NULL
);
1211 static reg_errcode_t
1212 preorder (root
, fn
, extra
)
1214 reg_errcode_t (fn (void *, bin_tree_t
*));
1219 for (node
= root
; ; )
1221 reg_errcode_t err
= fn (extra
, node
);
1222 if (BE (err
!= REG_NOERROR
, 0))
1225 /* Go to the left node, or up and to the right. */
1230 bin_tree_t
*prev
= NULL
;
1231 while (node
->right
== prev
|| node
->right
== NULL
)
1234 node
= node
->parent
;
1243 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1244 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1245 backreferences as well. Requires a preorder visit. */
1246 static reg_errcode_t
1247 optimize_subexps (extra
, node
)
1251 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1253 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1255 int idx
= node
->token
.opr
.idx
;
1256 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1257 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1260 else if (node
->token
.type
== SUBEXP
1261 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1263 int other_idx
= node
->left
->token
.opr
.idx
;
1265 node
->left
= node
->left
->left
;
1267 node
->left
->parent
= node
;
1269 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1270 if (other_idx
< 8 * sizeof (dfa
->used_bkref_map
))
1271 dfa
->used_bkref_map
&= ~(1 << other_idx
);
1277 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1278 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1279 static reg_errcode_t
1280 lower_subexps (extra
, node
)
1284 regex_t
*preg
= (regex_t
*) extra
;
1285 reg_errcode_t err
= REG_NOERROR
;
1287 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1289 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1291 node
->left
->parent
= node
;
1293 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1295 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1297 node
->right
->parent
= node
;
1304 lower_subexp (err
, preg
, node
)
1309 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1310 bin_tree_t
*body
= node
->left
;
1311 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1314 && (node
->token
.opr
.idx
>= 8 * sizeof (dfa
->used_bkref_map
)
1315 || !(dfa
->used_bkref_map
& (1 << node
->token
.opr
.idx
))))
1318 /* Convert the SUBEXP node to the concatenation of an
1319 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1320 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1321 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1322 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1323 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1324 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1330 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1331 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1335 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1336 nodes. Requires a postorder visit. */
1337 static reg_errcode_t
1338 calc_first (extra
, node
)
1342 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1343 if (node
->token
.type
== CONCAT
)
1345 node
->first
= node
->left
->first
;
1346 node
->node_idx
= node
->left
->node_idx
;
1351 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1352 if (BE (node
->node_idx
== -1, 0))
1358 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1359 static reg_errcode_t
1360 calc_next (extra
, node
)
1364 switch (node
->token
.type
)
1366 case OP_DUP_ASTERISK
:
1367 node
->left
->next
= node
;
1370 node
->left
->next
= node
->right
->first
;
1371 node
->right
->next
= node
->next
;
1375 node
->left
->next
= node
->next
;
1377 node
->right
->next
= node
->next
;
1383 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1384 static reg_errcode_t
1385 link_nfa_nodes (extra
, node
)
1389 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1390 int idx
= node
->node_idx
;
1391 reg_errcode_t err
= REG_NOERROR
;
1393 switch (node
->token
.type
)
1399 assert (node
->next
== NULL
);
1402 case OP_DUP_ASTERISK
:
1406 dfa
->has_plural_match
= 1;
1407 if (node
->left
!= NULL
)
1408 left
= node
->left
->first
->node_idx
;
1410 left
= node
->next
->node_idx
;
1411 if (node
->right
!= NULL
)
1412 right
= node
->right
->first
->node_idx
;
1414 right
= node
->next
->node_idx
;
1416 assert (right
> -1);
1417 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1422 case OP_OPEN_SUBEXP
:
1423 case OP_CLOSE_SUBEXP
:
1424 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1428 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1429 if (node
->token
.type
== OP_BACK_REF
)
1430 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1434 assert (!IS_EPSILON_NODE (node
->token
.type
));
1435 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1442 /* Duplicate the epsilon closure of the node ROOT_NODE.
1443 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1444 to their own constraint. */
1446 static reg_errcode_t
1447 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1450 int top_org_node
, top_clone_node
, root_node
;
1451 unsigned int init_constraint
;
1454 int org_node
, clone_node
, ret
;
1455 unsigned int constraint
= init_constraint
;
1456 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1458 int org_dest
, clone_dest
;
1459 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1461 /* If the back reference epsilon-transit, its destination must
1462 also have the constraint. Then duplicate the epsilon closure
1463 of the destination of the back reference, and store it in
1464 edests of the back reference. */
1465 org_dest
= dfa
->nexts
[org_node
];
1466 re_node_set_empty (dfa
->edests
+ clone_node
);
1467 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1468 if (BE (err
!= REG_NOERROR
, 0))
1470 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1471 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1472 if (BE (ret
< 0, 0))
1475 else if (dfa
->edests
[org_node
].nelem
== 0)
1477 /* In case of the node can't epsilon-transit, don't duplicate the
1478 destination and store the original destination as the
1479 destination of the node. */
1480 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1483 else if (dfa
->edests
[org_node
].nelem
== 1)
1485 /* In case of the node can epsilon-transit, and it has only one
1487 org_dest
= dfa
->edests
[org_node
].elems
[0];
1488 re_node_set_empty (dfa
->edests
+ clone_node
);
1489 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1491 /* In case of the node has another constraint, append it. */
1492 if (org_node
== root_node
&& clone_node
!= org_node
)
1494 /* ...but if the node is root_node itself, it means the
1495 epsilon closure have a loop, then tie it to the
1496 destination of the root_node. */
1497 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1499 if (BE (ret
< 0, 0))
1503 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1505 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1506 if (BE (err
!= REG_NOERROR
, 0))
1508 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1509 if (BE (ret
< 0, 0))
1512 else /* dfa->edests[org_node].nelem == 2 */
1514 /* In case of the node can epsilon-transit, and it has two
1515 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1516 org_dest
= dfa
->edests
[org_node
].elems
[0];
1517 re_node_set_empty (dfa
->edests
+ clone_node
);
1518 /* Search for a duplicated node which satisfies the constraint. */
1519 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1520 if (clone_dest
== -1)
1522 /* There are no such a duplicated node, create a new one. */
1523 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1524 if (BE (err
!= REG_NOERROR
, 0))
1526 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1527 if (BE (ret
< 0, 0))
1529 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1530 root_node
, constraint
);
1531 if (BE (err
!= REG_NOERROR
, 0))
1536 /* There are a duplicated node which satisfy the constraint,
1537 use it to avoid infinite loop. */
1538 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1539 if (BE (ret
< 0, 0))
1543 org_dest
= dfa
->edests
[org_node
].elems
[1];
1544 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1545 if (BE (err
!= REG_NOERROR
, 0))
1547 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1548 if (BE (ret
< 0, 0))
1551 org_node
= org_dest
;
1552 clone_node
= clone_dest
;
1557 /* Search for a node which is duplicated from the node ORG_NODE, and
1558 satisfies the constraint CONSTRAINT. */
1561 search_duplicated_node (dfa
, org_node
, constraint
)
1564 unsigned int constraint
;
1567 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1569 if (org_node
== dfa
->org_indices
[idx
]
1570 && constraint
== dfa
->nodes
[idx
].constraint
)
1571 return idx
; /* Found. */
1573 return -1; /* Not found. */
1576 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1577 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1578 otherwise return the error code. */
1580 static reg_errcode_t
1581 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1583 int *new_idx
, org_idx
;
1584 unsigned int constraint
;
1586 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1587 if (BE (dup_idx
== -1, 0))
1589 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1590 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1591 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1592 dfa
->nodes
[dup_idx
].duplicated
= 1;
1594 /* Store the index of the original node. */
1595 dfa
->org_indices
[dup_idx
] = org_idx
;
1601 calc_inveclosure (dfa
)
1605 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1607 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1609 dest
= dfa
->eclosures
[src
].elems
[idx
];
1610 re_node_set_insert_last (dfa
->inveclosures
+ dest
, src
);
1615 /* Calculate "eclosure" for all the node in DFA. */
1617 static reg_errcode_t
1621 int node_idx
, incomplete
;
1623 assert (dfa
->nodes_len
> 0);
1626 /* For each nodes, calculate epsilon closure. */
1627 for (node_idx
= 0; ; ++node_idx
)
1630 re_node_set eclosure_elem
;
1631 if (node_idx
== dfa
->nodes_len
)
1640 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1643 /* If we have already calculated, skip it. */
1644 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1646 /* Calculate epsilon closure of `node_idx'. */
1647 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1648 if (BE (err
!= REG_NOERROR
, 0))
1651 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1654 re_node_set_free (&eclosure_elem
);
1660 /* Calculate epsilon closure of NODE. */
1662 static reg_errcode_t
1663 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1664 re_node_set
*new_set
;
1669 unsigned int constraint
;
1671 re_node_set eclosure
;
1673 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1674 if (BE (err
!= REG_NOERROR
, 0))
1677 /* This indicates that we are calculating this node now.
1678 We reference this value to avoid infinite loop. */
1679 dfa
->eclosures
[node
].nelem
= -1;
1681 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1682 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1683 /* If the current node has constraints, duplicate all nodes.
1684 Since they must inherit the constraints. */
1686 && dfa
->edests
[node
].nelem
1687 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1689 int org_node
, cur_node
;
1690 org_node
= cur_node
= node
;
1691 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1692 if (BE (err
!= REG_NOERROR
, 0))
1696 /* Expand each epsilon destination nodes. */
1697 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1698 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1700 re_node_set eclosure_elem
;
1701 int edest
= dfa
->edests
[node
].elems
[i
];
1702 /* If calculating the epsilon closure of `edest' is in progress,
1703 return intermediate result. */
1704 if (dfa
->eclosures
[edest
].nelem
== -1)
1709 /* If we haven't calculated the epsilon closure of `edest' yet,
1710 calculate now. Otherwise use calculated epsilon closure. */
1711 if (dfa
->eclosures
[edest
].nelem
== 0)
1713 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1714 if (BE (err
!= REG_NOERROR
, 0))
1718 eclosure_elem
= dfa
->eclosures
[edest
];
1719 /* Merge the epsilon closure of `edest'. */
1720 re_node_set_merge (&eclosure
, &eclosure_elem
);
1721 /* If the epsilon closure of `edest' is incomplete,
1722 the epsilon closure of this node is also incomplete. */
1723 if (dfa
->eclosures
[edest
].nelem
== 0)
1726 re_node_set_free (&eclosure_elem
);
1730 /* Epsilon closures include itself. */
1731 re_node_set_insert (&eclosure
, node
);
1732 if (incomplete
&& !root
)
1733 dfa
->eclosures
[node
].nelem
= 0;
1735 dfa
->eclosures
[node
] = eclosure
;
1736 *new_set
= eclosure
;
1740 /* Functions for token which are used in the parser. */
1742 /* Fetch a token from INPUT.
1743 We must not use this function inside bracket expressions. */
1746 fetch_token (result
, input
, syntax
)
1749 reg_syntax_t syntax
;
1751 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1754 /* Peek a token from INPUT, and return the length of the token.
1755 We must not use this function inside bracket expressions. */
1758 peek_token (token
, input
, syntax
)
1761 reg_syntax_t syntax
;
1765 if (re_string_eoi (input
))
1767 token
->type
= END_OF_RE
;
1771 c
= re_string_peek_byte (input
, 0);
1774 token
->word_char
= 0;
1775 #ifdef RE_ENABLE_I18N
1776 token
->mb_partial
= 0;
1777 if (input
->mb_cur_max
> 1 &&
1778 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1780 token
->type
= CHARACTER
;
1781 token
->mb_partial
= 1;
1788 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1790 token
->type
= BACK_SLASH
;
1794 c2
= re_string_peek_byte_case (input
, 1);
1796 token
->type
= CHARACTER
;
1797 #ifdef RE_ENABLE_I18N
1798 if (input
->mb_cur_max
> 1)
1800 wint_t wc
= re_string_wchar_at (input
,
1801 re_string_cur_idx (input
) + 1);
1802 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1806 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1811 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1812 token
->type
= OP_ALT
;
1814 case '1': case '2': case '3': case '4': case '5':
1815 case '6': case '7': case '8': case '9':
1816 if (!(syntax
& RE_NO_BK_REFS
))
1818 token
->type
= OP_BACK_REF
;
1819 token
->opr
.idx
= c2
- '1';
1823 if (!(syntax
& RE_NO_GNU_OPS
))
1825 token
->type
= ANCHOR
;
1826 token
->opr
.ctx_type
= WORD_FIRST
;
1830 if (!(syntax
& RE_NO_GNU_OPS
))
1832 token
->type
= ANCHOR
;
1833 token
->opr
.ctx_type
= WORD_LAST
;
1837 if (!(syntax
& RE_NO_GNU_OPS
))
1839 token
->type
= ANCHOR
;
1840 token
->opr
.ctx_type
= WORD_DELIM
;
1844 if (!(syntax
& RE_NO_GNU_OPS
))
1846 token
->type
= ANCHOR
;
1847 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1851 if (!(syntax
& RE_NO_GNU_OPS
))
1852 token
->type
= OP_WORD
;
1855 if (!(syntax
& RE_NO_GNU_OPS
))
1856 token
->type
= OP_NOTWORD
;
1859 if (!(syntax
& RE_NO_GNU_OPS
))
1860 token
->type
= OP_SPACE
;
1863 if (!(syntax
& RE_NO_GNU_OPS
))
1864 token
->type
= OP_NOTSPACE
;
1867 if (!(syntax
& RE_NO_GNU_OPS
))
1869 token
->type
= ANCHOR
;
1870 token
->opr
.ctx_type
= BUF_FIRST
;
1874 if (!(syntax
& RE_NO_GNU_OPS
))
1876 token
->type
= ANCHOR
;
1877 token
->opr
.ctx_type
= BUF_LAST
;
1881 if (!(syntax
& RE_NO_BK_PARENS
))
1882 token
->type
= OP_OPEN_SUBEXP
;
1885 if (!(syntax
& RE_NO_BK_PARENS
))
1886 token
->type
= OP_CLOSE_SUBEXP
;
1889 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1890 token
->type
= OP_DUP_PLUS
;
1893 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1894 token
->type
= OP_DUP_QUESTION
;
1897 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1898 token
->type
= OP_OPEN_DUP_NUM
;
1901 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1902 token
->type
= OP_CLOSE_DUP_NUM
;
1910 token
->type
= CHARACTER
;
1911 #ifdef RE_ENABLE_I18N
1912 if (input
->mb_cur_max
> 1)
1914 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1915 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1919 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1924 if (syntax
& RE_NEWLINE_ALT
)
1925 token
->type
= OP_ALT
;
1928 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1929 token
->type
= OP_ALT
;
1932 token
->type
= OP_DUP_ASTERISK
;
1935 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1936 token
->type
= OP_DUP_PLUS
;
1939 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1940 token
->type
= OP_DUP_QUESTION
;
1943 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1944 token
->type
= OP_OPEN_DUP_NUM
;
1947 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1948 token
->type
= OP_CLOSE_DUP_NUM
;
1951 if (syntax
& RE_NO_BK_PARENS
)
1952 token
->type
= OP_OPEN_SUBEXP
;
1955 if (syntax
& RE_NO_BK_PARENS
)
1956 token
->type
= OP_CLOSE_SUBEXP
;
1959 token
->type
= OP_OPEN_BRACKET
;
1962 token
->type
= OP_PERIOD
;
1965 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1966 re_string_cur_idx (input
) != 0)
1968 char prev
= re_string_peek_byte (input
, -1);
1969 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1972 token
->type
= ANCHOR
;
1973 token
->opr
.ctx_type
= LINE_FIRST
;
1976 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1977 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1980 re_string_skip_bytes (input
, 1);
1981 peek_token (&next
, input
, syntax
);
1982 re_string_skip_bytes (input
, -1);
1983 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1986 token
->type
= ANCHOR
;
1987 token
->opr
.ctx_type
= LINE_LAST
;
1995 /* Peek a token from INPUT, and return the length of the token.
1996 We must not use this function out of bracket expressions. */
1999 peek_token_bracket (token
, input
, syntax
)
2002 reg_syntax_t syntax
;
2005 if (re_string_eoi (input
))
2007 token
->type
= END_OF_RE
;
2010 c
= re_string_peek_byte (input
, 0);
2013 #ifdef RE_ENABLE_I18N
2014 if (input
->mb_cur_max
> 1 &&
2015 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2017 token
->type
= CHARACTER
;
2020 #endif /* RE_ENABLE_I18N */
2022 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2023 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2025 /* In this case, '\' escape a character. */
2027 re_string_skip_bytes (input
, 1);
2028 c2
= re_string_peek_byte (input
, 0);
2030 token
->type
= CHARACTER
;
2033 if (c
== '[') /* '[' is a special char in a bracket exps. */
2037 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2038 c2
= re_string_peek_byte (input
, 1);
2046 token
->type
= OP_OPEN_COLL_ELEM
;
2049 token
->type
= OP_OPEN_EQUIV_CLASS
;
2052 if (syntax
& RE_CHAR_CLASSES
)
2054 token
->type
= OP_OPEN_CHAR_CLASS
;
2057 /* else fall through. */
2059 token
->type
= CHARACTER
;
2069 token
->type
= OP_CHARSET_RANGE
;
2072 token
->type
= OP_CLOSE_BRACKET
;
2075 token
->type
= OP_NON_MATCH_LIST
;
2078 token
->type
= CHARACTER
;
2083 /* Functions for parser. */
2085 /* Entry point of the parser.
2086 Parse the regular expression REGEXP and return the structure tree.
2087 If an error is occured, ERR is set by error code, and return NULL.
2088 This function build the following tree, from regular expression <reg_exp>:
2094 CAT means concatenation.
2095 EOR means end of regular expression. */
2098 parse (regexp
, preg
, syntax
, err
)
2099 re_string_t
*regexp
;
2101 reg_syntax_t syntax
;
2104 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2105 bin_tree_t
*tree
, *eor
, *root
;
2106 re_token_t current_token
;
2107 dfa
->syntax
= syntax
;
2108 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2109 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2110 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2112 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2114 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2117 if (BE (eor
== NULL
|| root
== NULL
, 0))
2125 /* This function build the following tree, from regular expression
2126 <branch1>|<branch2>:
2132 ALT means alternative, which represents the operator `|'. */
2135 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2136 re_string_t
*regexp
;
2139 reg_syntax_t syntax
;
2143 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2144 bin_tree_t
*tree
, *branch
= NULL
;
2145 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2146 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2149 while (token
->type
== OP_ALT
)
2151 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2152 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2153 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2155 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2156 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2161 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2162 if (BE (tree
== NULL
, 0))
2171 /* This function build the following tree, from regular expression
2178 CAT means concatenation. */
2181 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2182 re_string_t
*regexp
;
2185 reg_syntax_t syntax
;
2189 bin_tree_t
*tree
, *exp
;
2190 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2191 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2192 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2195 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2196 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2198 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2199 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2203 if (tree
!= NULL
&& exp
!= NULL
)
2205 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2212 else if (tree
== NULL
)
2214 /* Otherwise exp == NULL, we don't need to create new tree. */
2219 /* This function build the following tree, from regular expression a*:
2226 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2227 re_string_t
*regexp
;
2230 reg_syntax_t syntax
;
2234 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2236 switch (token
->type
)
2239 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2240 if (BE (tree
== NULL
, 0))
2245 #ifdef RE_ENABLE_I18N
2246 if (dfa
->mb_cur_max
> 1)
2248 while (!re_string_eoi (regexp
)
2249 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2251 bin_tree_t
*mbc_remain
;
2252 fetch_token (token
, regexp
, syntax
);
2253 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2254 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2255 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2264 case OP_OPEN_SUBEXP
:
2265 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2266 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2269 case OP_OPEN_BRACKET
:
2270 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2271 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2275 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2280 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2281 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2282 if (BE (tree
== NULL
, 0))
2288 dfa
->has_mb_node
= 1;
2290 case OP_OPEN_DUP_NUM
:
2291 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2297 case OP_DUP_ASTERISK
:
2299 case OP_DUP_QUESTION
:
2300 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2305 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2307 fetch_token (token
, regexp
, syntax
);
2308 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2310 /* else fall through */
2311 case OP_CLOSE_SUBEXP
:
2312 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2313 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2318 /* else fall through */
2319 case OP_CLOSE_DUP_NUM
:
2320 /* We treat it as a normal character. */
2322 /* Then we can these characters as normal characters. */
2323 token
->type
= CHARACTER
;
2324 /* mb_partial and word_char bits should be initialized already
2326 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2327 if (BE (tree
== NULL
, 0))
2334 if ((token
->opr
.ctx_type
2335 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2336 && dfa
->word_ops_used
== 0)
2337 init_word_char (dfa
);
2338 if (token
->opr
.ctx_type
== WORD_DELIM
2339 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2341 bin_tree_t
*tree_first
, *tree_last
;
2342 if (token
->opr
.ctx_type
== WORD_DELIM
)
2344 token
->opr
.ctx_type
= WORD_FIRST
;
2345 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2346 token
->opr
.ctx_type
= WORD_LAST
;
2350 token
->opr
.ctx_type
= INSIDE_WORD
;
2351 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2352 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2354 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2355 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2356 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2364 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2365 if (BE (tree
== NULL
, 0))
2371 /* We must return here, since ANCHORs can't be followed
2372 by repetition operators.
2373 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2374 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2375 fetch_token (token
, regexp
, syntax
);
2378 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2379 if (BE (tree
== NULL
, 0))
2384 if (dfa
->mb_cur_max
> 1)
2385 dfa
->has_mb_node
= 1;
2389 tree
= build_charclass_op (dfa
, regexp
->trans
,
2390 (const unsigned char *) "alnum",
2391 (const unsigned char *) "_",
2392 token
->type
== OP_NOTWORD
, err
);
2393 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2398 tree
= build_charclass_op (dfa
, regexp
->trans
,
2399 (const unsigned char *) "space",
2400 (const unsigned char *) "",
2401 token
->type
== OP_NOTSPACE
, err
);
2402 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2412 /* Must not happen? */
2418 fetch_token (token
, regexp
, syntax
);
2420 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2421 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2423 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2424 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2426 /* In BRE consecutive duplications are not allowed. */
2427 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2428 && (token
->type
== OP_DUP_ASTERISK
2429 || token
->type
== OP_OPEN_DUP_NUM
))
2439 /* This function build the following tree, from regular expression
2447 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2448 re_string_t
*regexp
;
2451 reg_syntax_t syntax
;
2455 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2458 cur_nsub
= preg
->re_nsub
++;
2460 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2462 /* The subexpression may be a null string. */
2463 if (token
->type
== OP_CLOSE_SUBEXP
)
2467 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2468 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2470 if (BE (*err
!= REG_NOERROR
, 0))
2473 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2475 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2476 if (BE (tree
== NULL
, 0))
2481 tree
->token
.opr
.idx
= cur_nsub
;
2485 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2488 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2490 re_string_t
*regexp
;
2493 reg_syntax_t syntax
;
2496 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2497 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2498 re_token_t start_token
= *token
;
2500 if (token
->type
== OP_OPEN_DUP_NUM
)
2503 start
= fetch_number (regexp
, token
, syntax
);
2506 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2507 start
= 0; /* We treat "{,m}" as "{0,m}". */
2510 *err
= REG_BADBR
; /* <re>{} is invalid. */
2514 if (BE (start
!= -2, 1))
2516 /* We treat "{n}" as "{n,n}". */
2517 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2518 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2519 ? fetch_number (regexp
, token
, syntax
) : -2));
2521 if (BE (start
== -2 || end
== -2, 0))
2523 /* Invalid sequence. */
2524 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2526 if (token
->type
== END_OF_RE
)
2534 /* If the syntax bit is set, rollback. */
2535 re_string_set_index (regexp
, start_idx
);
2536 *token
= start_token
;
2537 token
->type
= CHARACTER
;
2538 /* mb_partial and word_char bits should be already initialized by
2543 if (BE (end
!= -1 && start
> end
, 0))
2545 /* First number greater than second. */
2552 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2553 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2556 fetch_token (token
, regexp
, syntax
);
2558 if (BE (elem
== NULL
, 0))
2560 if (BE (start
== 0 && end
== 0, 0))
2562 postorder (elem
, free_tree
, NULL
);
2566 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2567 if (BE (start
> 0, 0))
2570 for (i
= 2; i
<= start
; ++i
)
2572 elem
= duplicate_tree (elem
, dfa
);
2573 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2574 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2575 goto parse_dup_op_espace
;
2581 /* Duplicate ELEM before it is marked optional. */
2582 elem
= duplicate_tree (elem
, dfa
);
2588 if (elem
->token
.type
== SUBEXP
)
2589 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2591 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2592 if (BE (tree
== NULL
, 0))
2593 goto parse_dup_op_espace
;
2595 /* This loop is actually executed only when end != -1,
2596 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2597 already created the start+1-th copy. */
2598 for (i
= start
+ 2; i
<= end
; ++i
)
2600 elem
= duplicate_tree (elem
, dfa
);
2601 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2602 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2603 goto parse_dup_op_espace
;
2605 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2606 if (BE (tree
== NULL
, 0))
2607 goto parse_dup_op_espace
;
2611 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2615 parse_dup_op_espace
:
2620 /* Size of the names for collating symbol/equivalence_class/character_class.
2621 I'm not sure, but maybe enough. */
2622 #define BRACKET_NAME_BUF_SIZE 32
2625 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2626 Build the range expression which starts from START_ELEM, and ends
2627 at END_ELEM. The result are written to MBCSET and SBCSET.
2628 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2629 mbcset->range_ends, is a pointer argument sinse we may
2632 static reg_errcode_t
2633 # ifdef RE_ENABLE_I18N
2634 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2635 re_charset_t
*mbcset
;
2637 # else /* not RE_ENABLE_I18N */
2638 build_range_exp (sbcset
, start_elem
, end_elem
)
2639 # endif /* not RE_ENABLE_I18N */
2640 re_bitset_ptr_t sbcset
;
2641 bracket_elem_t
*start_elem
, *end_elem
;
2643 unsigned int start_ch
, end_ch
;
2644 /* Equivalence Classes and Character Classes can't be a range start/end. */
2645 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2646 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2650 /* We can handle no multi character collating elements without libc
2652 if (BE ((start_elem
->type
== COLL_SYM
2653 && strlen ((char *) start_elem
->opr
.name
) > 1)
2654 || (end_elem
->type
== COLL_SYM
2655 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2656 return REG_ECOLLATE
;
2658 # ifdef RE_ENABLE_I18N
2660 wchar_t wc
, start_wc
, end_wc
;
2661 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2663 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2664 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2666 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2667 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2669 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2670 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2671 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2672 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2673 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2674 return REG_ECOLLATE
;
2675 cmp_buf
[0] = start_wc
;
2676 cmp_buf
[4] = end_wc
;
2677 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2680 /* Got valid collation sequence values, add them as a new entry.
2681 However, for !_LIBC we have no collation elements: if the
2682 character set is single byte, the single byte character set
2683 that we build below suffices. parse_bracket_exp passes
2684 no MBCSET if dfa->mb_cur_max == 1. */
2687 /* Check the space of the arrays. */
2688 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2690 /* There is not enough space, need realloc. */
2691 wchar_t *new_array_start
, *new_array_end
;
2694 /* +1 in case of mbcset->nranges is 0. */
2695 new_nranges
= 2 * mbcset
->nranges
+ 1;
2696 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2697 are NULL if *range_alloc == 0. */
2698 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2700 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2703 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2706 mbcset
->range_starts
= new_array_start
;
2707 mbcset
->range_ends
= new_array_end
;
2708 *range_alloc
= new_nranges
;
2711 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2712 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2715 /* Build the table for single byte characters. */
2716 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2719 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2720 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2721 bitset_set (sbcset
, wc
);
2724 # else /* not RE_ENABLE_I18N */
2727 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2728 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2730 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2731 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2733 if (start_ch
> end_ch
)
2735 /* Build the table for single byte characters. */
2736 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2737 if (start_ch
<= ch
&& ch
<= end_ch
)
2738 bitset_set (sbcset
, ch
);
2740 # endif /* not RE_ENABLE_I18N */
2743 #endif /* not _LIBC */
2746 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2747 Build the collating element which is represented by NAME.
2748 The result are written to MBCSET and SBCSET.
2749 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2750 pointer argument since we may update it. */
2752 static reg_errcode_t
2753 # ifdef RE_ENABLE_I18N
2754 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2755 re_charset_t
*mbcset
;
2756 int *coll_sym_alloc
;
2757 # else /* not RE_ENABLE_I18N */
2758 build_collating_symbol (sbcset
, name
)
2759 # endif /* not RE_ENABLE_I18N */
2760 re_bitset_ptr_t sbcset
;
2761 const unsigned char *name
;
2763 size_t name_len
= strlen ((const char *) name
);
2764 if (BE (name_len
!= 1, 0))
2765 return REG_ECOLLATE
;
2768 bitset_set (sbcset
, name
[0]);
2772 #endif /* not _LIBC */
2774 /* This function parse bracket expression like "[abc]", "[a-c]",
2778 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2779 re_string_t
*regexp
;
2782 reg_syntax_t syntax
;
2786 const unsigned char *collseqmb
;
2787 const char *collseqwc
;
2790 const int32_t *symb_table
;
2791 const unsigned char *extra
;
2793 /* Local function for parse_bracket_exp used in _LIBC environement.
2794 Seek the collating symbol entry correspondings to NAME.
2795 Return the index of the symbol in the SYMB_TABLE. */
2798 __attribute ((always_inline
))
2799 seek_collating_symbol_entry (name
, name_len
)
2800 const unsigned char *name
;
2803 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2804 int32_t elem
= hash
% table_size
;
2805 int32_t second
= hash
% (table_size
- 2);
2806 while (symb_table
[2 * elem
] != 0)
2808 /* First compare the hashing value. */
2809 if (symb_table
[2 * elem
] == hash
2810 /* Compare the length of the name. */
2811 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2812 /* Compare the name. */
2813 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2816 /* Yep, this is the entry. */
2826 /* Local function for parse_bracket_exp used in _LIBC environement.
2827 Look up the collation sequence value of BR_ELEM.
2828 Return the value if succeeded, UINT_MAX otherwise. */
2830 auto inline unsigned int
2831 __attribute ((always_inline
))
2832 lookup_collation_sequence_value (br_elem
)
2833 bracket_elem_t
*br_elem
;
2835 if (br_elem
->type
== SB_CHAR
)
2838 if (MB_CUR_MAX == 1)
2841 return collseqmb
[br_elem
->opr
.ch
];
2844 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2845 return __collseq_table_lookup (collseqwc
, wc
);
2848 else if (br_elem
->type
== MB_CHAR
)
2850 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2852 else if (br_elem
->type
== COLL_SYM
)
2854 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2858 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2860 if (symb_table
[2 * elem
] != 0)
2862 /* We found the entry. */
2863 idx
= symb_table
[2 * elem
+ 1];
2864 /* Skip the name of collating element name. */
2865 idx
+= 1 + extra
[idx
];
2866 /* Skip the byte sequence of the collating element. */
2867 idx
+= 1 + extra
[idx
];
2868 /* Adjust for the alignment. */
2869 idx
= (idx
+ 3) & ~3;
2870 /* Skip the multibyte collation sequence value. */
2871 idx
+= sizeof (unsigned int);
2872 /* Skip the wide char sequence of the collating element. */
2873 idx
+= sizeof (unsigned int) *
2874 (1 + *(unsigned int *) (extra
+ idx
));
2875 /* Return the collation sequence value. */
2876 return *(unsigned int *) (extra
+ idx
);
2878 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2880 /* No valid character. Match it as a single byte
2882 return collseqmb
[br_elem
->opr
.name
[0]];
2885 else if (sym_name_len
== 1)
2886 return collseqmb
[br_elem
->opr
.name
[0]];
2891 /* Local function for parse_bracket_exp used in _LIBC environement.
2892 Build the range expression which starts from START_ELEM, and ends
2893 at END_ELEM. The result are written to MBCSET and SBCSET.
2894 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2895 mbcset->range_ends, is a pointer argument sinse we may
2898 auto inline reg_errcode_t
2899 __attribute ((always_inline
))
2900 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2901 re_charset_t
*mbcset
;
2903 re_bitset_ptr_t sbcset
;
2904 bracket_elem_t
*start_elem
, *end_elem
;
2907 uint32_t start_collseq
;
2908 uint32_t end_collseq
;
2910 /* Equivalence Classes and Character Classes can't be a range
2912 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2913 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2917 start_collseq
= lookup_collation_sequence_value (start_elem
);
2918 end_collseq
= lookup_collation_sequence_value (end_elem
);
2919 /* Check start/end collation sequence values. */
2920 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2921 return REG_ECOLLATE
;
2922 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2925 /* Got valid collation sequence values, add them as a new entry.
2926 However, if we have no collation elements, and the character set
2927 is single byte, the single byte character set that we
2928 build below suffices. */
2929 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2931 /* Check the space of the arrays. */
2932 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2934 /* There is not enough space, need realloc. */
2935 uint32_t *new_array_start
;
2936 uint32_t *new_array_end
;
2939 /* +1 in case of mbcset->nranges is 0. */
2940 new_nranges
= 2 * mbcset
->nranges
+ 1;
2941 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2943 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2946 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2949 mbcset
->range_starts
= new_array_start
;
2950 mbcset
->range_ends
= new_array_end
;
2951 *range_alloc
= new_nranges
;
2954 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2955 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2958 /* Build the table for single byte characters. */
2959 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2961 uint32_t ch_collseq
;
2963 if (MB_CUR_MAX == 1)
2966 ch_collseq
= collseqmb
[ch
];
2968 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2969 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2970 bitset_set (sbcset
, ch
);
2975 /* Local function for parse_bracket_exp used in _LIBC environement.
2976 Build the collating element which is represented by NAME.
2977 The result are written to MBCSET and SBCSET.
2978 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2979 pointer argument sinse we may update it. */
2981 auto inline reg_errcode_t
2982 __attribute ((always_inline
))
2983 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2984 re_charset_t
*mbcset
;
2985 int *coll_sym_alloc
;
2986 re_bitset_ptr_t sbcset
;
2987 const unsigned char *name
;
2990 size_t name_len
= strlen ((const char *) name
);
2993 elem
= seek_collating_symbol_entry (name
, name_len
);
2994 if (symb_table
[2 * elem
] != 0)
2996 /* We found the entry. */
2997 idx
= symb_table
[2 * elem
+ 1];
2998 /* Skip the name of collating element name. */
2999 idx
+= 1 + extra
[idx
];
3001 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3003 /* No valid character, treat it as a normal
3005 bitset_set (sbcset
, name
[0]);
3009 return REG_ECOLLATE
;
3011 /* Got valid collation sequence, add it as a new entry. */
3012 /* Check the space of the arrays. */
3013 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3015 /* Not enough, realloc it. */
3016 /* +1 in case of mbcset->ncoll_syms is 0. */
3017 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3018 /* Use realloc since mbcset->coll_syms is NULL
3020 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3021 new_coll_sym_alloc
);
3022 if (BE (new_coll_syms
== NULL
, 0))
3024 mbcset
->coll_syms
= new_coll_syms
;
3025 *coll_sym_alloc
= new_coll_sym_alloc
;
3027 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3032 if (BE (name_len
!= 1, 0))
3033 return REG_ECOLLATE
;
3036 bitset_set (sbcset
, name
[0]);
3043 re_token_t br_token
;
3044 re_bitset_ptr_t sbcset
;
3045 #ifdef RE_ENABLE_I18N
3046 re_charset_t
*mbcset
;
3047 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3048 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3049 #endif /* not RE_ENABLE_I18N */
3051 bin_tree_t
*work_tree
;
3053 int first_round
= 1;
3055 collseqmb
= (const unsigned char *)
3056 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3057 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3063 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3064 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3065 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3066 _NL_COLLATE_SYMB_TABLEMB
);
3067 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3068 _NL_COLLATE_SYMB_EXTRAMB
);
3071 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3072 #ifdef RE_ENABLE_I18N
3073 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3074 #endif /* RE_ENABLE_I18N */
3075 #ifdef RE_ENABLE_I18N
3076 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3078 if (BE (sbcset
== NULL
, 0))
3079 #endif /* RE_ENABLE_I18N */
3085 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3086 if (BE (token
->type
== END_OF_RE
, 0))
3089 goto parse_bracket_exp_free_return
;
3091 if (token
->type
== OP_NON_MATCH_LIST
)
3093 #ifdef RE_ENABLE_I18N
3094 mbcset
->non_match
= 1;
3095 #endif /* not RE_ENABLE_I18N */
3097 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3098 bitset_set (sbcset
, '\0');
3099 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3100 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3101 if (BE (token
->type
== END_OF_RE
, 0))
3104 goto parse_bracket_exp_free_return
;
3108 /* We treat the first ']' as a normal character. */
3109 if (token
->type
== OP_CLOSE_BRACKET
)
3110 token
->type
= CHARACTER
;
3114 bracket_elem_t start_elem
, end_elem
;
3115 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3116 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3118 int token_len2
= 0, is_range_exp
= 0;
3121 start_elem
.opr
.name
= start_name_buf
;
3122 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3123 syntax
, first_round
);
3124 if (BE (ret
!= REG_NOERROR
, 0))
3127 goto parse_bracket_exp_free_return
;
3131 /* Get information about the next token. We need it in any case. */
3132 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3134 /* Do not check for ranges if we know they are not allowed. */
3135 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3137 if (BE (token
->type
== END_OF_RE
, 0))
3140 goto parse_bracket_exp_free_return
;
3142 if (token
->type
== OP_CHARSET_RANGE
)
3144 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3145 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3146 if (BE (token2
.type
== END_OF_RE
, 0))
3149 goto parse_bracket_exp_free_return
;
3151 if (token2
.type
== OP_CLOSE_BRACKET
)
3153 /* We treat the last '-' as a normal character. */
3154 re_string_skip_bytes (regexp
, -token_len
);
3155 token
->type
= CHARACTER
;
3162 if (is_range_exp
== 1)
3164 end_elem
.opr
.name
= end_name_buf
;
3165 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3167 if (BE (ret
!= REG_NOERROR
, 0))
3170 goto parse_bracket_exp_free_return
;
3173 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3176 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3177 &start_elem
, &end_elem
);
3179 # ifdef RE_ENABLE_I18N
3180 *err
= build_range_exp (sbcset
,
3181 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3182 &range_alloc
, &start_elem
, &end_elem
);
3184 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3186 #endif /* RE_ENABLE_I18N */
3187 if (BE (*err
!= REG_NOERROR
, 0))
3188 goto parse_bracket_exp_free_return
;
3192 switch (start_elem
.type
)
3195 bitset_set (sbcset
, start_elem
.opr
.ch
);
3197 #ifdef RE_ENABLE_I18N
3199 /* Check whether the array has enough space. */
3200 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3202 wchar_t *new_mbchars
;
3203 /* Not enough, realloc it. */
3204 /* +1 in case of mbcset->nmbchars is 0. */
3205 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3206 /* Use realloc since array is NULL if *alloc == 0. */
3207 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3209 if (BE (new_mbchars
== NULL
, 0))
3210 goto parse_bracket_exp_espace
;
3211 mbcset
->mbchars
= new_mbchars
;
3213 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3215 #endif /* RE_ENABLE_I18N */
3217 *err
= build_equiv_class (sbcset
,
3218 #ifdef RE_ENABLE_I18N
3219 mbcset
, &equiv_class_alloc
,
3220 #endif /* RE_ENABLE_I18N */
3221 start_elem
.opr
.name
);
3222 if (BE (*err
!= REG_NOERROR
, 0))
3223 goto parse_bracket_exp_free_return
;
3226 *err
= build_collating_symbol (sbcset
,
3227 #ifdef RE_ENABLE_I18N
3228 mbcset
, &coll_sym_alloc
,
3229 #endif /* RE_ENABLE_I18N */
3230 start_elem
.opr
.name
);
3231 if (BE (*err
!= REG_NOERROR
, 0))
3232 goto parse_bracket_exp_free_return
;
3235 *err
= build_charclass (regexp
->trans
, sbcset
,
3236 #ifdef RE_ENABLE_I18N
3237 mbcset
, &char_class_alloc
,
3238 #endif /* RE_ENABLE_I18N */
3239 start_elem
.opr
.name
, syntax
);
3240 if (BE (*err
!= REG_NOERROR
, 0))
3241 goto parse_bracket_exp_free_return
;
3248 if (BE (token
->type
== END_OF_RE
, 0))
3251 goto parse_bracket_exp_free_return
;
3253 if (token
->type
== OP_CLOSE_BRACKET
)
3257 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3259 /* If it is non-matching list. */
3261 bitset_not (sbcset
);
3263 #ifdef RE_ENABLE_I18N
3264 /* Ensure only single byte characters are set. */
3265 if (dfa
->mb_cur_max
> 1)
3266 bitset_mask (sbcset
, dfa
->sb_char
);
3268 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3269 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3270 || mbcset
->non_match
)))
3272 bin_tree_t
*mbc_tree
;
3274 /* Build a tree for complex bracket. */
3275 dfa
->has_mb_node
= 1;
3276 br_token
.type
= COMPLEX_BRACKET
;
3277 br_token
.opr
.mbcset
= mbcset
;
3278 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3279 if (BE (mbc_tree
== NULL
, 0))
3280 goto parse_bracket_exp_espace
;
3281 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3282 if (sbcset
[sbc_idx
])
3284 /* If there are no bits set in sbcset, there is no point
3285 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3286 if (sbc_idx
< BITSET_UINTS
)
3288 /* Build a tree for simple bracket. */
3289 br_token
.type
= SIMPLE_BRACKET
;
3290 br_token
.opr
.sbcset
= sbcset
;
3291 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3292 if (BE (work_tree
== NULL
, 0))
3293 goto parse_bracket_exp_espace
;
3295 /* Then join them by ALT node. */
3296 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3297 if (BE (work_tree
== NULL
, 0))
3298 goto parse_bracket_exp_espace
;
3303 work_tree
= mbc_tree
;
3308 /* Build a tree for simple bracket. */
3309 br_token
.type
= SIMPLE_BRACKET
;
3310 br_token
.opr
.sbcset
= sbcset
;
3311 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3312 if (BE (work_tree
== NULL
, 0))
3313 goto parse_bracket_exp_espace
;
3315 free_charset (mbcset
);
3317 #endif /* not RE_ENABLE_I18N */
3320 parse_bracket_exp_espace
:
3322 parse_bracket_exp_free_return
:
3324 #ifdef RE_ENABLE_I18N
3325 free_charset (mbcset
);
3326 #endif /* RE_ENABLE_I18N */
3330 /* Parse an element in the bracket expression. */
3332 static reg_errcode_t
3333 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3335 bracket_elem_t
*elem
;
3336 re_string_t
*regexp
;
3340 reg_syntax_t syntax
;
3343 #ifdef RE_ENABLE_I18N
3345 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3346 if (cur_char_size
> 1)
3348 elem
->type
= MB_CHAR
;
3349 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3350 re_string_skip_bytes (regexp
, cur_char_size
);
3353 #endif /* RE_ENABLE_I18N */
3354 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3355 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3356 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3357 return parse_bracket_symbol (elem
, regexp
, token
);
3358 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3360 /* A '-' must only appear as anything but a range indicator before
3361 the closing bracket. Everything else is an error. */
3363 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3364 if (token2
.type
!= OP_CLOSE_BRACKET
)
3365 /* The actual error value is not standardized since this whole
3366 case is undefined. But ERANGE makes good sense. */
3369 elem
->type
= SB_CHAR
;
3370 elem
->opr
.ch
= token
->opr
.c
;
3374 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3375 such as [:<character_class>:], [.<collating_element>.], and
3376 [=<equivalent_class>=]. */
3378 static reg_errcode_t
3379 parse_bracket_symbol (elem
, regexp
, token
)
3380 bracket_elem_t
*elem
;
3381 re_string_t
*regexp
;
3384 unsigned char ch
, delim
= token
->opr
.c
;
3386 if (re_string_eoi(regexp
))
3390 if (i
>= BRACKET_NAME_BUF_SIZE
)
3392 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3393 ch
= re_string_fetch_byte_case (regexp
);
3395 ch
= re_string_fetch_byte (regexp
);
3396 if (re_string_eoi(regexp
))
3398 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3400 elem
->opr
.name
[i
] = ch
;
3402 re_string_skip_bytes (regexp
, 1);
3403 elem
->opr
.name
[i
] = '\0';
3404 switch (token
->type
)
3406 case OP_OPEN_COLL_ELEM
:
3407 elem
->type
= COLL_SYM
;
3409 case OP_OPEN_EQUIV_CLASS
:
3410 elem
->type
= EQUIV_CLASS
;
3412 case OP_OPEN_CHAR_CLASS
:
3413 elem
->type
= CHAR_CLASS
;
3421 /* Helper function for parse_bracket_exp.
3422 Build the equivalence class which is represented by NAME.
3423 The result are written to MBCSET and SBCSET.
3424 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3425 is a pointer argument sinse we may update it. */
3427 static reg_errcode_t
3428 #ifdef RE_ENABLE_I18N
3429 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3430 re_charset_t
*mbcset
;
3431 int *equiv_class_alloc
;
3432 #else /* not RE_ENABLE_I18N */
3433 build_equiv_class (sbcset
, name
)
3434 #endif /* not RE_ENABLE_I18N */
3435 re_bitset_ptr_t sbcset
;
3436 const unsigned char *name
;
3439 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3442 const int32_t *table
, *indirect
;
3443 const unsigned char *weights
, *extra
, *cp
;
3444 unsigned char char_buf
[2];
3448 /* This #include defines a local function! */
3449 # include <locale/weight.h>
3450 /* Calculate the index for equivalence class. */
3452 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3453 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3454 _NL_COLLATE_WEIGHTMB
);
3455 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3456 _NL_COLLATE_EXTRAMB
);
3457 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3458 _NL_COLLATE_INDIRECTMB
);
3459 idx1
= findidx (&cp
);
3460 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3461 /* This isn't a valid character. */
3462 return REG_ECOLLATE
;
3464 /* Build single byte matcing table for this equivalence class. */
3465 char_buf
[1] = (unsigned char) '\0';
3466 len
= weights
[idx1
];
3467 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3471 idx2
= findidx (&cp
);
3476 /* This isn't a valid character. */
3478 if (len
== weights
[idx2
])
3481 while (cnt
<= len
&&
3482 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3486 bitset_set (sbcset
, ch
);
3489 /* Check whether the array has enough space. */
3490 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3492 /* Not enough, realloc it. */
3493 /* +1 in case of mbcset->nequiv_classes is 0. */
3494 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3495 /* Use realloc since the array is NULL if *alloc == 0. */
3496 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3498 new_equiv_class_alloc
);
3499 if (BE (new_equiv_classes
== NULL
, 0))
3501 mbcset
->equiv_classes
= new_equiv_classes
;
3502 *equiv_class_alloc
= new_equiv_class_alloc
;
3504 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3509 if (BE (strlen ((const char *) name
) != 1, 0))
3510 return REG_ECOLLATE
;
3511 bitset_set (sbcset
, *name
);
3516 /* Helper function for parse_bracket_exp.
3517 Build the character class which is represented by NAME.
3518 The result are written to MBCSET and SBCSET.
3519 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3520 is a pointer argument sinse we may update it. */
3522 static reg_errcode_t
3523 #ifdef RE_ENABLE_I18N
3524 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3525 re_charset_t
*mbcset
;
3526 int *char_class_alloc
;
3527 #else /* not RE_ENABLE_I18N */
3528 build_charclass (trans
, sbcset
, class_name
, syntax
)
3529 #endif /* not RE_ENABLE_I18N */
3530 unsigned RE_TRANSLATE_TYPE trans
;
3531 re_bitset_ptr_t sbcset
;
3532 const unsigned char *class_name
;
3533 reg_syntax_t syntax
;
3536 const char *name
= (const char *) class_name
;
3538 /* In case of REG_ICASE "upper" and "lower" match the both of
3539 upper and lower cases. */
3540 if ((syntax
& RE_ICASE
)
3541 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3544 #ifdef RE_ENABLE_I18N
3545 /* Check the space of the arrays. */
3546 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3548 /* Not enough, realloc it. */
3549 /* +1 in case of mbcset->nchar_classes is 0. */
3550 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3551 /* Use realloc since array is NULL if *alloc == 0. */
3552 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3553 new_char_class_alloc
);
3554 if (BE (new_char_classes
== NULL
, 0))
3556 mbcset
->char_classes
= new_char_classes
;
3557 *char_class_alloc
= new_char_class_alloc
;
3559 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3560 #endif /* RE_ENABLE_I18N */
3562 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3563 for (i = 0; i < SBC_MAX; ++i) \
3565 if (ctype_func (i)) \
3567 int ch = trans ? trans[i] : i; \
3568 bitset_set (sbcset, ch); \
3572 if (strcmp (name
, "alnum") == 0)
3573 BUILD_CHARCLASS_LOOP (isalnum
)
3574 else if (strcmp (name
, "cntrl") == 0)
3575 BUILD_CHARCLASS_LOOP (iscntrl
)
3576 else if (strcmp (name
, "lower") == 0)
3577 BUILD_CHARCLASS_LOOP (islower
)
3578 else if (strcmp (name
, "space") == 0)
3579 BUILD_CHARCLASS_LOOP (isspace
)
3580 else if (strcmp (name
, "alpha") == 0)
3581 BUILD_CHARCLASS_LOOP (isalpha
)
3582 else if (strcmp (name
, "digit") == 0)
3583 BUILD_CHARCLASS_LOOP (isdigit
)
3584 else if (strcmp (name
, "print") == 0)
3585 BUILD_CHARCLASS_LOOP (isprint
)
3586 else if (strcmp (name
, "upper") == 0)
3587 BUILD_CHARCLASS_LOOP (isupper
)
3588 else if (strcmp (name
, "blank") == 0)
3589 BUILD_CHARCLASS_LOOP (isblank
)
3590 else if (strcmp (name
, "graph") == 0)
3591 BUILD_CHARCLASS_LOOP (isgraph
)
3592 else if (strcmp (name
, "punct") == 0)
3593 BUILD_CHARCLASS_LOOP (ispunct
)
3594 else if (strcmp (name
, "xdigit") == 0)
3595 BUILD_CHARCLASS_LOOP (isxdigit
)
3603 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3605 unsigned RE_TRANSLATE_TYPE trans
;
3606 const unsigned char *class_name
;
3607 const unsigned char *extra
;
3611 re_bitset_ptr_t sbcset
;
3612 #ifdef RE_ENABLE_I18N
3613 re_charset_t
*mbcset
;
3615 #endif /* not RE_ENABLE_I18N */
3617 re_token_t br_token
;
3620 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3621 #ifdef RE_ENABLE_I18N
3622 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3623 #endif /* RE_ENABLE_I18N */
3625 #ifdef RE_ENABLE_I18N
3626 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3627 #else /* not RE_ENABLE_I18N */
3628 if (BE (sbcset
== NULL
, 0))
3629 #endif /* not RE_ENABLE_I18N */
3637 #ifdef RE_ENABLE_I18N
3639 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3640 bitset_set(cset->sbcset, '\0');
3642 mbcset
->non_match
= 1;
3643 #endif /* not RE_ENABLE_I18N */
3646 /* We don't care the syntax in this case. */
3647 ret
= build_charclass (trans
, sbcset
,
3648 #ifdef RE_ENABLE_I18N
3650 #endif /* RE_ENABLE_I18N */
3653 if (BE (ret
!= REG_NOERROR
, 0))
3656 #ifdef RE_ENABLE_I18N
3657 free_charset (mbcset
);
3658 #endif /* RE_ENABLE_I18N */
3662 /* \w match '_' also. */
3663 for (; *extra
; extra
++)
3664 bitset_set (sbcset
, *extra
);
3666 /* If it is non-matching list. */
3668 bitset_not (sbcset
);
3670 #ifdef RE_ENABLE_I18N
3671 /* Ensure only single byte characters are set. */
3672 if (dfa
->mb_cur_max
> 1)
3673 bitset_mask (sbcset
, dfa
->sb_char
);
3676 /* Build a tree for simple bracket. */
3677 br_token
.type
= SIMPLE_BRACKET
;
3678 br_token
.opr
.sbcset
= sbcset
;
3679 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3680 if (BE (tree
== NULL
, 0))
3681 goto build_word_op_espace
;
3683 #ifdef RE_ENABLE_I18N
3684 if (dfa
->mb_cur_max
> 1)
3686 bin_tree_t
*mbc_tree
;
3687 /* Build a tree for complex bracket. */
3688 br_token
.type
= COMPLEX_BRACKET
;
3689 br_token
.opr
.mbcset
= mbcset
;
3690 dfa
->has_mb_node
= 1;
3691 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3692 if (BE (mbc_tree
== NULL
, 0))
3693 goto build_word_op_espace
;
3694 /* Then join them by ALT node. */
3695 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3696 if (BE (mbc_tree
!= NULL
, 1))
3701 free_charset (mbcset
);
3704 #else /* not RE_ENABLE_I18N */
3706 #endif /* not RE_ENABLE_I18N */
3708 build_word_op_espace
:
3710 #ifdef RE_ENABLE_I18N
3711 free_charset (mbcset
);
3712 #endif /* RE_ENABLE_I18N */
3717 /* This is intended for the expressions like "a{1,3}".
3718 Fetch a number from `input', and return the number.
3719 Return -1, if the number field is empty like "{,1}".
3720 Return -2, If an error is occured. */
3723 fetch_number (input
, token
, syntax
)
3726 reg_syntax_t syntax
;
3732 fetch_token (token
, input
, syntax
);
3734 if (BE (token
->type
== END_OF_RE
, 0))
3736 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3738 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3739 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3740 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3745 #ifdef RE_ENABLE_I18N
3747 free_charset (re_charset_t
*cset
)
3749 re_free (cset
->mbchars
);
3751 re_free (cset
->coll_syms
);
3752 re_free (cset
->equiv_classes
);
3753 re_free (cset
->range_starts
);
3754 re_free (cset
->range_ends
);
3756 re_free (cset
->char_classes
);
3759 #endif /* RE_ENABLE_I18N */
3761 /* Functions for binary tree operation. */
3763 /* Create a tree node. */
3766 create_tree (dfa
, left
, right
, type
)
3770 re_token_type_t type
;
3774 return create_token_tree (dfa
, left
, right
, &t
);
3778 create_token_tree (dfa
, left
, right
, token
)
3782 const re_token_t
*token
;
3785 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3787 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3789 if (storage
== NULL
)
3791 storage
->next
= dfa
->str_tree_storage
;
3792 dfa
->str_tree_storage
= storage
;
3793 dfa
->str_tree_storage_idx
= 0;
3795 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3797 tree
->parent
= NULL
;
3799 tree
->right
= right
;
3800 tree
->token
= *token
;
3801 tree
->token
.duplicated
= 0;
3802 tree
->token
.opt_subexp
= 0;
3805 tree
->node_idx
= -1;
3808 left
->parent
= tree
;
3810 right
->parent
= tree
;
3814 /* Mark the tree SRC as an optional subexpression.
3815 To be called from preorder or postorder. */
3817 static reg_errcode_t
3818 mark_opt_subexp (extra
, node
)
3822 int idx
= (int) (long) extra
;
3823 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3824 node
->token
.opt_subexp
= 1;
3829 /* Free the allocated memory inside NODE. */
3832 free_token (re_token_t
*node
)
3834 #ifdef RE_ENABLE_I18N
3835 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3836 free_charset (node
->opr
.mbcset
);
3838 #endif /* RE_ENABLE_I18N */
3839 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3840 re_free (node
->opr
.sbcset
);
3843 /* Worker function for tree walking. Free the allocated memory inside NODE
3844 and its children. */
3846 static reg_errcode_t
3847 free_tree (void *extra
, bin_tree_t
*node
)
3849 free_token (&node
->token
);
3854 /* Duplicate the node SRC, and return new node. This is a preorder
3855 visit similar to the one implemented by the generic visitor, but
3856 we need more infrastructure to maintain two parallel trees --- so,
3857 it's easier to duplicate. */
3860 duplicate_tree (root
, dfa
)
3861 const bin_tree_t
*root
;
3864 const bin_tree_t
*node
;
3865 bin_tree_t
*dup_root
;
3866 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3868 for (node
= root
; ; )
3870 /* Create a new tree and link it back to the current parent. */
3871 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3874 (*p_new
)->parent
= dup_node
;
3875 (*p_new
)->token
.duplicated
= 1;
3878 /* Go to the left node, or up and to the right. */
3882 p_new
= &dup_node
->left
;
3886 const bin_tree_t
*prev
= NULL
;
3887 while (node
->right
== prev
|| node
->right
== NULL
)
3890 node
= node
->parent
;
3891 dup_node
= dup_node
->parent
;
3896 p_new
= &dup_node
->right
;