1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 size_t length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
27 static void init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (regex_t
*preg
);
37 static reg_errcode_t
preorder (bin_tree_t
*root
,
38 reg_errcode_t (fn (void *, bin_tree_t
*)),
40 static reg_errcode_t
postorder (bin_tree_t
*root
,
41 reg_errcode_t (fn (void *, bin_tree_t
*)),
43 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
44 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
45 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
47 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
49 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
50 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
51 int top_clone_node
, int root_node
,
52 unsigned int constraint
);
53 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
54 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
55 unsigned int constraint
);
56 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
57 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
59 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
60 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
62 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
64 static int peek_token (re_token_t
*token
, re_string_t
*input
,
66 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
68 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
69 reg_syntax_t syntax
, reg_errcode_t
*err
);
70 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
71 re_token_t
*token
, reg_syntax_t syntax
,
72 int nest
, reg_errcode_t
*err
);
73 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
74 re_token_t
*token
, reg_syntax_t syntax
,
75 int nest
, reg_errcode_t
*err
);
76 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
77 re_token_t
*token
, reg_syntax_t syntax
,
78 int nest
, reg_errcode_t
*err
);
79 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
80 re_token_t
*token
, reg_syntax_t syntax
,
81 int nest
, reg_errcode_t
*err
);
82 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
83 re_dfa_t
*dfa
, re_token_t
*token
,
84 reg_syntax_t syntax
, reg_errcode_t
*err
);
85 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
86 re_token_t
*token
, reg_syntax_t syntax
,
88 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
90 re_token_t
*token
, int token_len
,
94 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
100 re_charset_t
*mbcset
, int *range_alloc
,
101 bracket_elem_t
*start_elem
,
102 bracket_elem_t
*end_elem
);
103 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
104 re_charset_t
*mbcset
,
106 const unsigned char *name
);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
109 bracket_elem_t
*start_elem
,
110 bracket_elem_t
*end_elem
);
111 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
112 const unsigned char *name
);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
117 re_charset_t
*mbcset
,
118 int *equiv_class_alloc
,
119 const unsigned char *name
);
120 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
121 re_bitset_ptr_t sbcset
,
122 re_charset_t
*mbcset
,
123 int *char_class_alloc
,
124 const unsigned char *class_name
,
125 reg_syntax_t syntax
);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
128 const unsigned char *name
);
129 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
130 re_bitset_ptr_t sbcset
,
131 const unsigned char *class_name
,
132 reg_syntax_t syntax
);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
135 RE_TRANSLATE_TYPE trans
,
136 const unsigned char *class_name
,
137 const unsigned char *extra
,
138 int non_match
, reg_errcode_t
*err
);
139 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
140 bin_tree_t
*left
, bin_tree_t
*right
,
141 re_token_type_t type
);
142 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
143 bin_tree_t
*left
, bin_tree_t
*right
,
144 const re_token_t
*token
);
145 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
146 static void free_token (re_token_t
*node
);
147 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
148 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
155 const char __re_error_msgid
[] attribute_hidden
=
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 const size_t __re_error_msgid_idx
[] attribute_hidden
=
230 /* Entry points for GNU code. */
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
240 re_compile_pattern (pattern
, length
, bufp
)
243 struct re_pattern_buffer
*bufp
;
247 /* And GNU code determines whether or not to get register information
248 by passing null for the REGS argument to re_match, etc., not by
249 setting no_sub, unless RE_NO_SUB is set. */
250 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
252 /* Match anchors at newline. */
253 bufp
->newline_anchor
= 1;
255 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
259 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
262 weak_alias (__re_compile_pattern
, re_compile_pattern
)
265 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
266 also be assigned to arbitrarily: each pattern buffer stores its own
267 syntax, so it can be changed between regex compilations. */
268 /* This has no initializer because initialized variables in Emacs
269 become read-only after dumping. */
270 reg_syntax_t re_syntax_options
;
273 /* Specify the precise syntax of regexps for compilation. This provides
274 for compatibility for various utilities which historically have
275 different, incompatible syntaxes.
277 The argument SYNTAX is a bit mask comprised of the various bits
278 defined in regex.h. We return the old syntax. */
281 re_set_syntax (syntax
)
284 reg_syntax_t ret
= re_syntax_options
;
286 re_syntax_options
= syntax
;
290 weak_alias (__re_set_syntax
, re_set_syntax
)
294 re_compile_fastmap (bufp
)
295 struct re_pattern_buffer
*bufp
;
297 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
298 char *fastmap
= bufp
->fastmap
;
300 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
301 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
302 if (dfa
->init_state
!= dfa
->init_state_word
)
303 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
304 if (dfa
->init_state
!= dfa
->init_state_nl
)
305 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
306 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
307 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
308 bufp
->fastmap_accurate
= 1;
312 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
316 __attribute ((always_inline
))
317 re_set_fastmap (char *fastmap
, int icase
, int ch
)
321 fastmap
[tolower (ch
)] = 1;
324 /* Helper function for re_compile_fastmap.
325 Compile fastmap for the initial_state INIT_STATE. */
328 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
330 const re_dfastate_t
*init_state
;
333 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
335 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
336 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
338 int node
= init_state
->nodes
.elems
[node_cnt
];
339 re_token_type_t type
= dfa
->nodes
[node
].type
;
341 if (type
== CHARACTER
)
343 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
344 #ifdef RE_ENABLE_I18N
345 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
347 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
352 *p
++ = dfa
->nodes
[node
].opr
.c
;
353 while (++node
< dfa
->nodes_len
354 && dfa
->nodes
[node
].type
== CHARACTER
355 && dfa
->nodes
[node
].mb_partial
)
356 *p
++ = dfa
->nodes
[node
].opr
.c
;
357 memset (&state
, 0, sizeof (state
));
358 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
360 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
362 re_set_fastmap (fastmap
, 0, buf
[0]);
366 else if (type
== SIMPLE_BRACKET
)
369 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
370 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
371 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1u << j
))
372 re_set_fastmap (fastmap
, icase
, ch
);
374 #ifdef RE_ENABLE_I18N
375 else if (type
== COMPLEX_BRACKET
)
378 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
379 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
380 || cset
->nranges
|| cset
->nchar_classes
)
383 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
385 /* In this case we want to catch the bytes which are
386 the first byte of any collation elements.
387 e.g. In da_DK, we want to catch 'a' since "aa"
388 is a valid collation element, and don't catch
389 'b' since 'b' is the only collation element
390 which starts from 'b'. */
392 const int32_t *table
= (const int32_t *)
393 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
394 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
395 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
397 re_set_fastmap (fastmap
, icase
, ch
);
400 if (dfa
->mb_cur_max
> 1)
401 for (i
= 0; i
< SBC_MAX
; ++i
)
402 if (__btowc (i
) == WEOF
)
403 re_set_fastmap (fastmap
, icase
, i
);
404 # endif /* not _LIBC */
406 for (i
= 0; i
< cset
->nmbchars
; ++i
)
410 memset (&state
, '\0', sizeof (state
));
411 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
412 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
413 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
415 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
417 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
421 #endif /* RE_ENABLE_I18N */
422 else if (type
== OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type
== OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type
== END_OF_RE
)
428 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
429 if (type
== END_OF_RE
)
430 bufp
->can_be_null
= 1;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg
, pattern
, cflags
)
474 regex_t
*__restrict preg
;
475 const char *__restrict pattern
;
479 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC
);
486 /* Try to allocate space for the fastmap. */
487 preg
->fastmap
= re_malloc (char, SBC_MAX
);
488 if (BE (preg
->fastmap
== NULL
, 0))
491 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags
& REG_NEWLINE
)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax
&= ~RE_DOT_NEWLINE
;
497 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
498 /* It also changes the matching behavior. */
499 preg
->newline_anchor
= 1;
502 preg
->newline_anchor
= 0;
503 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
504 preg
->translate
= NULL
;
506 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret
== REG_ERPAREN
)
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret
== REG_NOERROR
, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg
);
520 /* Some error occurred while compiling the expression. */
521 re_free (preg
->fastmap
);
522 preg
->fastmap
= NULL
;
528 weak_alias (__regcomp
, regcomp
)
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode
, preg
, errbuf
, errbuf_size
)
537 const regex_t
*__restrict preg
;
538 char *__restrict errbuf
;
545 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
546 / sizeof (__re_error_msgid_idx
[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
553 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
555 msg_size
= strlen (msg
) + 1; /* Includes the null. */
557 if (BE (errbuf_size
!= 0, 1))
559 if (BE (msg_size
> errbuf_size
, 0))
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
564 memcpy (errbuf
, msg
, errbuf_size
- 1);
565 errbuf
[errbuf_size
- 1] = 0;
569 memcpy (errbuf
, msg
, msg_size
);
575 weak_alias (__regerror
, regerror
)
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset utf8_sb_map
=
586 /* Set the first 128 bits. */
587 # if UINT_MAX == 0xffffffff
588 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
590 # error "Add case for new unsigned int size"
597 free_dfa_content (re_dfa_t
*dfa
)
602 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
603 free_token (dfa
->nodes
+ i
);
604 re_free (dfa
->nexts
);
605 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
607 if (dfa
->eclosures
!= NULL
)
608 re_node_set_free (dfa
->eclosures
+ i
);
609 if (dfa
->inveclosures
!= NULL
)
610 re_node_set_free (dfa
->inveclosures
+ i
);
611 if (dfa
->edests
!= NULL
)
612 re_node_set_free (dfa
->edests
+ i
);
614 re_free (dfa
->edests
);
615 re_free (dfa
->eclosures
);
616 re_free (dfa
->inveclosures
);
617 re_free (dfa
->nodes
);
619 if (dfa
->state_table
)
620 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
622 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
623 for (j
= 0; j
< entry
->num
; ++j
)
625 re_dfastate_t
*state
= entry
->array
[j
];
628 re_free (entry
->array
);
630 re_free (dfa
->state_table
);
631 #ifdef RE_ENABLE_I18N
632 if (dfa
->sb_char
!= utf8_sb_map
)
633 re_free (dfa
->sb_char
);
635 re_free (dfa
->subexp_map
);
637 re_free (dfa
->re_str
);
644 /* Free dynamically allocated space used by PREG. */
650 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
651 if (BE (dfa
!= NULL
, 1))
652 free_dfa_content (dfa
);
656 re_free (preg
->fastmap
);
657 preg
->fastmap
= NULL
;
659 re_free (preg
->translate
);
660 preg
->translate
= NULL
;
663 weak_alias (__regfree
, regfree
)
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf
;
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
689 if (!re_comp_buf
.buffer
)
690 return gettext ("No previous regular expression");
694 if (re_comp_buf
.buffer
)
696 fastmap
= re_comp_buf
.fastmap
;
697 re_comp_buf
.fastmap
= NULL
;
698 __regfree (&re_comp_buf
);
699 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
700 re_comp_buf
.fastmap
= fastmap
;
703 if (re_comp_buf
.fastmap
== NULL
)
705 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
706 if (re_comp_buf
.fastmap
== NULL
)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf
.newline_anchor
= 1;
717 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
727 libc_freeres_fn (free_mem
)
729 __regfree (&re_comp_buf
);
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
740 re_compile_internal (preg
, pattern
, length
, syntax
)
742 const char * pattern
;
746 reg_errcode_t err
= REG_NOERROR
;
750 /* Initialize the pattern buffer. */
751 preg
->fastmap_accurate
= 0;
752 preg
->syntax
= syntax
;
753 preg
->not_bol
= preg
->not_eol
= 0;
756 preg
->can_be_null
= 0;
757 preg
->regs_allocated
= REGS_UNALLOCATED
;
759 /* Initialize the dfa. */
760 dfa
= (re_dfa_t
*) preg
->buffer
;
761 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
770 preg
->allocated
= sizeof (re_dfa_t
);
771 preg
->buffer
= (unsigned char *) dfa
;
773 preg
->used
= sizeof (re_dfa_t
);
775 err
= init_dfa (dfa
, length
);
776 if (BE (err
!= REG_NOERROR
, 0))
778 free_dfa_content (dfa
);
784 /* Note: length+1 will not overflow since it is checked in init_dfa. */
785 dfa
->re_str
= re_malloc (char, length
+ 1);
786 strncpy (dfa
->re_str
, pattern
, length
+ 1);
789 __libc_lock_init (dfa
->lock
);
791 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
792 syntax
& RE_ICASE
, dfa
);
793 if (BE (err
!= REG_NOERROR
, 0))
795 re_compile_internal_free_return
:
796 free_workarea_compile (preg
);
797 re_string_destruct (®exp
);
798 free_dfa_content (dfa
);
804 /* Parse the regular expression, and build a structure tree. */
806 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
807 if (BE (dfa
->str_tree
== NULL
, 0))
808 goto re_compile_internal_free_return
;
810 /* Analyze the tree and create the nfa. */
811 err
= analyze (preg
);
812 if (BE (err
!= REG_NOERROR
, 0))
813 goto re_compile_internal_free_return
;
815 #ifdef RE_ENABLE_I18N
816 /* If possible, do searching in single byte encoding to speed things up. */
817 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
821 /* Then create the initial state of the dfa. */
822 err
= create_initial_state (dfa
);
824 /* Release work areas. */
825 free_workarea_compile (preg
);
826 re_string_destruct (®exp
);
828 if (BE (err
!= REG_NOERROR
, 0))
830 free_dfa_content (dfa
);
838 /* Initialize DFA. We use the length of the regular expression PAT_LEN
839 as the initial length of some arrays. */
842 init_dfa (dfa
, pat_len
)
846 unsigned int table_size
;
851 memset (dfa
, '\0', sizeof (re_dfa_t
));
853 /* Force allocation of str_tree_storage the first time. */
854 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
856 /* Avoid overflows. */
857 if (pat_len
== SIZE_MAX
)
860 dfa
->nodes_alloc
= pat_len
+ 1;
861 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
863 /* table_size = 2 ^ ceil(log pat_len) */
864 for (table_size
= 1; ; table_size
<<= 1)
865 if (table_size
> pat_len
)
868 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
869 dfa
->state_hash_mask
= table_size
- 1;
871 dfa
->mb_cur_max
= MB_CUR_MAX
;
873 if (dfa
->mb_cur_max
== 6
874 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
876 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
879 # ifdef HAVE_LANGINFO_CODESET
880 codeset_name
= nl_langinfo (CODESET
);
882 codeset_name
= getenv ("LC_ALL");
883 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
884 codeset_name
= getenv ("LC_CTYPE");
885 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
886 codeset_name
= getenv ("LANG");
887 if (codeset_name
== NULL
)
889 else if (strchr (codeset_name
, '.') != NULL
)
890 codeset_name
= strchr (codeset_name
, '.') + 1;
893 if (strcasecmp (codeset_name
, "UTF-8") == 0
894 || strcasecmp (codeset_name
, "UTF8") == 0)
897 /* We check exhaustively in the loop below if this charset is a
898 superset of ASCII. */
899 dfa
->map_notascii
= 0;
902 #ifdef RE_ENABLE_I18N
903 if (dfa
->mb_cur_max
> 1)
906 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
911 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
912 if (BE (dfa
->sb_char
== NULL
, 0))
915 /* Clear all bits by, then set those corresponding to single
917 bitset_empty (dfa
->sb_char
);
919 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
920 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
922 wint_t wch
= __btowc (ch
);
924 dfa
->sb_char
[i
] |= 1u << j
;
926 if (isascii (ch
) && wch
!= ch
)
927 dfa
->map_notascii
= 1;
934 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
939 /* Initialize WORD_CHAR table, which indicate which character is
940 "word". In this case "word" means that it is the word construction
941 character used by some operators like "\<", "\>", etc. */
948 dfa
->word_ops_used
= 1;
949 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
950 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
951 if (isalnum (ch
) || ch
== '_')
952 dfa
->word_char
[i
] |= 1u << j
;
955 /* Free the work area which are only used while compiling. */
958 free_workarea_compile (preg
)
961 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
962 bin_tree_storage_t
*storage
, *next
;
963 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
965 next
= storage
->next
;
968 dfa
->str_tree_storage
= NULL
;
969 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
970 dfa
->str_tree
= NULL
;
971 re_free (dfa
->org_indices
);
972 dfa
->org_indices
= NULL
;
975 /* Create initial states for all contexts. */
978 create_initial_state (dfa
)
983 re_node_set init_nodes
;
985 /* Initial states have the epsilon closure of the node which is
986 the first node of the regular expression. */
987 first
= dfa
->str_tree
->first
->node_idx
;
988 dfa
->init_node
= first
;
989 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
990 if (BE (err
!= REG_NOERROR
, 0))
993 /* The back-references which are in initial states can epsilon transit,
994 since in this case all of the subexpressions can be null.
995 Then we add epsilon closures of the nodes which are the next nodes of
996 the back-references. */
997 if (dfa
->nbackref
> 0)
998 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1000 int node_idx
= init_nodes
.elems
[i
];
1001 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1004 if (type
!= OP_BACK_REF
)
1006 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1008 re_token_t
*clexp_node
;
1009 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1010 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1011 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1014 if (clexp_idx
== init_nodes
.nelem
)
1017 if (type
== OP_BACK_REF
)
1019 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1020 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1022 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
1028 /* It must be the first time to invoke acquire_state. */
1029 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1030 /* We don't check ERR here, since the initial state must not be NULL. */
1031 if (BE (dfa
->init_state
== NULL
, 0))
1033 if (dfa
->init_state
->has_constraint
)
1035 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1037 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1039 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1043 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1044 || dfa
->init_state_begbuf
== NULL
, 0))
1048 dfa
->init_state_word
= dfa
->init_state_nl
1049 = dfa
->init_state_begbuf
= dfa
->init_state
;
1051 re_node_set_free (&init_nodes
);
1055 #ifdef RE_ENABLE_I18N
1056 /* If it is possible to do searching in single byte encoding instead of UTF-8
1057 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058 DFA nodes where needed. */
1064 int node
, i
, mb_chars
= 0, has_period
= 0;
1066 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1067 switch (dfa
->nodes
[node
].type
)
1070 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1074 switch (dfa
->nodes
[node
].opr
.idx
)
1082 /* Word anchors etc. cannot be handled. */
1092 case OP_DUP_ASTERISK
:
1093 case OP_OPEN_SUBEXP
:
1094 case OP_CLOSE_SUBEXP
:
1096 case COMPLEX_BRACKET
:
1098 case SIMPLE_BRACKET
:
1099 /* Just double check. */
1100 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1101 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1108 if (mb_chars
|| has_period
)
1109 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1111 if (dfa
->nodes
[node
].type
== CHARACTER
1112 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1113 dfa
->nodes
[node
].mb_partial
= 0;
1114 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1115 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1118 /* The search can be in single byte locale. */
1119 dfa
->mb_cur_max
= 1;
1121 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1125 /* Analyze the structure tree, and calculate "first", "next", "edest",
1126 "eclosure", and "inveclosure". */
1128 static reg_errcode_t
1132 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1135 /* Allocate arrays. */
1136 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1137 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1138 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1139 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1140 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1141 || dfa
->eclosures
== NULL
, 0))
1144 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1145 if (dfa
->subexp_map
!= NULL
)
1148 for (i
= 0; i
< preg
->re_nsub
; i
++)
1149 dfa
->subexp_map
[i
] = i
;
1150 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1151 for (i
= 0; i
< preg
->re_nsub
; i
++)
1152 if (dfa
->subexp_map
[i
] != i
)
1154 if (i
== preg
->re_nsub
)
1156 free (dfa
->subexp_map
);
1157 dfa
->subexp_map
= NULL
;
1161 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1162 if (BE (ret
!= REG_NOERROR
, 0))
1164 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1165 if (BE (ret
!= REG_NOERROR
, 0))
1167 preorder (dfa
->str_tree
, calc_next
, dfa
);
1168 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1169 if (BE (ret
!= REG_NOERROR
, 0))
1171 ret
= calc_eclosure (dfa
);
1172 if (BE (ret
!= REG_NOERROR
, 0))
1175 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1176 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1177 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1180 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1181 if (BE (dfa
->inveclosures
== NULL
, 0))
1183 ret
= calc_inveclosure (dfa
);
1189 /* Our parse trees are very unbalanced, so we cannot use a stack to
1190 implement parse tree visits. Instead, we use parent pointers and
1191 some hairy code in these two functions. */
1192 static reg_errcode_t
1193 postorder (root
, fn
, extra
)
1195 reg_errcode_t (fn (void *, bin_tree_t
*));
1198 bin_tree_t
*node
, *prev
;
1200 for (node
= root
; ; )
1202 /* Descend down the tree, preferably to the left (or to the right
1203 if that's the only child). */
1204 while (node
->left
|| node
->right
)
1212 reg_errcode_t err
= fn (extra
, node
);
1213 if (BE (err
!= REG_NOERROR
, 0))
1215 if (node
->parent
== NULL
)
1218 node
= node
->parent
;
1220 /* Go up while we have a node that is reached from the right. */
1221 while (node
->right
== prev
|| node
->right
== NULL
);
1226 static reg_errcode_t
1227 preorder (root
, fn
, extra
)
1229 reg_errcode_t (fn (void *, bin_tree_t
*));
1234 for (node
= root
; ; )
1236 reg_errcode_t err
= fn (extra
, node
);
1237 if (BE (err
!= REG_NOERROR
, 0))
1240 /* Go to the left node, or up and to the right. */
1245 bin_tree_t
*prev
= NULL
;
1246 while (node
->right
== prev
|| node
->right
== NULL
)
1249 node
= node
->parent
;
1258 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1259 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1260 backreferences as well. Requires a preorder visit. */
1261 static reg_errcode_t
1262 optimize_subexps (extra
, node
)
1266 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1268 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1270 int idx
= node
->token
.opr
.idx
;
1271 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1272 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1275 else if (node
->token
.type
== SUBEXP
1276 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1278 int other_idx
= node
->left
->token
.opr
.idx
;
1280 node
->left
= node
->left
->left
;
1282 node
->left
->parent
= node
;
1284 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1285 if (other_idx
< CHAR_BIT
* sizeof dfa
->used_bkref_map
)
1286 dfa
->used_bkref_map
&= ~(1u << other_idx
);
1292 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1293 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1294 static reg_errcode_t
1295 lower_subexps (extra
, node
)
1299 regex_t
*preg
= (regex_t
*) extra
;
1300 reg_errcode_t err
= REG_NOERROR
;
1302 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1304 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1306 node
->left
->parent
= node
;
1308 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1310 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1312 node
->right
->parent
= node
;
1319 lower_subexp (err
, preg
, node
)
1324 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1325 bin_tree_t
*body
= node
->left
;
1326 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1329 /* We do not optimize empty subexpressions, because otherwise we may
1330 have bad CONCAT nodes with NULL children. This is obviously not
1331 very common, so we do not lose much. An example that triggers
1332 this case is the sed "script" /\(\)/x. */
1333 && node
->left
!= NULL
1334 && (node
->token
.opr
.idx
>= CHAR_BIT
* sizeof dfa
->used_bkref_map
1335 || !(dfa
->used_bkref_map
& (1u << node
->token
.opr
.idx
))))
1338 /* Convert the SUBEXP node to the concatenation of an
1339 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1340 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1341 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1342 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1343 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1344 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1350 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1351 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1355 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1356 nodes. Requires a postorder visit. */
1357 static reg_errcode_t
1358 calc_first (extra
, node
)
1362 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1363 if (node
->token
.type
== CONCAT
)
1365 node
->first
= node
->left
->first
;
1366 node
->node_idx
= node
->left
->node_idx
;
1371 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1372 if (BE (node
->node_idx
== -1, 0))
1378 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1379 static reg_errcode_t
1380 calc_next (extra
, node
)
1384 switch (node
->token
.type
)
1386 case OP_DUP_ASTERISK
:
1387 node
->left
->next
= node
;
1390 node
->left
->next
= node
->right
->first
;
1391 node
->right
->next
= node
->next
;
1395 node
->left
->next
= node
->next
;
1397 node
->right
->next
= node
->next
;
1403 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1404 static reg_errcode_t
1405 link_nfa_nodes (extra
, node
)
1409 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1410 int idx
= node
->node_idx
;
1411 reg_errcode_t err
= REG_NOERROR
;
1413 switch (node
->token
.type
)
1419 assert (node
->next
== NULL
);
1422 case OP_DUP_ASTERISK
:
1426 dfa
->has_plural_match
= 1;
1427 if (node
->left
!= NULL
)
1428 left
= node
->left
->first
->node_idx
;
1430 left
= node
->next
->node_idx
;
1431 if (node
->right
!= NULL
)
1432 right
= node
->right
->first
->node_idx
;
1434 right
= node
->next
->node_idx
;
1436 assert (right
> -1);
1437 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1442 case OP_OPEN_SUBEXP
:
1443 case OP_CLOSE_SUBEXP
:
1444 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1448 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1449 if (node
->token
.type
== OP_BACK_REF
)
1450 re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1454 assert (!IS_EPSILON_NODE (node
->token
.type
));
1455 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1462 /* Duplicate the epsilon closure of the node ROOT_NODE.
1463 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1464 to their own constraint. */
1466 static reg_errcode_t
1467 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1470 int top_org_node
, top_clone_node
, root_node
;
1471 unsigned int init_constraint
;
1473 int org_node
, clone_node
, ret
;
1474 unsigned int constraint
= init_constraint
;
1475 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1477 int org_dest
, clone_dest
;
1478 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1480 /* If the back reference epsilon-transit, its destination must
1481 also have the constraint. Then duplicate the epsilon closure
1482 of the destination of the back reference, and store it in
1483 edests of the back reference. */
1484 org_dest
= dfa
->nexts
[org_node
];
1485 re_node_set_empty (dfa
->edests
+ clone_node
);
1486 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1487 if (BE (clone_dest
== -1, 0))
1489 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1490 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1491 if (BE (ret
< 0, 0))
1494 else if (dfa
->edests
[org_node
].nelem
== 0)
1496 /* In case of the node can't epsilon-transit, don't duplicate the
1497 destination and store the original destination as the
1498 destination of the node. */
1499 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1502 else if (dfa
->edests
[org_node
].nelem
== 1)
1504 /* In case of the node can epsilon-transit, and it has only one
1506 org_dest
= dfa
->edests
[org_node
].elems
[0];
1507 re_node_set_empty (dfa
->edests
+ clone_node
);
1508 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1510 /* In case of the node has another constraint, append it. */
1511 if (org_node
== root_node
&& clone_node
!= org_node
)
1513 /* ...but if the node is root_node itself, it means the
1514 epsilon closure have a loop, then tie it to the
1515 destination of the root_node. */
1516 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1518 if (BE (ret
< 0, 0))
1522 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1524 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1525 if (BE (clone_dest
== -1, 0))
1527 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1528 if (BE (ret
< 0, 0))
1531 else /* dfa->edests[org_node].nelem == 2 */
1533 /* In case of the node can epsilon-transit, and it has two
1534 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1535 org_dest
= dfa
->edests
[org_node
].elems
[0];
1536 re_node_set_empty (dfa
->edests
+ clone_node
);
1537 /* Search for a duplicated node which satisfies the constraint. */
1538 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1539 if (clone_dest
== -1)
1541 /* There are no such a duplicated node, create a new one. */
1543 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1544 if (BE (clone_dest
== -1, 0))
1546 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1547 if (BE (ret
< 0, 0))
1549 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1550 root_node
, constraint
);
1551 if (BE (err
!= REG_NOERROR
, 0))
1556 /* There are a duplicated node which satisfy the constraint,
1557 use it to avoid infinite loop. */
1558 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1559 if (BE (ret
< 0, 0))
1563 org_dest
= dfa
->edests
[org_node
].elems
[1];
1564 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1565 if (BE (clone_dest
== -1, 0))
1567 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1568 if (BE (ret
< 0, 0))
1571 org_node
= org_dest
;
1572 clone_node
= clone_dest
;
1577 /* Search for a node which is duplicated from the node ORG_NODE, and
1578 satisfies the constraint CONSTRAINT. */
1581 search_duplicated_node (dfa
, org_node
, constraint
)
1582 const re_dfa_t
*dfa
;
1584 unsigned int constraint
;
1587 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1589 if (org_node
== dfa
->org_indices
[idx
]
1590 && constraint
== dfa
->nodes
[idx
].constraint
)
1591 return idx
; /* Found. */
1593 return -1; /* Not found. */
1596 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1597 Return the index of the new node, or -1 if insufficient storage is
1601 duplicate_node (dfa
, org_idx
, constraint
)
1604 unsigned int constraint
;
1606 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1607 if (BE (dup_idx
!= -1, 1))
1609 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1610 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1611 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1612 dfa
->nodes
[dup_idx
].duplicated
= 1;
1614 /* Store the index of the original node. */
1615 dfa
->org_indices
[dup_idx
] = org_idx
;
1620 static reg_errcode_t
1621 calc_inveclosure (dfa
)
1625 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1626 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1628 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1630 int *elems
= dfa
->eclosures
[src
].elems
;
1631 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1633 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1634 if (BE (ret
== -1, 0))
1642 /* Calculate "eclosure" for all the node in DFA. */
1644 static reg_errcode_t
1648 int node_idx
, incomplete
;
1650 assert (dfa
->nodes_len
> 0);
1653 /* For each nodes, calculate epsilon closure. */
1654 for (node_idx
= 0; ; ++node_idx
)
1657 re_node_set eclosure_elem
;
1658 if (node_idx
== dfa
->nodes_len
)
1667 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1670 /* If we have already calculated, skip it. */
1671 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1673 /* Calculate epsilon closure of `node_idx'. */
1674 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1675 if (BE (err
!= REG_NOERROR
, 0))
1678 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1681 re_node_set_free (&eclosure_elem
);
1687 /* Calculate epsilon closure of NODE. */
1689 static reg_errcode_t
1690 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1691 re_node_set
*new_set
;
1696 unsigned int constraint
;
1698 re_node_set eclosure
;
1700 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1701 if (BE (err
!= REG_NOERROR
, 0))
1704 /* This indicates that we are calculating this node now.
1705 We reference this value to avoid infinite loop. */
1706 dfa
->eclosures
[node
].nelem
= -1;
1708 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1709 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1710 /* If the current node has constraints, duplicate all nodes.
1711 Since they must inherit the constraints. */
1713 && dfa
->edests
[node
].nelem
1714 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1716 int org_node
, cur_node
;
1717 org_node
= cur_node
= node
;
1718 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1719 if (BE (err
!= REG_NOERROR
, 0))
1723 /* Expand each epsilon destination nodes. */
1724 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1725 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1727 re_node_set eclosure_elem
;
1728 int edest
= dfa
->edests
[node
].elems
[i
];
1729 /* If calculating the epsilon closure of `edest' is in progress,
1730 return intermediate result. */
1731 if (dfa
->eclosures
[edest
].nelem
== -1)
1736 /* If we haven't calculated the epsilon closure of `edest' yet,
1737 calculate now. Otherwise use calculated epsilon closure. */
1738 if (dfa
->eclosures
[edest
].nelem
== 0)
1740 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1741 if (BE (err
!= REG_NOERROR
, 0))
1745 eclosure_elem
= dfa
->eclosures
[edest
];
1746 /* Merge the epsilon closure of `edest'. */
1747 re_node_set_merge (&eclosure
, &eclosure_elem
);
1748 /* If the epsilon closure of `edest' is incomplete,
1749 the epsilon closure of this node is also incomplete. */
1750 if (dfa
->eclosures
[edest
].nelem
== 0)
1753 re_node_set_free (&eclosure_elem
);
1757 /* Epsilon closures include itself. */
1758 re_node_set_insert (&eclosure
, node
);
1759 if (incomplete
&& !root
)
1760 dfa
->eclosures
[node
].nelem
= 0;
1762 dfa
->eclosures
[node
] = eclosure
;
1763 *new_set
= eclosure
;
1767 /* Functions for token which are used in the parser. */
1769 /* Fetch a token from INPUT.
1770 We must not use this function inside bracket expressions. */
1773 fetch_token (result
, input
, syntax
)
1776 reg_syntax_t syntax
;
1778 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1781 /* Peek a token from INPUT, and return the length of the token.
1782 We must not use this function inside bracket expressions. */
1785 peek_token (token
, input
, syntax
)
1788 reg_syntax_t syntax
;
1792 if (re_string_eoi (input
))
1794 token
->type
= END_OF_RE
;
1798 c
= re_string_peek_byte (input
, 0);
1801 token
->word_char
= 0;
1802 #ifdef RE_ENABLE_I18N
1803 token
->mb_partial
= 0;
1804 if (input
->mb_cur_max
> 1 &&
1805 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1807 token
->type
= CHARACTER
;
1808 token
->mb_partial
= 1;
1815 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1817 token
->type
= BACK_SLASH
;
1821 c2
= re_string_peek_byte_case (input
, 1);
1823 token
->type
= CHARACTER
;
1824 #ifdef RE_ENABLE_I18N
1825 if (input
->mb_cur_max
> 1)
1827 wint_t wc
= re_string_wchar_at (input
,
1828 re_string_cur_idx (input
) + 1);
1829 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1833 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1838 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1839 token
->type
= OP_ALT
;
1841 case '1': case '2': case '3': case '4': case '5':
1842 case '6': case '7': case '8': case '9':
1843 if (!(syntax
& RE_NO_BK_REFS
))
1845 token
->type
= OP_BACK_REF
;
1846 token
->opr
.idx
= c2
- '1';
1850 if (!(syntax
& RE_NO_GNU_OPS
))
1852 token
->type
= ANCHOR
;
1853 token
->opr
.ctx_type
= WORD_FIRST
;
1857 if (!(syntax
& RE_NO_GNU_OPS
))
1859 token
->type
= ANCHOR
;
1860 token
->opr
.ctx_type
= WORD_LAST
;
1864 if (!(syntax
& RE_NO_GNU_OPS
))
1866 token
->type
= ANCHOR
;
1867 token
->opr
.ctx_type
= WORD_DELIM
;
1871 if (!(syntax
& RE_NO_GNU_OPS
))
1873 token
->type
= ANCHOR
;
1874 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1878 if (!(syntax
& RE_NO_GNU_OPS
))
1879 token
->type
= OP_WORD
;
1882 if (!(syntax
& RE_NO_GNU_OPS
))
1883 token
->type
= OP_NOTWORD
;
1886 if (!(syntax
& RE_NO_GNU_OPS
))
1887 token
->type
= OP_SPACE
;
1890 if (!(syntax
& RE_NO_GNU_OPS
))
1891 token
->type
= OP_NOTSPACE
;
1894 if (!(syntax
& RE_NO_GNU_OPS
))
1896 token
->type
= ANCHOR
;
1897 token
->opr
.ctx_type
= BUF_FIRST
;
1901 if (!(syntax
& RE_NO_GNU_OPS
))
1903 token
->type
= ANCHOR
;
1904 token
->opr
.ctx_type
= BUF_LAST
;
1908 if (!(syntax
& RE_NO_BK_PARENS
))
1909 token
->type
= OP_OPEN_SUBEXP
;
1912 if (!(syntax
& RE_NO_BK_PARENS
))
1913 token
->type
= OP_CLOSE_SUBEXP
;
1916 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1917 token
->type
= OP_DUP_PLUS
;
1920 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1921 token
->type
= OP_DUP_QUESTION
;
1924 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1925 token
->type
= OP_OPEN_DUP_NUM
;
1928 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1929 token
->type
= OP_CLOSE_DUP_NUM
;
1937 token
->type
= CHARACTER
;
1938 #ifdef RE_ENABLE_I18N
1939 if (input
->mb_cur_max
> 1)
1941 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1942 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1946 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1951 if (syntax
& RE_NEWLINE_ALT
)
1952 token
->type
= OP_ALT
;
1955 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1956 token
->type
= OP_ALT
;
1959 token
->type
= OP_DUP_ASTERISK
;
1962 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1963 token
->type
= OP_DUP_PLUS
;
1966 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1967 token
->type
= OP_DUP_QUESTION
;
1970 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1971 token
->type
= OP_OPEN_DUP_NUM
;
1974 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1975 token
->type
= OP_CLOSE_DUP_NUM
;
1978 if (syntax
& RE_NO_BK_PARENS
)
1979 token
->type
= OP_OPEN_SUBEXP
;
1982 if (syntax
& RE_NO_BK_PARENS
)
1983 token
->type
= OP_CLOSE_SUBEXP
;
1986 token
->type
= OP_OPEN_BRACKET
;
1989 token
->type
= OP_PERIOD
;
1992 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1993 re_string_cur_idx (input
) != 0)
1995 char prev
= re_string_peek_byte (input
, -1);
1996 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1999 token
->type
= ANCHOR
;
2000 token
->opr
.ctx_type
= LINE_FIRST
;
2003 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
2004 re_string_cur_idx (input
) + 1 != re_string_length (input
))
2007 re_string_skip_bytes (input
, 1);
2008 peek_token (&next
, input
, syntax
);
2009 re_string_skip_bytes (input
, -1);
2010 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
2013 token
->type
= ANCHOR
;
2014 token
->opr
.ctx_type
= LINE_LAST
;
2022 /* Peek a token from INPUT, and return the length of the token.
2023 We must not use this function out of bracket expressions. */
2026 peek_token_bracket (token
, input
, syntax
)
2029 reg_syntax_t syntax
;
2032 if (re_string_eoi (input
))
2034 token
->type
= END_OF_RE
;
2037 c
= re_string_peek_byte (input
, 0);
2040 #ifdef RE_ENABLE_I18N
2041 if (input
->mb_cur_max
> 1 &&
2042 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2044 token
->type
= CHARACTER
;
2047 #endif /* RE_ENABLE_I18N */
2049 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2050 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2052 /* In this case, '\' escape a character. */
2054 re_string_skip_bytes (input
, 1);
2055 c2
= re_string_peek_byte (input
, 0);
2057 token
->type
= CHARACTER
;
2060 if (c
== '[') /* '[' is a special char in a bracket exps. */
2064 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2065 c2
= re_string_peek_byte (input
, 1);
2073 token
->type
= OP_OPEN_COLL_ELEM
;
2076 token
->type
= OP_OPEN_EQUIV_CLASS
;
2079 if (syntax
& RE_CHAR_CLASSES
)
2081 token
->type
= OP_OPEN_CHAR_CLASS
;
2084 /* else fall through. */
2086 token
->type
= CHARACTER
;
2096 token
->type
= OP_CHARSET_RANGE
;
2099 token
->type
= OP_CLOSE_BRACKET
;
2102 token
->type
= OP_NON_MATCH_LIST
;
2105 token
->type
= CHARACTER
;
2110 /* Functions for parser. */
2112 /* Entry point of the parser.
2113 Parse the regular expression REGEXP and return the structure tree.
2114 If an error is occured, ERR is set by error code, and return NULL.
2115 This function build the following tree, from regular expression <reg_exp>:
2121 CAT means concatenation.
2122 EOR means end of regular expression. */
2125 parse (regexp
, preg
, syntax
, err
)
2126 re_string_t
*regexp
;
2128 reg_syntax_t syntax
;
2131 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2132 bin_tree_t
*tree
, *eor
, *root
;
2133 re_token_t current_token
;
2134 dfa
->syntax
= syntax
;
2135 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2136 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2137 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2139 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2141 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2144 if (BE (eor
== NULL
|| root
== NULL
, 0))
2152 /* This function build the following tree, from regular expression
2153 <branch1>|<branch2>:
2159 ALT means alternative, which represents the operator `|'. */
2162 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2163 re_string_t
*regexp
;
2166 reg_syntax_t syntax
;
2170 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2171 bin_tree_t
*tree
, *branch
= NULL
;
2172 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2173 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2176 while (token
->type
== OP_ALT
)
2178 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2179 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2180 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2182 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2183 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2188 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2189 if (BE (tree
== NULL
, 0))
2198 /* This function build the following tree, from regular expression
2205 CAT means concatenation. */
2208 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2209 re_string_t
*regexp
;
2212 reg_syntax_t syntax
;
2216 bin_tree_t
*tree
, *exp
;
2217 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2218 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2219 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2222 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2223 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2225 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2226 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2230 if (tree
!= NULL
&& exp
!= NULL
)
2232 tree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2239 else if (tree
== NULL
)
2241 /* Otherwise exp == NULL, we don't need to create new tree. */
2246 /* This function build the following tree, from regular expression a*:
2253 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2254 re_string_t
*regexp
;
2257 reg_syntax_t syntax
;
2261 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2263 switch (token
->type
)
2266 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2267 if (BE (tree
== NULL
, 0))
2272 #ifdef RE_ENABLE_I18N
2273 if (dfa
->mb_cur_max
> 1)
2275 while (!re_string_eoi (regexp
)
2276 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2278 bin_tree_t
*mbc_remain
;
2279 fetch_token (token
, regexp
, syntax
);
2280 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2281 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2282 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2291 case OP_OPEN_SUBEXP
:
2292 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2293 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2296 case OP_OPEN_BRACKET
:
2297 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2298 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2302 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2307 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2308 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2309 if (BE (tree
== NULL
, 0))
2315 dfa
->has_mb_node
= 1;
2317 case OP_OPEN_DUP_NUM
:
2318 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2324 case OP_DUP_ASTERISK
:
2326 case OP_DUP_QUESTION
:
2327 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2332 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2334 fetch_token (token
, regexp
, syntax
);
2335 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2337 /* else fall through */
2338 case OP_CLOSE_SUBEXP
:
2339 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2340 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2345 /* else fall through */
2346 case OP_CLOSE_DUP_NUM
:
2347 /* We treat it as a normal character. */
2349 /* Then we can these characters as normal characters. */
2350 token
->type
= CHARACTER
;
2351 /* mb_partial and word_char bits should be initialized already
2353 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2354 if (BE (tree
== NULL
, 0))
2361 if ((token
->opr
.ctx_type
2362 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2363 && dfa
->word_ops_used
== 0)
2364 init_word_char (dfa
);
2365 if (token
->opr
.ctx_type
== WORD_DELIM
2366 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2368 bin_tree_t
*tree_first
, *tree_last
;
2369 if (token
->opr
.ctx_type
== WORD_DELIM
)
2371 token
->opr
.ctx_type
= WORD_FIRST
;
2372 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2373 token
->opr
.ctx_type
= WORD_LAST
;
2377 token
->opr
.ctx_type
= INSIDE_WORD
;
2378 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2379 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2381 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2382 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2383 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2391 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2392 if (BE (tree
== NULL
, 0))
2398 /* We must return here, since ANCHORs can't be followed
2399 by repetition operators.
2400 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2401 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2402 fetch_token (token
, regexp
, syntax
);
2405 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2406 if (BE (tree
== NULL
, 0))
2411 if (dfa
->mb_cur_max
> 1)
2412 dfa
->has_mb_node
= 1;
2416 tree
= build_charclass_op (dfa
, regexp
->trans
,
2417 (const unsigned char *) "alnum",
2418 (const unsigned char *) "_",
2419 token
->type
== OP_NOTWORD
, err
);
2420 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2425 tree
= build_charclass_op (dfa
, regexp
->trans
,
2426 (const unsigned char *) "space",
2427 (const unsigned char *) "",
2428 token
->type
== OP_NOTSPACE
, err
);
2429 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2439 /* Must not happen? */
2445 fetch_token (token
, regexp
, syntax
);
2447 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2448 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2450 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2451 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2453 /* In BRE consecutive duplications are not allowed. */
2454 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2455 && (token
->type
== OP_DUP_ASTERISK
2456 || token
->type
== OP_OPEN_DUP_NUM
))
2466 /* This function build the following tree, from regular expression
2474 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2475 re_string_t
*regexp
;
2478 reg_syntax_t syntax
;
2482 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2485 cur_nsub
= preg
->re_nsub
++;
2487 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2489 /* The subexpression may be a null string. */
2490 if (token
->type
== OP_CLOSE_SUBEXP
)
2494 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2495 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2497 if (BE (*err
!= REG_NOERROR
, 0))
2501 if (cur_nsub
<= '9' - '1')
2502 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2504 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2505 if (BE (tree
== NULL
, 0))
2510 tree
->token
.opr
.idx
= cur_nsub
;
2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2517 parse_dup_op (elem
, regexp
, dfa
, token
, syntax
, err
)
2519 re_string_t
*regexp
;
2522 reg_syntax_t syntax
;
2525 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2526 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2527 re_token_t start_token
= *token
;
2529 if (token
->type
== OP_OPEN_DUP_NUM
)
2532 start
= fetch_number (regexp
, token
, syntax
);
2535 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2536 start
= 0; /* We treat "{,m}" as "{0,m}". */
2539 *err
= REG_BADBR
; /* <re>{} is invalid. */
2543 if (BE (start
!= -2, 1))
2545 /* We treat "{n}" as "{n,n}". */
2546 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2547 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2548 ? fetch_number (regexp
, token
, syntax
) : -2));
2550 if (BE (start
== -2 || end
== -2, 0))
2552 /* Invalid sequence. */
2553 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2555 if (token
->type
== END_OF_RE
)
2563 /* If the syntax bit is set, rollback. */
2564 re_string_set_index (regexp
, start_idx
);
2565 *token
= start_token
;
2566 token
->type
= CHARACTER
;
2567 /* mb_partial and word_char bits should be already initialized by
2572 if (BE (end
!= -1 && start
> end
, 0))
2574 /* First number greater than second. */
2581 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2582 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2585 fetch_token (token
, regexp
, syntax
);
2587 if (BE (elem
== NULL
, 0))
2589 if (BE (start
== 0 && end
== 0, 0))
2591 postorder (elem
, free_tree
, NULL
);
2595 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2596 if (BE (start
> 0, 0))
2599 for (i
= 2; i
<= start
; ++i
)
2601 elem
= duplicate_tree (elem
, dfa
);
2602 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2603 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2604 goto parse_dup_op_espace
;
2610 /* Duplicate ELEM before it is marked optional. */
2611 elem
= duplicate_tree (elem
, dfa
);
2617 if (elem
->token
.type
== SUBEXP
)
2618 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2620 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2621 if (BE (tree
== NULL
, 0))
2622 goto parse_dup_op_espace
;
2624 /* This loop is actually executed only when end != -1,
2625 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2626 already created the start+1-th copy. */
2627 for (i
= start
+ 2; i
<= end
; ++i
)
2629 elem
= duplicate_tree (elem
, dfa
);
2630 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2631 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2632 goto parse_dup_op_espace
;
2634 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2635 if (BE (tree
== NULL
, 0))
2636 goto parse_dup_op_espace
;
2640 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2644 parse_dup_op_espace
:
2649 /* Size of the names for collating symbol/equivalence_class/character_class.
2650 I'm not sure, but maybe enough. */
2651 #define BRACKET_NAME_BUF_SIZE 32
2654 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2655 Build the range expression which starts from START_ELEM, and ends
2656 at END_ELEM. The result are written to MBCSET and SBCSET.
2657 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2658 mbcset->range_ends, is a pointer argument sinse we may
2661 static reg_errcode_t
2662 # ifdef RE_ENABLE_I18N
2663 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2664 re_charset_t
*mbcset
;
2666 # else /* not RE_ENABLE_I18N */
2667 build_range_exp (sbcset
, start_elem
, end_elem
)
2668 # endif /* not RE_ENABLE_I18N */
2669 re_bitset_ptr_t sbcset
;
2670 bracket_elem_t
*start_elem
, *end_elem
;
2672 unsigned int start_ch
, end_ch
;
2673 /* Equivalence Classes and Character Classes can't be a range start/end. */
2674 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2675 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2679 /* We can handle no multi character collating elements without libc
2681 if (BE ((start_elem
->type
== COLL_SYM
2682 && strlen ((char *) start_elem
->opr
.name
) > 1)
2683 || (end_elem
->type
== COLL_SYM
2684 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2685 return REG_ECOLLATE
;
2687 # ifdef RE_ENABLE_I18N
2692 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2694 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2695 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2697 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2698 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2700 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2701 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2702 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2703 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2704 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2705 return REG_ECOLLATE
;
2706 cmp_buf
[0] = start_wc
;
2707 cmp_buf
[4] = end_wc
;
2708 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2711 /* Got valid collation sequence values, add them as a new entry.
2712 However, for !_LIBC we have no collation elements: if the
2713 character set is single byte, the single byte character set
2714 that we build below suffices. parse_bracket_exp passes
2715 no MBCSET if dfa->mb_cur_max == 1. */
2718 /* Check the space of the arrays. */
2719 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2721 /* There is not enough space, need realloc. */
2722 wchar_t *new_array_start
, *new_array_end
;
2725 /* +1 in case of mbcset->nranges is 0. */
2726 new_nranges
= 2 * mbcset
->nranges
+ 1;
2727 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2728 are NULL if *range_alloc == 0. */
2729 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2731 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2734 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2737 mbcset
->range_starts
= new_array_start
;
2738 mbcset
->range_ends
= new_array_end
;
2739 *range_alloc
= new_nranges
;
2742 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2743 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2746 /* Build the table for single byte characters. */
2747 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2750 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2751 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2752 bitset_set (sbcset
, wc
);
2755 # else /* not RE_ENABLE_I18N */
2758 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2759 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2761 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2762 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2764 if (start_ch
> end_ch
)
2766 /* Build the table for single byte characters. */
2767 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2768 if (start_ch
<= ch
&& ch
<= end_ch
)
2769 bitset_set (sbcset
, ch
);
2771 # endif /* not RE_ENABLE_I18N */
2774 #endif /* not _LIBC */
2777 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2778 Build the collating element which is represented by NAME.
2779 The result are written to MBCSET and SBCSET.
2780 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2781 pointer argument since we may update it. */
2783 static reg_errcode_t
2784 # ifdef RE_ENABLE_I18N
2785 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2786 re_charset_t
*mbcset
;
2787 int *coll_sym_alloc
;
2788 # else /* not RE_ENABLE_I18N */
2789 build_collating_symbol (sbcset
, name
)
2790 # endif /* not RE_ENABLE_I18N */
2791 re_bitset_ptr_t sbcset
;
2792 const unsigned char *name
;
2794 size_t name_len
= strlen ((const char *) name
);
2795 if (BE (name_len
!= 1, 0))
2796 return REG_ECOLLATE
;
2799 bitset_set (sbcset
, name
[0]);
2803 #endif /* not _LIBC */
2805 /* This function parse bracket expression like "[abc]", "[a-c]",
2809 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2810 re_string_t
*regexp
;
2813 reg_syntax_t syntax
;
2817 const unsigned char *collseqmb
;
2818 const char *collseqwc
;
2821 const int32_t *symb_table
;
2822 const unsigned char *extra
;
2824 /* Local function for parse_bracket_exp used in _LIBC environement.
2825 Seek the collating symbol entry correspondings to NAME.
2826 Return the index of the symbol in the SYMB_TABLE. */
2829 __attribute ((always_inline
))
2830 seek_collating_symbol_entry (name
, name_len
)
2831 const unsigned char *name
;
2834 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2835 int32_t elem
= hash
% table_size
;
2836 int32_t second
= hash
% (table_size
- 2);
2837 while (symb_table
[2 * elem
] != 0)
2839 /* First compare the hashing value. */
2840 if (symb_table
[2 * elem
] == hash
2841 /* Compare the length of the name. */
2842 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2843 /* Compare the name. */
2844 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2847 /* Yep, this is the entry. */
2857 /* Local function for parse_bracket_exp used in _LIBC environement.
2858 Look up the collation sequence value of BR_ELEM.
2859 Return the value if succeeded, UINT_MAX otherwise. */
2861 auto inline unsigned int
2862 __attribute ((always_inline
))
2863 lookup_collation_sequence_value (br_elem
)
2864 bracket_elem_t
*br_elem
;
2866 if (br_elem
->type
== SB_CHAR
)
2869 if (MB_CUR_MAX == 1)
2872 return collseqmb
[br_elem
->opr
.ch
];
2875 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2876 return __collseq_table_lookup (collseqwc
, wc
);
2879 else if (br_elem
->type
== MB_CHAR
)
2881 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2883 else if (br_elem
->type
== COLL_SYM
)
2885 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2889 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2891 if (symb_table
[2 * elem
] != 0)
2893 /* We found the entry. */
2894 idx
= symb_table
[2 * elem
+ 1];
2895 /* Skip the name of collating element name. */
2896 idx
+= 1 + extra
[idx
];
2897 /* Skip the byte sequence of the collating element. */
2898 idx
+= 1 + extra
[idx
];
2899 /* Adjust for the alignment. */
2900 idx
= (idx
+ 3) & ~3;
2901 /* Skip the multibyte collation sequence value. */
2902 idx
+= sizeof (unsigned int);
2903 /* Skip the wide char sequence of the collating element. */
2904 idx
+= sizeof (unsigned int) *
2905 (1 + *(unsigned int *) (extra
+ idx
));
2906 /* Return the collation sequence value. */
2907 return *(unsigned int *) (extra
+ idx
);
2909 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2911 /* No valid character. Match it as a single byte
2913 return collseqmb
[br_elem
->opr
.name
[0]];
2916 else if (sym_name_len
== 1)
2917 return collseqmb
[br_elem
->opr
.name
[0]];
2922 /* Local function for parse_bracket_exp used in _LIBC environement.
2923 Build the range expression which starts from START_ELEM, and ends
2924 at END_ELEM. The result are written to MBCSET and SBCSET.
2925 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2926 mbcset->range_ends, is a pointer argument sinse we may
2929 auto inline reg_errcode_t
2930 __attribute ((always_inline
))
2931 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2932 re_charset_t
*mbcset
;
2934 re_bitset_ptr_t sbcset
;
2935 bracket_elem_t
*start_elem
, *end_elem
;
2938 uint32_t start_collseq
;
2939 uint32_t end_collseq
;
2941 /* Equivalence Classes and Character Classes can't be a range
2943 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2944 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2948 start_collseq
= lookup_collation_sequence_value (start_elem
);
2949 end_collseq
= lookup_collation_sequence_value (end_elem
);
2950 /* Check start/end collation sequence values. */
2951 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2952 return REG_ECOLLATE
;
2953 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2956 /* Got valid collation sequence values, add them as a new entry.
2957 However, if we have no collation elements, and the character set
2958 is single byte, the single byte character set that we
2959 build below suffices. */
2960 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2962 /* Check the space of the arrays. */
2963 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2965 /* There is not enough space, need realloc. */
2966 uint32_t *new_array_start
;
2967 uint32_t *new_array_end
;
2970 /* +1 in case of mbcset->nranges is 0. */
2971 new_nranges
= 2 * mbcset
->nranges
+ 1;
2972 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2974 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2977 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2980 mbcset
->range_starts
= new_array_start
;
2981 mbcset
->range_ends
= new_array_end
;
2982 *range_alloc
= new_nranges
;
2985 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2986 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2989 /* Build the table for single byte characters. */
2990 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2992 uint32_t ch_collseq
;
2994 if (MB_CUR_MAX == 1)
2997 ch_collseq
= collseqmb
[ch
];
2999 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
3000 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
3001 bitset_set (sbcset
, ch
);
3006 /* Local function for parse_bracket_exp used in _LIBC environement.
3007 Build the collating element which is represented by NAME.
3008 The result are written to MBCSET and SBCSET.
3009 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3010 pointer argument sinse we may update it. */
3012 auto inline reg_errcode_t
3013 __attribute ((always_inline
))
3014 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
3015 re_charset_t
*mbcset
;
3016 int *coll_sym_alloc
;
3017 re_bitset_ptr_t sbcset
;
3018 const unsigned char *name
;
3021 size_t name_len
= strlen ((const char *) name
);
3024 elem
= seek_collating_symbol_entry (name
, name_len
);
3025 if (symb_table
[2 * elem
] != 0)
3027 /* We found the entry. */
3028 idx
= symb_table
[2 * elem
+ 1];
3029 /* Skip the name of collating element name. */
3030 idx
+= 1 + extra
[idx
];
3032 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
3034 /* No valid character, treat it as a normal
3036 bitset_set (sbcset
, name
[0]);
3040 return REG_ECOLLATE
;
3042 /* Got valid collation sequence, add it as a new entry. */
3043 /* Check the space of the arrays. */
3044 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3046 /* Not enough, realloc it. */
3047 /* +1 in case of mbcset->ncoll_syms is 0. */
3048 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3049 /* Use realloc since mbcset->coll_syms is NULL
3051 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3052 new_coll_sym_alloc
);
3053 if (BE (new_coll_syms
== NULL
, 0))
3055 mbcset
->coll_syms
= new_coll_syms
;
3056 *coll_sym_alloc
= new_coll_sym_alloc
;
3058 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3063 if (BE (name_len
!= 1, 0))
3064 return REG_ECOLLATE
;
3067 bitset_set (sbcset
, name
[0]);
3074 re_token_t br_token
;
3075 re_bitset_ptr_t sbcset
;
3076 #ifdef RE_ENABLE_I18N
3077 re_charset_t
*mbcset
;
3078 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3079 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3080 #endif /* not RE_ENABLE_I18N */
3082 bin_tree_t
*work_tree
;
3084 int first_round
= 1;
3086 collseqmb
= (const unsigned char *)
3087 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3088 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3094 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3095 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3096 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3097 _NL_COLLATE_SYMB_TABLEMB
);
3098 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3099 _NL_COLLATE_SYMB_EXTRAMB
);
3102 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3103 #ifdef RE_ENABLE_I18N
3104 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3105 #endif /* RE_ENABLE_I18N */
3106 #ifdef RE_ENABLE_I18N
3107 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3109 if (BE (sbcset
== NULL
, 0))
3110 #endif /* RE_ENABLE_I18N */
3116 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3117 if (BE (token
->type
== END_OF_RE
, 0))
3120 goto parse_bracket_exp_free_return
;
3122 if (token
->type
== OP_NON_MATCH_LIST
)
3124 #ifdef RE_ENABLE_I18N
3125 mbcset
->non_match
= 1;
3126 #endif /* not RE_ENABLE_I18N */
3128 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3129 bitset_set (sbcset
, '\0');
3130 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3131 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3132 if (BE (token
->type
== END_OF_RE
, 0))
3135 goto parse_bracket_exp_free_return
;
3139 /* We treat the first ']' as a normal character. */
3140 if (token
->type
== OP_CLOSE_BRACKET
)
3141 token
->type
= CHARACTER
;
3145 bracket_elem_t start_elem
, end_elem
;
3146 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3147 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3149 int token_len2
= 0, is_range_exp
= 0;
3152 start_elem
.opr
.name
= start_name_buf
;
3153 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3154 syntax
, first_round
);
3155 if (BE (ret
!= REG_NOERROR
, 0))
3158 goto parse_bracket_exp_free_return
;
3162 /* Get information about the next token. We need it in any case. */
3163 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3165 /* Do not check for ranges if we know they are not allowed. */
3166 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3168 if (BE (token
->type
== END_OF_RE
, 0))
3171 goto parse_bracket_exp_free_return
;
3173 if (token
->type
== OP_CHARSET_RANGE
)
3175 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3176 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3177 if (BE (token2
.type
== END_OF_RE
, 0))
3180 goto parse_bracket_exp_free_return
;
3182 if (token2
.type
== OP_CLOSE_BRACKET
)
3184 /* We treat the last '-' as a normal character. */
3185 re_string_skip_bytes (regexp
, -token_len
);
3186 token
->type
= CHARACTER
;
3193 if (is_range_exp
== 1)
3195 end_elem
.opr
.name
= end_name_buf
;
3196 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3198 if (BE (ret
!= REG_NOERROR
, 0))
3201 goto parse_bracket_exp_free_return
;
3204 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3207 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3208 &start_elem
, &end_elem
);
3210 # ifdef RE_ENABLE_I18N
3211 *err
= build_range_exp (sbcset
,
3212 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3213 &range_alloc
, &start_elem
, &end_elem
);
3215 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3217 #endif /* RE_ENABLE_I18N */
3218 if (BE (*err
!= REG_NOERROR
, 0))
3219 goto parse_bracket_exp_free_return
;
3223 switch (start_elem
.type
)
3226 bitset_set (sbcset
, start_elem
.opr
.ch
);
3228 #ifdef RE_ENABLE_I18N
3230 /* Check whether the array has enough space. */
3231 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3233 wchar_t *new_mbchars
;
3234 /* Not enough, realloc it. */
3235 /* +1 in case of mbcset->nmbchars is 0. */
3236 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3237 /* Use realloc since array is NULL if *alloc == 0. */
3238 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3240 if (BE (new_mbchars
== NULL
, 0))
3241 goto parse_bracket_exp_espace
;
3242 mbcset
->mbchars
= new_mbchars
;
3244 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3246 #endif /* RE_ENABLE_I18N */
3248 *err
= build_equiv_class (sbcset
,
3249 #ifdef RE_ENABLE_I18N
3250 mbcset
, &equiv_class_alloc
,
3251 #endif /* RE_ENABLE_I18N */
3252 start_elem
.opr
.name
);
3253 if (BE (*err
!= REG_NOERROR
, 0))
3254 goto parse_bracket_exp_free_return
;
3257 *err
= build_collating_symbol (sbcset
,
3258 #ifdef RE_ENABLE_I18N
3259 mbcset
, &coll_sym_alloc
,
3260 #endif /* RE_ENABLE_I18N */
3261 start_elem
.opr
.name
);
3262 if (BE (*err
!= REG_NOERROR
, 0))
3263 goto parse_bracket_exp_free_return
;
3266 *err
= build_charclass (regexp
->trans
, sbcset
,
3267 #ifdef RE_ENABLE_I18N
3268 mbcset
, &char_class_alloc
,
3269 #endif /* RE_ENABLE_I18N */
3270 start_elem
.opr
.name
, syntax
);
3271 if (BE (*err
!= REG_NOERROR
, 0))
3272 goto parse_bracket_exp_free_return
;
3279 if (BE (token
->type
== END_OF_RE
, 0))
3282 goto parse_bracket_exp_free_return
;
3284 if (token
->type
== OP_CLOSE_BRACKET
)
3288 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3290 /* If it is non-matching list. */
3292 bitset_not (sbcset
);
3294 #ifdef RE_ENABLE_I18N
3295 /* Ensure only single byte characters are set. */
3296 if (dfa
->mb_cur_max
> 1)
3297 bitset_mask (sbcset
, dfa
->sb_char
);
3299 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3300 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3301 || mbcset
->non_match
)))
3303 bin_tree_t
*mbc_tree
;
3305 /* Build a tree for complex bracket. */
3306 dfa
->has_mb_node
= 1;
3307 br_token
.type
= COMPLEX_BRACKET
;
3308 br_token
.opr
.mbcset
= mbcset
;
3309 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3310 if (BE (mbc_tree
== NULL
, 0))
3311 goto parse_bracket_exp_espace
;
3312 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3313 if (sbcset
[sbc_idx
])
3315 /* If there are no bits set in sbcset, there is no point
3316 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3317 if (sbc_idx
< BITSET_UINTS
)
3319 /* Build a tree for simple bracket. */
3320 br_token
.type
= SIMPLE_BRACKET
;
3321 br_token
.opr
.sbcset
= sbcset
;
3322 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3323 if (BE (work_tree
== NULL
, 0))
3324 goto parse_bracket_exp_espace
;
3326 /* Then join them by ALT node. */
3327 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3328 if (BE (work_tree
== NULL
, 0))
3329 goto parse_bracket_exp_espace
;
3334 work_tree
= mbc_tree
;
3338 #endif /* not RE_ENABLE_I18N */
3340 #ifdef RE_ENABLE_I18N
3341 free_charset (mbcset
);
3343 /* Build a tree for simple bracket. */
3344 br_token
.type
= SIMPLE_BRACKET
;
3345 br_token
.opr
.sbcset
= sbcset
;
3346 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3347 if (BE (work_tree
== NULL
, 0))
3348 goto parse_bracket_exp_espace
;
3352 parse_bracket_exp_espace
:
3354 parse_bracket_exp_free_return
:
3356 #ifdef RE_ENABLE_I18N
3357 free_charset (mbcset
);
3358 #endif /* RE_ENABLE_I18N */
3362 /* Parse an element in the bracket expression. */
3364 static reg_errcode_t
3365 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3367 bracket_elem_t
*elem
;
3368 re_string_t
*regexp
;
3372 reg_syntax_t syntax
;
3375 #ifdef RE_ENABLE_I18N
3377 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3378 if (cur_char_size
> 1)
3380 elem
->type
= MB_CHAR
;
3381 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3382 re_string_skip_bytes (regexp
, cur_char_size
);
3385 #endif /* RE_ENABLE_I18N */
3386 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3387 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3388 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3389 return parse_bracket_symbol (elem
, regexp
, token
);
3390 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3392 /* A '-' must only appear as anything but a range indicator before
3393 the closing bracket. Everything else is an error. */
3395 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3396 if (token2
.type
!= OP_CLOSE_BRACKET
)
3397 /* The actual error value is not standardized since this whole
3398 case is undefined. But ERANGE makes good sense. */
3401 elem
->type
= SB_CHAR
;
3402 elem
->opr
.ch
= token
->opr
.c
;
3406 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3407 such as [:<character_class>:], [.<collating_element>.], and
3408 [=<equivalent_class>=]. */
3410 static reg_errcode_t
3411 parse_bracket_symbol (elem
, regexp
, token
)
3412 bracket_elem_t
*elem
;
3413 re_string_t
*regexp
;
3416 unsigned char ch
, delim
= token
->opr
.c
;
3418 if (re_string_eoi(regexp
))
3422 if (i
>= BRACKET_NAME_BUF_SIZE
)
3424 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3425 ch
= re_string_fetch_byte_case (regexp
);
3427 ch
= re_string_fetch_byte (regexp
);
3428 if (re_string_eoi(regexp
))
3430 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3432 elem
->opr
.name
[i
] = ch
;
3434 re_string_skip_bytes (regexp
, 1);
3435 elem
->opr
.name
[i
] = '\0';
3436 switch (token
->type
)
3438 case OP_OPEN_COLL_ELEM
:
3439 elem
->type
= COLL_SYM
;
3441 case OP_OPEN_EQUIV_CLASS
:
3442 elem
->type
= EQUIV_CLASS
;
3444 case OP_OPEN_CHAR_CLASS
:
3445 elem
->type
= CHAR_CLASS
;
3453 /* Helper function for parse_bracket_exp.
3454 Build the equivalence class which is represented by NAME.
3455 The result are written to MBCSET and SBCSET.
3456 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3457 is a pointer argument sinse we may update it. */
3459 static reg_errcode_t
3460 #ifdef RE_ENABLE_I18N
3461 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3462 re_charset_t
*mbcset
;
3463 int *equiv_class_alloc
;
3464 #else /* not RE_ENABLE_I18N */
3465 build_equiv_class (sbcset
, name
)
3466 #endif /* not RE_ENABLE_I18N */
3467 re_bitset_ptr_t sbcset
;
3468 const unsigned char *name
;
3471 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3474 const int32_t *table
, *indirect
;
3475 const unsigned char *weights
, *extra
, *cp
;
3476 unsigned char char_buf
[2];
3480 /* This #include defines a local function! */
3481 # include <locale/weight.h>
3482 /* Calculate the index for equivalence class. */
3484 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3485 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3486 _NL_COLLATE_WEIGHTMB
);
3487 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3488 _NL_COLLATE_EXTRAMB
);
3489 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3490 _NL_COLLATE_INDIRECTMB
);
3491 idx1
= findidx (&cp
);
3492 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3493 /* This isn't a valid character. */
3494 return REG_ECOLLATE
;
3496 /* Build single byte matcing table for this equivalence class. */
3497 char_buf
[1] = (unsigned char) '\0';
3498 len
= weights
[idx1
];
3499 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3503 idx2
= findidx (&cp
);
3508 /* This isn't a valid character. */
3510 if (len
== weights
[idx2
])
3513 while (cnt
<= len
&&
3514 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3518 bitset_set (sbcset
, ch
);
3521 /* Check whether the array has enough space. */
3522 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3524 /* Not enough, realloc it. */
3525 /* +1 in case of mbcset->nequiv_classes is 0. */
3526 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3527 /* Use realloc since the array is NULL if *alloc == 0. */
3528 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3530 new_equiv_class_alloc
);
3531 if (BE (new_equiv_classes
== NULL
, 0))
3533 mbcset
->equiv_classes
= new_equiv_classes
;
3534 *equiv_class_alloc
= new_equiv_class_alloc
;
3536 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3541 if (BE (strlen ((const char *) name
) != 1, 0))
3542 return REG_ECOLLATE
;
3543 bitset_set (sbcset
, *name
);
3548 /* Helper function for parse_bracket_exp.
3549 Build the character class which is represented by NAME.
3550 The result are written to MBCSET and SBCSET.
3551 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3552 is a pointer argument sinse we may update it. */
3554 static reg_errcode_t
3555 #ifdef RE_ENABLE_I18N
3556 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3557 re_charset_t
*mbcset
;
3558 int *char_class_alloc
;
3559 #else /* not RE_ENABLE_I18N */
3560 build_charclass (trans
, sbcset
, class_name
, syntax
)
3561 #endif /* not RE_ENABLE_I18N */
3562 RE_TRANSLATE_TYPE trans
;
3563 re_bitset_ptr_t sbcset
;
3564 const unsigned char *class_name
;
3565 reg_syntax_t syntax
;
3568 const char *name
= (const char *) class_name
;
3570 /* In case of REG_ICASE "upper" and "lower" match the both of
3571 upper and lower cases. */
3572 if ((syntax
& RE_ICASE
)
3573 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3576 #ifdef RE_ENABLE_I18N
3577 /* Check the space of the arrays. */
3578 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3580 /* Not enough, realloc it. */
3581 /* +1 in case of mbcset->nchar_classes is 0. */
3582 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3583 /* Use realloc since array is NULL if *alloc == 0. */
3584 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3585 new_char_class_alloc
);
3586 if (BE (new_char_classes
== NULL
, 0))
3588 mbcset
->char_classes
= new_char_classes
;
3589 *char_class_alloc
= new_char_class_alloc
;
3591 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3592 #endif /* RE_ENABLE_I18N */
3594 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3595 for (i = 0; i < SBC_MAX; ++i) \
3597 if (ctype_func (i)) \
3599 int ch = trans ? trans[i] : i; \
3600 bitset_set (sbcset, ch); \
3604 if (strcmp (name
, "alnum") == 0)
3605 BUILD_CHARCLASS_LOOP (isalnum
)
3606 else if (strcmp (name
, "cntrl") == 0)
3607 BUILD_CHARCLASS_LOOP (iscntrl
)
3608 else if (strcmp (name
, "lower") == 0)
3609 BUILD_CHARCLASS_LOOP (islower
)
3610 else if (strcmp (name
, "space") == 0)
3611 BUILD_CHARCLASS_LOOP (isspace
)
3612 else if (strcmp (name
, "alpha") == 0)
3613 BUILD_CHARCLASS_LOOP (isalpha
)
3614 else if (strcmp (name
, "digit") == 0)
3615 BUILD_CHARCLASS_LOOP (isdigit
)
3616 else if (strcmp (name
, "print") == 0)
3617 BUILD_CHARCLASS_LOOP (isprint
)
3618 else if (strcmp (name
, "upper") == 0)
3619 BUILD_CHARCLASS_LOOP (isupper
)
3620 else if (strcmp (name
, "blank") == 0)
3621 BUILD_CHARCLASS_LOOP (isblank
)
3622 else if (strcmp (name
, "graph") == 0)
3623 BUILD_CHARCLASS_LOOP (isgraph
)
3624 else if (strcmp (name
, "punct") == 0)
3625 BUILD_CHARCLASS_LOOP (ispunct
)
3626 else if (strcmp (name
, "xdigit") == 0)
3627 BUILD_CHARCLASS_LOOP (isxdigit
)
3635 build_charclass_op (dfa
, trans
, class_name
, extra
, non_match
, err
)
3637 RE_TRANSLATE_TYPE trans
;
3638 const unsigned char *class_name
;
3639 const unsigned char *extra
;
3643 re_bitset_ptr_t sbcset
;
3644 #ifdef RE_ENABLE_I18N
3645 re_charset_t
*mbcset
;
3647 #endif /* not RE_ENABLE_I18N */
3649 re_token_t br_token
;
3652 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3653 #ifdef RE_ENABLE_I18N
3654 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3655 #endif /* RE_ENABLE_I18N */
3657 #ifdef RE_ENABLE_I18N
3658 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3659 #else /* not RE_ENABLE_I18N */
3660 if (BE (sbcset
== NULL
, 0))
3661 #endif /* not RE_ENABLE_I18N */
3669 #ifdef RE_ENABLE_I18N
3671 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3672 bitset_set(cset->sbcset, '\0');
3674 mbcset
->non_match
= 1;
3675 #endif /* not RE_ENABLE_I18N */
3678 /* We don't care the syntax in this case. */
3679 ret
= build_charclass (trans
, sbcset
,
3680 #ifdef RE_ENABLE_I18N
3682 #endif /* RE_ENABLE_I18N */
3685 if (BE (ret
!= REG_NOERROR
, 0))
3688 #ifdef RE_ENABLE_I18N
3689 free_charset (mbcset
);
3690 #endif /* RE_ENABLE_I18N */
3694 /* \w match '_' also. */
3695 for (; *extra
; extra
++)
3696 bitset_set (sbcset
, *extra
);
3698 /* If it is non-matching list. */
3700 bitset_not (sbcset
);
3702 #ifdef RE_ENABLE_I18N
3703 /* Ensure only single byte characters are set. */
3704 if (dfa
->mb_cur_max
> 1)
3705 bitset_mask (sbcset
, dfa
->sb_char
);
3708 /* Build a tree for simple bracket. */
3709 br_token
.type
= SIMPLE_BRACKET
;
3710 br_token
.opr
.sbcset
= sbcset
;
3711 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3712 if (BE (tree
== NULL
, 0))
3713 goto build_word_op_espace
;
3715 #ifdef RE_ENABLE_I18N
3716 if (dfa
->mb_cur_max
> 1)
3718 bin_tree_t
*mbc_tree
;
3719 /* Build a tree for complex bracket. */
3720 br_token
.type
= COMPLEX_BRACKET
;
3721 br_token
.opr
.mbcset
= mbcset
;
3722 dfa
->has_mb_node
= 1;
3723 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3724 if (BE (mbc_tree
== NULL
, 0))
3725 goto build_word_op_espace
;
3726 /* Then join them by ALT node. */
3727 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3728 if (BE (mbc_tree
!= NULL
, 1))
3733 free_charset (mbcset
);
3736 #else /* not RE_ENABLE_I18N */
3738 #endif /* not RE_ENABLE_I18N */
3740 build_word_op_espace
:
3742 #ifdef RE_ENABLE_I18N
3743 free_charset (mbcset
);
3744 #endif /* RE_ENABLE_I18N */
3749 /* This is intended for the expressions like "a{1,3}".
3750 Fetch a number from `input', and return the number.
3751 Return -1, if the number field is empty like "{,1}".
3752 Return -2, If an error is occured. */
3755 fetch_number (input
, token
, syntax
)
3758 reg_syntax_t syntax
;
3764 fetch_token (token
, input
, syntax
);
3766 if (BE (token
->type
== END_OF_RE
, 0))
3768 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3770 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3771 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3772 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3777 #ifdef RE_ENABLE_I18N
3779 free_charset (re_charset_t
*cset
)
3781 re_free (cset
->mbchars
);
3783 re_free (cset
->coll_syms
);
3784 re_free (cset
->equiv_classes
);
3785 re_free (cset
->range_starts
);
3786 re_free (cset
->range_ends
);
3788 re_free (cset
->char_classes
);
3791 #endif /* RE_ENABLE_I18N */
3793 /* Functions for binary tree operation. */
3795 /* Create a tree node. */
3798 create_tree (dfa
, left
, right
, type
)
3802 re_token_type_t type
;
3806 return create_token_tree (dfa
, left
, right
, &t
);
3810 create_token_tree (dfa
, left
, right
, token
)
3814 const re_token_t
*token
;
3817 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3819 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3821 if (storage
== NULL
)
3823 storage
->next
= dfa
->str_tree_storage
;
3824 dfa
->str_tree_storage
= storage
;
3825 dfa
->str_tree_storage_idx
= 0;
3827 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3829 tree
->parent
= NULL
;
3831 tree
->right
= right
;
3832 tree
->token
= *token
;
3833 tree
->token
.duplicated
= 0;
3834 tree
->token
.opt_subexp
= 0;
3837 tree
->node_idx
= -1;
3840 left
->parent
= tree
;
3842 right
->parent
= tree
;
3846 /* Mark the tree SRC as an optional subexpression.
3847 To be called from preorder or postorder. */
3849 static reg_errcode_t
3850 mark_opt_subexp (extra
, node
)
3854 int idx
= (int) (long) extra
;
3855 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3856 node
->token
.opt_subexp
= 1;
3861 /* Free the allocated memory inside NODE. */
3864 free_token (re_token_t
*node
)
3866 #ifdef RE_ENABLE_I18N
3867 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3868 free_charset (node
->opr
.mbcset
);
3870 #endif /* RE_ENABLE_I18N */
3871 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3872 re_free (node
->opr
.sbcset
);
3875 /* Worker function for tree walking. Free the allocated memory inside NODE
3876 and its children. */
3878 static reg_errcode_t
3879 free_tree (void *extra
, bin_tree_t
*node
)
3881 free_token (&node
->token
);
3886 /* Duplicate the node SRC, and return new node. This is a preorder
3887 visit similar to the one implemented by the generic visitor, but
3888 we need more infrastructure to maintain two parallel trees --- so,
3889 it's easier to duplicate. */
3892 duplicate_tree (root
, dfa
)
3893 const bin_tree_t
*root
;
3896 const bin_tree_t
*node
;
3897 bin_tree_t
*dup_root
;
3898 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3900 for (node
= root
; ; )
3902 /* Create a new tree and link it back to the current parent. */
3903 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3906 (*p_new
)->parent
= dup_node
;
3907 (*p_new
)->token
.duplicated
= 1;
3910 /* Go to the left node, or up and to the right. */
3914 p_new
= &dup_node
->left
;
3918 const bin_tree_t
*prev
= NULL
;
3919 while (node
->right
== prev
|| node
->right
== NULL
)
3922 node
= node
->parent
;
3923 dup_node
= dup_node
->parent
;
3928 p_new
= &dup_node
->right
;