1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 size_t length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, size_t pat_len
);
28 static void free_charset (re_charset_t
*cset
);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t
*preg
);
31 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
33 static void optimize_utf8 (re_dfa_t
*dfa
);
35 static reg_errcode_t
analyze (regex_t
*preg
);
36 static reg_errcode_t
preorder (bin_tree_t
*root
,
37 reg_errcode_t (fn (void *, bin_tree_t
*)),
39 static reg_errcode_t
postorder (bin_tree_t
*root
,
40 reg_errcode_t (fn (void *, bin_tree_t
*)),
42 static reg_errcode_t
optimize_subexps (void *extra
, bin_tree_t
*node
);
43 static reg_errcode_t
lower_subexps (void *extra
, bin_tree_t
*node
);
44 static bin_tree_t
*lower_subexp (reg_errcode_t
*err
, regex_t
*preg
,
46 static reg_errcode_t
calc_first (void *extra
, bin_tree_t
*node
);
47 static reg_errcode_t
calc_next (void *extra
, bin_tree_t
*node
);
48 static reg_errcode_t
link_nfa_nodes (void *extra
, bin_tree_t
*node
);
49 static int duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
);
50 static int search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
51 unsigned int constraint
);
52 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
53 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
55 static reg_errcode_t
calc_inveclosure (re_dfa_t
*dfa
);
56 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
58 static int peek_token (re_token_t
*token
, re_string_t
*input
,
59 reg_syntax_t syntax
) internal_function
;
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 int nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 int nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 int nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 int nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
92 int *equiv_class_alloc
,
93 const unsigned char *name
);
94 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
97 int *char_class_alloc
,
98 const unsigned char *class_name
,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t
build_equiv_class (bitset_t sbcset
,
102 const unsigned char *name
);
103 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
105 const unsigned char *class_name
,
106 reg_syntax_t syntax
);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
,
109 RE_TRANSLATE_TYPE trans
,
110 const unsigned char *class_name
,
111 const unsigned char *extra
,
112 int non_match
, reg_errcode_t
*err
);
113 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
114 bin_tree_t
*left
, bin_tree_t
*right
,
115 re_token_type_t type
);
116 static bin_tree_t
*create_token_tree (re_dfa_t
*dfa
,
117 bin_tree_t
*left
, bin_tree_t
*right
,
118 const re_token_t
*token
);
119 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
120 static void free_token (re_token_t
*node
);
121 static reg_errcode_t
free_tree (void *extra
, bin_tree_t
*node
);
122 static reg_errcode_t
mark_opt_subexp (void *extra
, bin_tree_t
*node
);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid
[] attribute_hidden
=
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx
[] attribute_hidden
=
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
214 re_compile_pattern (pattern
, length
, bufp
)
217 struct re_pattern_buffer
*bufp
;
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp
->no_sub
= !!(re_syntax_options
& RE_NO_SUB
);
226 /* Match anchors at newline. */
227 bufp
->newline_anchor
= 1;
229 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
233 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
236 weak_alias (__re_compile_pattern
, re_compile_pattern
)
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options
;
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
255 re_set_syntax (syntax
)
258 reg_syntax_t ret
= re_syntax_options
;
260 re_syntax_options
= syntax
;
264 weak_alias (__re_set_syntax
, re_set_syntax
)
268 re_compile_fastmap (bufp
)
269 struct re_pattern_buffer
*bufp
;
271 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
272 char *fastmap
= bufp
->fastmap
;
274 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
275 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
276 if (dfa
->init_state
!= dfa
->init_state_word
)
277 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
278 if (dfa
->init_state
!= dfa
->init_state_nl
)
279 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
280 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
281 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
282 bufp
->fastmap_accurate
= 1;
286 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
290 __attribute ((always_inline
))
291 re_set_fastmap (char *fastmap
, int icase
, int ch
)
295 fastmap
[tolower (ch
)] = 1;
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
302 re_compile_fastmap_iter (regex_t
*bufp
, const re_dfastate_t
*init_state
,
305 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
307 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
308 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
310 int node
= init_state
->nodes
.elems
[node_cnt
];
311 re_token_type_t type
= dfa
->nodes
[node
].type
;
313 if (type
== CHARACTER
)
315 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
319 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
324 *p
++ = dfa
->nodes
[node
].opr
.c
;
325 while (++node
< dfa
->nodes_len
326 && dfa
->nodes
[node
].type
== CHARACTER
327 && dfa
->nodes
[node
].mb_partial
)
328 *p
++ = dfa
->nodes
[node
].opr
.c
;
329 memset (&state
, '\0', sizeof (state
));
330 if (__mbrtowc (&wc
, (const char *) buf
, p
- buf
,
332 && (__wcrtomb ((char *) buf
, towlower (wc
), &state
)
334 re_set_fastmap (fastmap
, 0, buf
[0]);
338 else if (type
== SIMPLE_BRACKET
)
341 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
344 bitset_word_t w
= dfa
->nodes
[node
].opr
.sbcset
[i
];
345 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
346 if (w
& ((bitset_word_t
) 1 << j
))
347 re_set_fastmap (fastmap
, icase
, ch
);
350 #ifdef RE_ENABLE_I18N
351 else if (type
== COMPLEX_BRACKET
)
353 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
357 /* See if we have to try all bytes which start multiple collation
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0
364 && (cset
->ncoll_syms
|| cset
->nranges
))
366 const int32_t *table
= (const int32_t *)
367 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
368 for (i
= 0; i
< SBC_MAX
; ++i
)
370 re_set_fastmap (fastmap
, icase
, i
);
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa
->mb_cur_max
> 1
379 && (cset
->nchar_classes
|| cset
->non_match
|| cset
->nranges
381 || cset
->nequiv_classes
389 memset (&mbs
, 0, sizeof (mbs
));
390 if (__mbrtowc (NULL
, (char *) &c
, 1, &mbs
) == (size_t) -2)
391 re_set_fastmap (fastmap
, false, (int) c
);
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i
= 0; i
< cset
->nmbchars
; ++i
)
403 memset (&state
, '\0', sizeof (state
));
404 if (__wcrtomb (buf
, cset
->mbchars
[i
], &state
) != (size_t) -1)
405 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
406 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
408 if (__wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
)
410 re_set_fastmap (fastmap
, false, *(unsigned char *) buf
);
415 #endif /* RE_ENABLE_I18N */
416 else if (type
== OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type
== OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type
== END_OF_RE
)
422 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
423 if (type
== END_OF_RE
)
424 bufp
->can_be_null
= 1;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 `buffer' to the compiled pattern;
437 `used' to the length of the compiled pattern;
438 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 `fastmap' to an allocated space for the fastmap;
443 `fastmap_accurate' to zero;
444 `re_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (preg
, pattern
, cflags
)
468 regex_t
*__restrict preg
;
469 const char *__restrict pattern
;
473 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
474 : RE_SYNTAX_POSIX_BASIC
);
480 /* Try to allocate space for the fastmap. */
481 preg
->fastmap
= re_malloc (char, SBC_MAX
);
482 if (BE (preg
->fastmap
== NULL
, 0))
485 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
487 /* If REG_NEWLINE is set, newlines are treated differently. */
488 if (cflags
& REG_NEWLINE
)
489 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
490 syntax
&= ~RE_DOT_NEWLINE
;
491 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
492 /* It also changes the matching behavior. */
493 preg
->newline_anchor
= 1;
496 preg
->newline_anchor
= 0;
497 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
498 preg
->translate
= NULL
;
500 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
502 /* POSIX doesn't distinguish between an unmatched open-group and an
503 unmatched close-group: both are REG_EPAREN. */
504 if (ret
== REG_ERPAREN
)
507 /* We have already checked preg->fastmap != NULL. */
508 if (BE (ret
== REG_NOERROR
, 1))
509 /* Compute the fastmap now, since regexec cannot modify the pattern
510 buffer. This function never fails in this implementation. */
511 (void) re_compile_fastmap (preg
);
514 /* Some error occurred while compiling the expression. */
515 re_free (preg
->fastmap
);
516 preg
->fastmap
= NULL
;
522 weak_alias (__regcomp
, regcomp
)
525 /* Returns a message corresponding to an error code, ERRCODE, returned
526 from either regcomp or regexec. We don't use PREG here. */
529 regerror (errcode
, preg
, errbuf
, errbuf_size
)
531 const regex_t
*__restrict preg
;
532 char *__restrict errbuf
;
539 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
540 / sizeof (__re_error_msgid_idx
[0])), 0))
541 /* Only error codes returned by the rest of the code should be passed
542 to this routine. If we are given anything else, or if other regex
543 code generates an invalid error code, then the program has a bug.
544 Dump core so we can fix it. */
547 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
549 msg_size
= strlen (msg
) + 1; /* Includes the null. */
551 if (BE (errbuf_size
!= 0, 1))
553 if (BE (msg_size
> errbuf_size
, 0))
555 #if defined HAVE_MEMPCPY || defined _LIBC
556 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
558 memcpy (errbuf
, msg
, errbuf_size
- 1);
559 errbuf
[errbuf_size
- 1] = 0;
563 memcpy (errbuf
, msg
, msg_size
);
569 weak_alias (__regerror
, regerror
)
573 #ifdef RE_ENABLE_I18N
574 /* This static array is used for the map to single-byte characters when
575 UTF-8 is used. Otherwise we would allocate memory just to initialize
576 it the same all the time. UTF-8 is the preferred encoding so this is
577 a worthwhile optimization. */
578 static const bitset_t utf8_sb_map
=
580 /* Set the first 128 bits. */
581 [0 ... 0x80 / BITSET_WORD_BITS
- 1] = BITSET_WORD_MAX
587 free_dfa_content (re_dfa_t
*dfa
)
592 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
593 free_token (dfa
->nodes
+ i
);
594 re_free (dfa
->nexts
);
595 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
597 if (dfa
->eclosures
!= NULL
)
598 re_node_set_free (dfa
->eclosures
+ i
);
599 if (dfa
->inveclosures
!= NULL
)
600 re_node_set_free (dfa
->inveclosures
+ i
);
601 if (dfa
->edests
!= NULL
)
602 re_node_set_free (dfa
->edests
+ i
);
604 re_free (dfa
->edests
);
605 re_free (dfa
->eclosures
);
606 re_free (dfa
->inveclosures
);
607 re_free (dfa
->nodes
);
609 if (dfa
->state_table
)
610 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
612 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
613 for (j
= 0; j
< entry
->num
; ++j
)
615 re_dfastate_t
*state
= entry
->array
[j
];
618 re_free (entry
->array
);
620 re_free (dfa
->state_table
);
621 #ifdef RE_ENABLE_I18N
622 if (dfa
->sb_char
!= utf8_sb_map
)
623 re_free (dfa
->sb_char
);
625 re_free (dfa
->subexp_map
);
627 re_free (dfa
->re_str
);
634 /* Free dynamically allocated space used by PREG. */
640 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
641 if (BE (dfa
!= NULL
, 1))
642 free_dfa_content (dfa
);
646 re_free (preg
->fastmap
);
647 preg
->fastmap
= NULL
;
649 re_free (preg
->translate
);
650 preg
->translate
= NULL
;
653 weak_alias (__regfree
, regfree
)
656 /* Entry points compatible with 4.2 BSD regex library. We don't define
657 them unless specifically requested. */
659 #if defined _REGEX_RE_COMP || defined _LIBC
661 /* BSD has one and only one pattern buffer. */
662 static struct re_pattern_buffer re_comp_buf
;
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667 these names if they don't use our functions, and still use
668 regcomp/regexec above without link errors. */
679 if (!re_comp_buf
.buffer
)
680 return gettext ("No previous regular expression");
684 if (re_comp_buf
.buffer
)
686 fastmap
= re_comp_buf
.fastmap
;
687 re_comp_buf
.fastmap
= NULL
;
688 __regfree (&re_comp_buf
);
689 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
690 re_comp_buf
.fastmap
= fastmap
;
693 if (re_comp_buf
.fastmap
== NULL
)
695 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
696 if (re_comp_buf
.fastmap
== NULL
)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
701 /* Since `re_exec' always passes NULL for the `regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
704 /* Match anchors at newlines. */
705 re_comp_buf
.newline_anchor
= 1;
707 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
712 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
717 libc_freeres_fn (free_mem
)
719 __regfree (&re_comp_buf
);
723 #endif /* _REGEX_RE_COMP */
725 /* Internal entry point.
726 Compile the regular expression PATTERN, whose length is LENGTH.
727 SYNTAX indicate regular expression's syntax. */
730 re_compile_internal (regex_t
*preg
, const char * pattern
, size_t length
,
733 reg_errcode_t err
= REG_NOERROR
;
737 /* Initialize the pattern buffer. */
738 preg
->fastmap_accurate
= 0;
739 preg
->syntax
= syntax
;
740 preg
->not_bol
= preg
->not_eol
= 0;
743 preg
->can_be_null
= 0;
744 preg
->regs_allocated
= REGS_UNALLOCATED
;
746 /* Initialize the dfa. */
747 dfa
= (re_dfa_t
*) preg
->buffer
;
748 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
750 /* If zero allocated, but buffer is non-null, try to realloc
751 enough space. This loses if buffer's address is bogus, but
752 that is the user's responsibility. If ->buffer is NULL this
753 is a simple allocation. */
754 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
757 preg
->allocated
= sizeof (re_dfa_t
);
758 preg
->buffer
= (unsigned char *) dfa
;
760 preg
->used
= sizeof (re_dfa_t
);
762 err
= init_dfa (dfa
, length
);
763 if (BE (err
!= REG_NOERROR
, 0))
765 free_dfa_content (dfa
);
771 /* Note: length+1 will not overflow since it is checked in init_dfa. */
772 dfa
->re_str
= re_malloc (char, length
+ 1);
773 strncpy (dfa
->re_str
, pattern
, length
+ 1);
776 __libc_lock_init (dfa
->lock
);
778 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
779 syntax
& RE_ICASE
, dfa
);
780 if (BE (err
!= REG_NOERROR
, 0))
782 re_compile_internal_free_return
:
783 free_workarea_compile (preg
);
784 re_string_destruct (®exp
);
785 free_dfa_content (dfa
);
791 /* Parse the regular expression, and build a structure tree. */
793 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
794 if (BE (dfa
->str_tree
== NULL
, 0))
795 goto re_compile_internal_free_return
;
797 /* Analyze the tree and create the nfa. */
798 err
= analyze (preg
);
799 if (BE (err
!= REG_NOERROR
, 0))
800 goto re_compile_internal_free_return
;
802 #ifdef RE_ENABLE_I18N
803 /* If possible, do searching in single byte encoding to speed things up. */
804 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
808 /* Then create the initial state of the dfa. */
809 err
= create_initial_state (dfa
);
811 /* Release work areas. */
812 free_workarea_compile (preg
);
813 re_string_destruct (®exp
);
815 if (BE (err
!= REG_NOERROR
, 0))
817 free_dfa_content (dfa
);
825 /* Initialize DFA. We use the length of the regular expression PAT_LEN
826 as the initial length of some arrays. */
829 init_dfa (re_dfa_t
*dfa
, size_t pat_len
)
831 unsigned int table_size
;
836 memset (dfa
, '\0', sizeof (re_dfa_t
));
838 /* Force allocation of str_tree_storage the first time. */
839 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
841 /* Avoid overflows. */
842 if (pat_len
== SIZE_MAX
)
845 dfa
->nodes_alloc
= pat_len
+ 1;
846 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
848 /* table_size = 2 ^ ceil(log pat_len) */
849 for (table_size
= 1; ; table_size
<<= 1)
850 if (table_size
> pat_len
)
853 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
854 dfa
->state_hash_mask
= table_size
- 1;
856 dfa
->mb_cur_max
= MB_CUR_MAX
;
858 if (dfa
->mb_cur_max
== 6
859 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
861 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
864 # ifdef HAVE_LANGINFO_CODESET
865 codeset_name
= nl_langinfo (CODESET
);
867 codeset_name
= getenv ("LC_ALL");
868 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
869 codeset_name
= getenv ("LC_CTYPE");
870 if (codeset_name
== NULL
|| codeset_name
[0] == '\0')
871 codeset_name
= getenv ("LANG");
872 if (codeset_name
== NULL
)
874 else if (strchr (codeset_name
, '.') != NULL
)
875 codeset_name
= strchr (codeset_name
, '.') + 1;
878 if (strcasecmp (codeset_name
, "UTF-8") == 0
879 || strcasecmp (codeset_name
, "UTF8") == 0)
882 /* We check exhaustively in the loop below if this charset is a
883 superset of ASCII. */
884 dfa
->map_notascii
= 0;
887 #ifdef RE_ENABLE_I18N
888 if (dfa
->mb_cur_max
> 1)
891 dfa
->sb_char
= (re_bitset_ptr_t
) utf8_sb_map
;
896 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
897 if (BE (dfa
->sb_char
== NULL
, 0))
900 /* Set the bits corresponding to single byte chars. */
901 for (i
= 0, ch
= 0; i
< BITSET_WORDS
; ++i
)
902 for (j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
904 wint_t wch
= __btowc (ch
);
906 dfa
->sb_char
[i
] |= (bitset_word_t
) 1 << j
;
908 if (isascii (ch
) && wch
!= ch
)
909 dfa
->map_notascii
= 1;
916 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
, 0))
921 /* Initialize WORD_CHAR table, which indicate which character is
922 "word". In this case "word" means that it is the word construction
923 character used by some operators like "\<", "\>", etc. */
927 init_word_char (re_dfa_t
*dfa
)
929 dfa
->word_ops_used
= 1;
932 if (BE (dfa
->map_notascii
== 0, 1))
934 if (sizeof (dfa
->word_char
[0]) == 8)
936 dfa
->word_char
[0] = UINT64_C (0x03ff000000000000);
937 dfa
->word_char
[1] = UINT64_C (0x07fffffe87fffffe);
940 else if (sizeof (dfa
->word_char
[0]) == 4)
942 dfa
->word_char
[0] = UINT32_C (0x00000000);
943 dfa
->word_char
[1] = UINT32_C (0x03ff0000);
944 dfa
->word_char
[2] = UINT32_C (0x87fffffe);
945 dfa
->word_char
[3] = UINT32_C (0x07fffffe);
952 if (BE (dfa
->is_utf8
, 1))
954 memset (&dfa
->word_char
[i
], '\0', (SBC_MAX
- ch
) / 8);
959 for (; i
< BITSET_WORDS
; ++i
)
960 for (int j
= 0; j
< BITSET_WORD_BITS
; ++j
, ++ch
)
961 if (isalnum (ch
) || ch
== '_')
962 dfa
->word_char
[i
] |= (bitset_word_t
) 1 << j
;
965 /* Free the work area which are only used while compiling. */
968 free_workarea_compile (regex_t
*preg
)
970 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
971 bin_tree_storage_t
*storage
, *next
;
972 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
974 next
= storage
->next
;
977 dfa
->str_tree_storage
= NULL
;
978 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
979 dfa
->str_tree
= NULL
;
980 re_free (dfa
->org_indices
);
981 dfa
->org_indices
= NULL
;
984 /* Create initial states for all contexts. */
987 create_initial_state (re_dfa_t
*dfa
)
991 re_node_set init_nodes
;
993 /* Initial states have the epsilon closure of the node which is
994 the first node of the regular expression. */
995 first
= dfa
->str_tree
->first
->node_idx
;
996 dfa
->init_node
= first
;
997 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
998 if (BE (err
!= REG_NOERROR
, 0))
1001 /* The back-references which are in initial states can epsilon transit,
1002 since in this case all of the subexpressions can be null.
1003 Then we add epsilon closures of the nodes which are the next nodes of
1004 the back-references. */
1005 if (dfa
->nbackref
> 0)
1006 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
1008 int node_idx
= init_nodes
.elems
[i
];
1009 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1012 if (type
!= OP_BACK_REF
)
1014 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
1016 re_token_t
*clexp_node
;
1017 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
1018 if (clexp_node
->type
== OP_CLOSE_SUBEXP
1019 && clexp_node
->opr
.idx
== dfa
->nodes
[node_idx
].opr
.idx
)
1022 if (clexp_idx
== init_nodes
.nelem
)
1025 if (type
== OP_BACK_REF
)
1027 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
1028 if (!re_node_set_contains (&init_nodes
, dest_idx
))
1030 reg_errcode_t err
= re_node_set_merge (&init_nodes
,
1033 if (err
!= REG_NOERROR
)
1040 /* It must be the first time to invoke acquire_state. */
1041 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
1042 /* We don't check ERR here, since the initial state must not be NULL. */
1043 if (BE (dfa
->init_state
== NULL
, 0))
1045 if (dfa
->init_state
->has_constraint
)
1047 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1049 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
1051 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
1055 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
1056 || dfa
->init_state_begbuf
== NULL
, 0))
1060 dfa
->init_state_word
= dfa
->init_state_nl
1061 = dfa
->init_state_begbuf
= dfa
->init_state
;
1063 re_node_set_free (&init_nodes
);
1067 #ifdef RE_ENABLE_I18N
1068 /* If it is possible to do searching in single byte encoding instead of UTF-8
1069 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1070 DFA nodes where needed. */
1073 optimize_utf8 (re_dfa_t
*dfa
)
1075 int node
, i
, mb_chars
= 0, has_period
= 0;
1077 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1078 switch (dfa
->nodes
[node
].type
)
1081 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1085 switch (dfa
->nodes
[node
].opr
.ctx_type
)
1093 /* Word anchors etc. cannot be handled. It's okay to test
1094 opr.ctx_type since constraints (for all DFA nodes) are
1095 created by ORing one or more opr.ctx_type values. */
1105 case OP_DUP_ASTERISK
:
1106 case OP_OPEN_SUBEXP
:
1107 case OP_CLOSE_SUBEXP
:
1109 case COMPLEX_BRACKET
:
1111 case SIMPLE_BRACKET
:
1112 /* Just double check. The non-ASCII range starts at 0x80. */
1113 assert (0x80 % BITSET_WORD_BITS
== 0);
1114 for (i
= 0x80 / BITSET_WORD_BITS
; i
< BITSET_WORDS
; ++i
)
1115 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1122 if (mb_chars
|| has_period
)
1123 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1125 if (dfa
->nodes
[node
].type
== CHARACTER
1126 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1127 dfa
->nodes
[node
].mb_partial
= 0;
1128 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1129 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1132 /* The search can be in single byte locale. */
1133 dfa
->mb_cur_max
= 1;
1135 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1139 /* Analyze the structure tree, and calculate "first", "next", "edest",
1140 "eclosure", and "inveclosure". */
1142 static reg_errcode_t
1143 analyze (regex_t
*preg
)
1145 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1148 /* Allocate arrays. */
1149 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1150 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1151 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1152 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1153 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1154 || dfa
->eclosures
== NULL
, 0))
1157 dfa
->subexp_map
= re_malloc (int, preg
->re_nsub
);
1158 if (dfa
->subexp_map
!= NULL
)
1161 for (i
= 0; i
< preg
->re_nsub
; i
++)
1162 dfa
->subexp_map
[i
] = i
;
1163 preorder (dfa
->str_tree
, optimize_subexps
, dfa
);
1164 for (i
= 0; i
< preg
->re_nsub
; i
++)
1165 if (dfa
->subexp_map
[i
] != i
)
1167 if (i
== preg
->re_nsub
)
1169 free (dfa
->subexp_map
);
1170 dfa
->subexp_map
= NULL
;
1174 ret
= postorder (dfa
->str_tree
, lower_subexps
, preg
);
1175 if (BE (ret
!= REG_NOERROR
, 0))
1177 ret
= postorder (dfa
->str_tree
, calc_first
, dfa
);
1178 if (BE (ret
!= REG_NOERROR
, 0))
1180 preorder (dfa
->str_tree
, calc_next
, dfa
);
1181 ret
= preorder (dfa
->str_tree
, link_nfa_nodes
, dfa
);
1182 if (BE (ret
!= REG_NOERROR
, 0))
1184 ret
= calc_eclosure (dfa
);
1185 if (BE (ret
!= REG_NOERROR
, 0))
1188 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1189 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1190 if ((!preg
->no_sub
&& preg
->re_nsub
> 0 && dfa
->has_plural_match
)
1193 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_len
);
1194 if (BE (dfa
->inveclosures
== NULL
, 0))
1196 ret
= calc_inveclosure (dfa
);
1202 /* Our parse trees are very unbalanced, so we cannot use a stack to
1203 implement parse tree visits. Instead, we use parent pointers and
1204 some hairy code in these two functions. */
1205 static reg_errcode_t
1206 postorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1209 bin_tree_t
*node
, *prev
;
1211 for (node
= root
; ; )
1213 /* Descend down the tree, preferably to the left (or to the right
1214 if that's the only child). */
1215 while (node
->left
|| node
->right
)
1223 reg_errcode_t err
= fn (extra
, node
);
1224 if (BE (err
!= REG_NOERROR
, 0))
1226 if (node
->parent
== NULL
)
1229 node
= node
->parent
;
1231 /* Go up while we have a node that is reached from the right. */
1232 while (node
->right
== prev
|| node
->right
== NULL
);
1237 static reg_errcode_t
1238 preorder (bin_tree_t
*root
, reg_errcode_t (fn (void *, bin_tree_t
*)),
1243 for (node
= root
; ; )
1245 reg_errcode_t err
= fn (extra
, node
);
1246 if (BE (err
!= REG_NOERROR
, 0))
1249 /* Go to the left node, or up and to the right. */
1254 bin_tree_t
*prev
= NULL
;
1255 while (node
->right
== prev
|| node
->right
== NULL
)
1258 node
= node
->parent
;
1267 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1268 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1269 backreferences as well. Requires a preorder visit. */
1270 static reg_errcode_t
1271 optimize_subexps (void *extra
, bin_tree_t
*node
)
1273 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1275 if (node
->token
.type
== OP_BACK_REF
&& dfa
->subexp_map
)
1277 int idx
= node
->token
.opr
.idx
;
1278 node
->token
.opr
.idx
= dfa
->subexp_map
[idx
];
1279 dfa
->used_bkref_map
|= 1 << node
->token
.opr
.idx
;
1282 else if (node
->token
.type
== SUBEXP
1283 && node
->left
&& node
->left
->token
.type
== SUBEXP
)
1285 int other_idx
= node
->left
->token
.opr
.idx
;
1287 node
->left
= node
->left
->left
;
1289 node
->left
->parent
= node
;
1291 dfa
->subexp_map
[other_idx
] = dfa
->subexp_map
[node
->token
.opr
.idx
];
1292 if (other_idx
< BITSET_WORD_BITS
)
1293 dfa
->used_bkref_map
&= ~((bitset_word_t
) 1 << other_idx
);
1299 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1300 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1301 static reg_errcode_t
1302 lower_subexps (void *extra
, bin_tree_t
*node
)
1304 regex_t
*preg
= (regex_t
*) extra
;
1305 reg_errcode_t err
= REG_NOERROR
;
1307 if (node
->left
&& node
->left
->token
.type
== SUBEXP
)
1309 node
->left
= lower_subexp (&err
, preg
, node
->left
);
1311 node
->left
->parent
= node
;
1313 if (node
->right
&& node
->right
->token
.type
== SUBEXP
)
1315 node
->right
= lower_subexp (&err
, preg
, node
->right
);
1317 node
->right
->parent
= node
;
1324 lower_subexp (reg_errcode_t
*err
, regex_t
*preg
, bin_tree_t
*node
)
1326 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1327 bin_tree_t
*body
= node
->left
;
1328 bin_tree_t
*op
, *cls
, *tree1
, *tree
;
1331 /* We do not optimize empty subexpressions, because otherwise we may
1332 have bad CONCAT nodes with NULL children. This is obviously not
1333 very common, so we do not lose much. An example that triggers
1334 this case is the sed "script" /\(\)/x. */
1335 && node
->left
!= NULL
1336 && (node
->token
.opr
.idx
>= BITSET_WORD_BITS
1337 || !(dfa
->used_bkref_map
1338 & ((bitset_word_t
) 1 << node
->token
.opr
.idx
))))
1341 /* Convert the SUBEXP node to the concatenation of an
1342 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1343 op
= create_tree (dfa
, NULL
, NULL
, OP_OPEN_SUBEXP
);
1344 cls
= create_tree (dfa
, NULL
, NULL
, OP_CLOSE_SUBEXP
);
1345 tree1
= body
? create_tree (dfa
, body
, cls
, CONCAT
) : cls
;
1346 tree
= create_tree (dfa
, op
, tree1
, CONCAT
);
1347 if (BE (tree
== NULL
|| tree1
== NULL
|| op
== NULL
|| cls
== NULL
, 0))
1353 op
->token
.opr
.idx
= cls
->token
.opr
.idx
= node
->token
.opr
.idx
;
1354 op
->token
.opt_subexp
= cls
->token
.opt_subexp
= node
->token
.opt_subexp
;
1358 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1359 nodes. Requires a postorder visit. */
1360 static reg_errcode_t
1361 calc_first (void *extra
, bin_tree_t
*node
)
1363 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1364 if (node
->token
.type
== CONCAT
)
1366 node
->first
= node
->left
->first
;
1367 node
->node_idx
= node
->left
->node_idx
;
1372 node
->node_idx
= re_dfa_add_node (dfa
, node
->token
);
1373 if (BE (node
->node_idx
== -1, 0))
1375 if (node
->token
.type
== ANCHOR
)
1376 dfa
->nodes
[node
->node_idx
].constraint
= node
->token
.opr
.ctx_type
;
1381 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1382 static reg_errcode_t
1383 calc_next (void *extra
, bin_tree_t
*node
)
1385 switch (node
->token
.type
)
1387 case OP_DUP_ASTERISK
:
1388 node
->left
->next
= node
;
1391 node
->left
->next
= node
->right
->first
;
1392 node
->right
->next
= node
->next
;
1396 node
->left
->next
= node
->next
;
1398 node
->right
->next
= node
->next
;
1404 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1405 static reg_errcode_t
1406 link_nfa_nodes (void *extra
, bin_tree_t
*node
)
1408 re_dfa_t
*dfa
= (re_dfa_t
*) extra
;
1409 int idx
= node
->node_idx
;
1410 reg_errcode_t err
= REG_NOERROR
;
1412 switch (node
->token
.type
)
1418 assert (node
->next
== NULL
);
1421 case OP_DUP_ASTERISK
:
1425 dfa
->has_plural_match
= 1;
1426 if (node
->left
!= NULL
)
1427 left
= node
->left
->first
->node_idx
;
1429 left
= node
->next
->node_idx
;
1430 if (node
->right
!= NULL
)
1431 right
= node
->right
->first
->node_idx
;
1433 right
= node
->next
->node_idx
;
1435 assert (right
> -1);
1436 err
= re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1441 case OP_OPEN_SUBEXP
:
1442 case OP_CLOSE_SUBEXP
:
1443 err
= re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
->node_idx
);
1447 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1448 if (node
->token
.type
== OP_BACK_REF
)
1449 err
= re_node_set_init_1 (dfa
->edests
+ idx
, dfa
->nexts
[idx
]);
1453 assert (!IS_EPSILON_NODE (node
->token
.type
));
1454 dfa
->nexts
[idx
] = node
->next
->node_idx
;
1461 /* Duplicate the epsilon closure of the node ROOT_NODE.
1462 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1463 to their own constraint. */
1465 static reg_errcode_t
1467 duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
, int top_clone_node
,
1468 int root_node
, unsigned int init_constraint
)
1470 int org_node
, clone_node
, ret
;
1471 unsigned int constraint
= init_constraint
;
1472 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1474 int org_dest
, clone_dest
;
1475 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1477 /* If the back reference epsilon-transit, its destination must
1478 also have the constraint. Then duplicate the epsilon closure
1479 of the destination of the back reference, and store it in
1480 edests of the back reference. */
1481 org_dest
= dfa
->nexts
[org_node
];
1482 re_node_set_empty (dfa
->edests
+ clone_node
);
1483 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1484 if (BE (clone_dest
== -1, 0))
1486 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1487 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1488 if (BE (ret
< 0, 0))
1491 else if (dfa
->edests
[org_node
].nelem
== 0)
1493 /* In case of the node can't epsilon-transit, don't duplicate the
1494 destination and store the original destination as the
1495 destination of the node. */
1496 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1499 else if (dfa
->edests
[org_node
].nelem
== 1)
1501 /* In case of the node can epsilon-transit, and it has only one
1503 org_dest
= dfa
->edests
[org_node
].elems
[0];
1504 re_node_set_empty (dfa
->edests
+ clone_node
);
1505 /* If the node is root_node itself, it means the epsilon clsoure
1506 has a loop. Then tie it to the destination of the root_node. */
1507 if (org_node
== root_node
&& clone_node
!= org_node
)
1509 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, org_dest
);
1510 if (BE (ret
< 0, 0))
1514 /* In case of the node has another constraint, add it. */
1515 constraint
|= dfa
->nodes
[org_node
].constraint
;
1516 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1517 if (BE (clone_dest
== -1, 0))
1519 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1520 if (BE (ret
< 0, 0))
1523 else /* dfa->edests[org_node].nelem == 2 */
1525 /* In case of the node can epsilon-transit, and it has two
1526 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1527 org_dest
= dfa
->edests
[org_node
].elems
[0];
1528 re_node_set_empty (dfa
->edests
+ clone_node
);
1529 /* Search for a duplicated node which satisfies the constraint. */
1530 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1531 if (clone_dest
== -1)
1533 /* There is no such duplicated node, create a new one. */
1535 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1536 if (BE (clone_dest
== -1, 0))
1538 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1539 if (BE (ret
< 0, 0))
1541 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1542 root_node
, constraint
);
1543 if (BE (err
!= REG_NOERROR
, 0))
1548 /* There is a duplicated node which satisfies the constraint,
1549 use it to avoid infinite loop. */
1550 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1551 if (BE (ret
< 0, 0))
1555 org_dest
= dfa
->edests
[org_node
].elems
[1];
1556 clone_dest
= duplicate_node (dfa
, org_dest
, constraint
);
1557 if (BE (clone_dest
== -1, 0))
1559 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1560 if (BE (ret
< 0, 0))
1563 org_node
= org_dest
;
1564 clone_node
= clone_dest
;
1569 /* Search for a node which is duplicated from the node ORG_NODE, and
1570 satisfies the constraint CONSTRAINT. */
1573 search_duplicated_node (const re_dfa_t
*dfa
, int org_node
,
1574 unsigned int constraint
)
1577 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1579 if (org_node
== dfa
->org_indices
[idx
]
1580 && constraint
== dfa
->nodes
[idx
].constraint
)
1581 return idx
; /* Found. */
1583 return -1; /* Not found. */
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587 Return the index of the new node, or -1 if insufficient storage is
1591 duplicate_node (re_dfa_t
*dfa
, int org_idx
, unsigned int constraint
)
1593 int dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
]);
1594 if (BE (dup_idx
!= -1, 1))
1596 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1597 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].constraint
;
1598 dfa
->nodes
[dup_idx
].duplicated
= 1;
1600 /* Store the index of the original node. */
1601 dfa
->org_indices
[dup_idx
] = org_idx
;
1606 static reg_errcode_t
1607 calc_inveclosure (re_dfa_t
*dfa
)
1610 for (idx
= 0; idx
< dfa
->nodes_len
; ++idx
)
1611 re_node_set_init_empty (dfa
->inveclosures
+ idx
);
1613 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1615 int *elems
= dfa
->eclosures
[src
].elems
;
1616 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1618 ret
= re_node_set_insert_last (dfa
->inveclosures
+ elems
[idx
], src
);
1619 if (BE (ret
== -1, 0))
1627 /* Calculate "eclosure" for all the node in DFA. */
1629 static reg_errcode_t
1630 calc_eclosure (re_dfa_t
*dfa
)
1632 int node_idx
, incomplete
;
1634 assert (dfa
->nodes_len
> 0);
1637 /* For each nodes, calculate epsilon closure. */
1638 for (node_idx
= 0; ; ++node_idx
)
1641 re_node_set eclosure_elem
;
1642 if (node_idx
== dfa
->nodes_len
)
1651 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1654 /* If we have already calculated, skip it. */
1655 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1657 /* Calculate epsilon closure of `node_idx'. */
1658 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1659 if (BE (err
!= REG_NOERROR
, 0))
1662 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1665 re_node_set_free (&eclosure_elem
);
1671 /* Calculate epsilon closure of NODE. */
1673 static reg_errcode_t
1674 calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
, int node
, int root
)
1678 re_node_set eclosure
;
1681 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1682 if (BE (err
!= REG_NOERROR
, 0))
1685 /* This indicates that we are calculating this node now.
1686 We reference this value to avoid infinite loop. */
1687 dfa
->eclosures
[node
].nelem
= -1;
1689 /* If the current node has constraints, duplicate all nodes
1690 since they must inherit the constraints. */
1691 if (dfa
->nodes
[node
].constraint
1692 && dfa
->edests
[node
].nelem
1693 && !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1695 err
= duplicate_node_closure (dfa
, node
, node
, node
,
1696 dfa
->nodes
[node
].constraint
);
1697 if (BE (err
!= REG_NOERROR
, 0))
1701 /* Expand each epsilon destination nodes. */
1702 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1703 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1705 re_node_set eclosure_elem
;
1706 int edest
= dfa
->edests
[node
].elems
[i
];
1707 /* If calculating the epsilon closure of `edest' is in progress,
1708 return intermediate result. */
1709 if (dfa
->eclosures
[edest
].nelem
== -1)
1714 /* If we haven't calculated the epsilon closure of `edest' yet,
1715 calculate now. Otherwise use calculated epsilon closure. */
1716 if (dfa
->eclosures
[edest
].nelem
== 0)
1718 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1719 if (BE (err
!= REG_NOERROR
, 0))
1723 eclosure_elem
= dfa
->eclosures
[edest
];
1724 /* Merge the epsilon closure of `edest'. */
1725 err
= re_node_set_merge (&eclosure
, &eclosure_elem
);
1726 if (BE (err
!= REG_NOERROR
, 0))
1728 /* If the epsilon closure of `edest' is incomplete,
1729 the epsilon closure of this node is also incomplete. */
1730 if (dfa
->eclosures
[edest
].nelem
== 0)
1733 re_node_set_free (&eclosure_elem
);
1737 /* An epsilon closure includes itself. */
1738 ret
= re_node_set_insert (&eclosure
, node
);
1739 if (BE (ret
< 0, 0))
1741 if (incomplete
&& !root
)
1742 dfa
->eclosures
[node
].nelem
= 0;
1744 dfa
->eclosures
[node
] = eclosure
;
1745 *new_set
= eclosure
;
1749 /* Functions for token which are used in the parser. */
1751 /* Fetch a token from INPUT.
1752 We must not use this function inside bracket expressions. */
1756 fetch_token (re_token_t
*result
, re_string_t
*input
, reg_syntax_t syntax
)
1758 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1761 /* Peek a token from INPUT, and return the length of the token.
1762 We must not use this function inside bracket expressions. */
1766 peek_token (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
1770 if (re_string_eoi (input
))
1772 token
->type
= END_OF_RE
;
1776 c
= re_string_peek_byte (input
, 0);
1779 token
->word_char
= 0;
1780 #ifdef RE_ENABLE_I18N
1781 token
->mb_partial
= 0;
1782 if (input
->mb_cur_max
> 1 &&
1783 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1785 token
->type
= CHARACTER
;
1786 token
->mb_partial
= 1;
1793 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1795 token
->type
= BACK_SLASH
;
1799 c2
= re_string_peek_byte_case (input
, 1);
1801 token
->type
= CHARACTER
;
1802 #ifdef RE_ENABLE_I18N
1803 if (input
->mb_cur_max
> 1)
1805 wint_t wc
= re_string_wchar_at (input
,
1806 re_string_cur_idx (input
) + 1);
1807 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1811 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1816 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1817 token
->type
= OP_ALT
;
1819 case '1': case '2': case '3': case '4': case '5':
1820 case '6': case '7': case '8': case '9':
1821 if (!(syntax
& RE_NO_BK_REFS
))
1823 token
->type
= OP_BACK_REF
;
1824 token
->opr
.idx
= c2
- '1';
1828 if (!(syntax
& RE_NO_GNU_OPS
))
1830 token
->type
= ANCHOR
;
1831 token
->opr
.ctx_type
= WORD_FIRST
;
1835 if (!(syntax
& RE_NO_GNU_OPS
))
1837 token
->type
= ANCHOR
;
1838 token
->opr
.ctx_type
= WORD_LAST
;
1842 if (!(syntax
& RE_NO_GNU_OPS
))
1844 token
->type
= ANCHOR
;
1845 token
->opr
.ctx_type
= WORD_DELIM
;
1849 if (!(syntax
& RE_NO_GNU_OPS
))
1851 token
->type
= ANCHOR
;
1852 token
->opr
.ctx_type
= NOT_WORD_DELIM
;
1856 if (!(syntax
& RE_NO_GNU_OPS
))
1857 token
->type
= OP_WORD
;
1860 if (!(syntax
& RE_NO_GNU_OPS
))
1861 token
->type
= OP_NOTWORD
;
1864 if (!(syntax
& RE_NO_GNU_OPS
))
1865 token
->type
= OP_SPACE
;
1868 if (!(syntax
& RE_NO_GNU_OPS
))
1869 token
->type
= OP_NOTSPACE
;
1872 if (!(syntax
& RE_NO_GNU_OPS
))
1874 token
->type
= ANCHOR
;
1875 token
->opr
.ctx_type
= BUF_FIRST
;
1879 if (!(syntax
& RE_NO_GNU_OPS
))
1881 token
->type
= ANCHOR
;
1882 token
->opr
.ctx_type
= BUF_LAST
;
1886 if (!(syntax
& RE_NO_BK_PARENS
))
1887 token
->type
= OP_OPEN_SUBEXP
;
1890 if (!(syntax
& RE_NO_BK_PARENS
))
1891 token
->type
= OP_CLOSE_SUBEXP
;
1894 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1895 token
->type
= OP_DUP_PLUS
;
1898 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1899 token
->type
= OP_DUP_QUESTION
;
1902 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1903 token
->type
= OP_OPEN_DUP_NUM
;
1906 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1907 token
->type
= OP_CLOSE_DUP_NUM
;
1915 token
->type
= CHARACTER
;
1916 #ifdef RE_ENABLE_I18N
1917 if (input
->mb_cur_max
> 1)
1919 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1920 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1924 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1929 if (syntax
& RE_NEWLINE_ALT
)
1930 token
->type
= OP_ALT
;
1933 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1934 token
->type
= OP_ALT
;
1937 token
->type
= OP_DUP_ASTERISK
;
1940 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1941 token
->type
= OP_DUP_PLUS
;
1944 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1945 token
->type
= OP_DUP_QUESTION
;
1948 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1949 token
->type
= OP_OPEN_DUP_NUM
;
1952 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1953 token
->type
= OP_CLOSE_DUP_NUM
;
1956 if (syntax
& RE_NO_BK_PARENS
)
1957 token
->type
= OP_OPEN_SUBEXP
;
1960 if (syntax
& RE_NO_BK_PARENS
)
1961 token
->type
= OP_CLOSE_SUBEXP
;
1964 token
->type
= OP_OPEN_BRACKET
;
1967 token
->type
= OP_PERIOD
;
1970 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1971 re_string_cur_idx (input
) != 0)
1973 char prev
= re_string_peek_byte (input
, -1);
1974 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1977 token
->type
= ANCHOR
;
1978 token
->opr
.ctx_type
= LINE_FIRST
;
1981 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1982 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1985 re_string_skip_bytes (input
, 1);
1986 peek_token (&next
, input
, syntax
);
1987 re_string_skip_bytes (input
, -1);
1988 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1991 token
->type
= ANCHOR
;
1992 token
->opr
.ctx_type
= LINE_LAST
;
2000 /* Peek a token from INPUT, and return the length of the token.
2001 We must not use this function out of bracket expressions. */
2005 peek_token_bracket (re_token_t
*token
, re_string_t
*input
, reg_syntax_t syntax
)
2008 if (re_string_eoi (input
))
2010 token
->type
= END_OF_RE
;
2013 c
= re_string_peek_byte (input
, 0);
2016 #ifdef RE_ENABLE_I18N
2017 if (input
->mb_cur_max
> 1 &&
2018 !re_string_first_byte (input
, re_string_cur_idx (input
)))
2020 token
->type
= CHARACTER
;
2023 #endif /* RE_ENABLE_I18N */
2025 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
)
2026 && re_string_cur_idx (input
) + 1 < re_string_length (input
))
2028 /* In this case, '\' escape a character. */
2030 re_string_skip_bytes (input
, 1);
2031 c2
= re_string_peek_byte (input
, 0);
2033 token
->type
= CHARACTER
;
2036 if (c
== '[') /* '[' is a special char in a bracket exps. */
2040 if (re_string_cur_idx (input
) + 1 < re_string_length (input
))
2041 c2
= re_string_peek_byte (input
, 1);
2049 token
->type
= OP_OPEN_COLL_ELEM
;
2052 token
->type
= OP_OPEN_EQUIV_CLASS
;
2055 if (syntax
& RE_CHAR_CLASSES
)
2057 token
->type
= OP_OPEN_CHAR_CLASS
;
2060 /* else fall through. */
2062 token
->type
= CHARACTER
;
2072 token
->type
= OP_CHARSET_RANGE
;
2075 token
->type
= OP_CLOSE_BRACKET
;
2078 token
->type
= OP_NON_MATCH_LIST
;
2081 token
->type
= CHARACTER
;
2086 /* Functions for parser. */
2088 /* Entry point of the parser.
2089 Parse the regular expression REGEXP and return the structure tree.
2090 If an error is occured, ERR is set by error code, and return NULL.
2091 This function build the following tree, from regular expression <reg_exp>:
2097 CAT means concatenation.
2098 EOR means end of regular expression. */
2101 parse (re_string_t
*regexp
, regex_t
*preg
, reg_syntax_t syntax
,
2104 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2105 bin_tree_t
*tree
, *eor
, *root
;
2106 re_token_t current_token
;
2107 dfa
->syntax
= syntax
;
2108 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2109 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
2110 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2112 eor
= create_tree (dfa
, NULL
, NULL
, END_OF_RE
);
2114 root
= create_tree (dfa
, tree
, eor
, CONCAT
);
2117 if (BE (eor
== NULL
|| root
== NULL
, 0))
2125 /* This function build the following tree, from regular expression
2126 <branch1>|<branch2>:
2132 ALT means alternative, which represents the operator `|'. */
2135 parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2136 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2138 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2139 bin_tree_t
*tree
, *branch
= NULL
;
2140 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2141 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2144 while (token
->type
== OP_ALT
)
2146 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2147 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2148 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2150 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2151 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2156 tree
= create_tree (dfa
, tree
, branch
, OP_ALT
);
2157 if (BE (tree
== NULL
, 0))
2166 /* This function build the following tree, from regular expression
2173 CAT means concatenation. */
2176 parse_branch (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2177 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2179 bin_tree_t
*tree
, *exp
;
2180 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2181 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2182 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2185 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2186 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2188 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2189 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2192 postorder (tree
, free_tree
, NULL
);
2195 if (tree
!= NULL
&& exp
!= NULL
)
2197 bin_tree_t
*newtree
= create_tree (dfa
, tree
, exp
, CONCAT
);
2198 if (newtree
== NULL
)
2200 postorder (exp
, free_tree
, NULL
);
2201 postorder (tree
, free_tree
, NULL
);
2207 else if (tree
== NULL
)
2209 /* Otherwise exp == NULL, we don't need to create new tree. */
2214 /* This function build the following tree, from regular expression a*:
2221 parse_expression (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2222 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2224 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2226 switch (token
->type
)
2229 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2230 if (BE (tree
== NULL
, 0))
2235 #ifdef RE_ENABLE_I18N
2236 if (dfa
->mb_cur_max
> 1)
2238 while (!re_string_eoi (regexp
)
2239 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2241 bin_tree_t
*mbc_remain
;
2242 fetch_token (token
, regexp
, syntax
);
2243 mbc_remain
= create_token_tree (dfa
, NULL
, NULL
, token
);
2244 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
);
2245 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2254 case OP_OPEN_SUBEXP
:
2255 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2256 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2259 case OP_OPEN_BRACKET
:
2260 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2261 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2265 if (!BE (dfa
->completed_bkref_map
& (1 << token
->opr
.idx
), 1))
2270 dfa
->used_bkref_map
|= 1 << token
->opr
.idx
;
2271 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2272 if (BE (tree
== NULL
, 0))
2278 dfa
->has_mb_node
= 1;
2280 case OP_OPEN_DUP_NUM
:
2281 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2287 case OP_DUP_ASTERISK
:
2289 case OP_DUP_QUESTION
:
2290 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2295 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2297 fetch_token (token
, regexp
, syntax
);
2298 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2300 /* else fall through */
2301 case OP_CLOSE_SUBEXP
:
2302 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2303 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2308 /* else fall through */
2309 case OP_CLOSE_DUP_NUM
:
2310 /* We treat it as a normal character. */
2312 /* Then we can these characters as normal characters. */
2313 token
->type
= CHARACTER
;
2314 /* mb_partial and word_char bits should be initialized already
2316 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2317 if (BE (tree
== NULL
, 0))
2324 if ((token
->opr
.ctx_type
2325 & (WORD_DELIM
| NOT_WORD_DELIM
| WORD_FIRST
| WORD_LAST
))
2326 && dfa
->word_ops_used
== 0)
2327 init_word_char (dfa
);
2328 if (token
->opr
.ctx_type
== WORD_DELIM
2329 || token
->opr
.ctx_type
== NOT_WORD_DELIM
)
2331 bin_tree_t
*tree_first
, *tree_last
;
2332 if (token
->opr
.ctx_type
== WORD_DELIM
)
2334 token
->opr
.ctx_type
= WORD_FIRST
;
2335 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2336 token
->opr
.ctx_type
= WORD_LAST
;
2340 token
->opr
.ctx_type
= INSIDE_WORD
;
2341 tree_first
= create_token_tree (dfa
, NULL
, NULL
, token
);
2342 token
->opr
.ctx_type
= INSIDE_NOTWORD
;
2344 tree_last
= create_token_tree (dfa
, NULL
, NULL
, token
);
2345 tree
= create_tree (dfa
, tree_first
, tree_last
, OP_ALT
);
2346 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2354 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2355 if (BE (tree
== NULL
, 0))
2361 /* We must return here, since ANCHORs can't be followed
2362 by repetition operators.
2363 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2364 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2365 fetch_token (token
, regexp
, syntax
);
2368 tree
= create_token_tree (dfa
, NULL
, NULL
, token
);
2369 if (BE (tree
== NULL
, 0))
2374 if (dfa
->mb_cur_max
> 1)
2375 dfa
->has_mb_node
= 1;
2379 tree
= build_charclass_op (dfa
, regexp
->trans
,
2380 (const unsigned char *) "alnum",
2381 (const unsigned char *) "_",
2382 token
->type
== OP_NOTWORD
, err
);
2383 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2388 tree
= build_charclass_op (dfa
, regexp
->trans
,
2389 (const unsigned char *) "space",
2390 (const unsigned char *) "",
2391 token
->type
== OP_NOTSPACE
, err
);
2392 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2402 /* Must not happen? */
2408 fetch_token (token
, regexp
, syntax
);
2410 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2411 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2413 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2414 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2416 /* In BRE consecutive duplications are not allowed. */
2417 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2418 && (token
->type
== OP_DUP_ASTERISK
2419 || token
->type
== OP_OPEN_DUP_NUM
))
2429 /* This function build the following tree, from regular expression
2437 parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
, re_token_t
*token
,
2438 reg_syntax_t syntax
, int nest
, reg_errcode_t
*err
)
2440 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2443 cur_nsub
= preg
->re_nsub
++;
2445 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2447 /* The subexpression may be a null string. */
2448 if (token
->type
== OP_CLOSE_SUBEXP
)
2452 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2453 if (BE (*err
== REG_NOERROR
&& token
->type
!= OP_CLOSE_SUBEXP
, 0))
2456 postorder (tree
, free_tree
, NULL
);
2459 if (BE (*err
!= REG_NOERROR
, 0))
2463 if (cur_nsub
<= '9' - '1')
2464 dfa
->completed_bkref_map
|= 1 << cur_nsub
;
2466 tree
= create_tree (dfa
, tree
, NULL
, SUBEXP
);
2467 if (BE (tree
== NULL
, 0))
2472 tree
->token
.opr
.idx
= cur_nsub
;
2476 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2479 parse_dup_op (bin_tree_t
*elem
, re_string_t
*regexp
, re_dfa_t
*dfa
,
2480 re_token_t
*token
, reg_syntax_t syntax
, reg_errcode_t
*err
)
2482 bin_tree_t
*tree
= NULL
, *old_tree
= NULL
;
2483 int i
, start
, end
, start_idx
= re_string_cur_idx (regexp
);
2484 re_token_t start_token
= *token
;
2486 if (token
->type
== OP_OPEN_DUP_NUM
)
2489 start
= fetch_number (regexp
, token
, syntax
);
2492 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2493 start
= 0; /* We treat "{,m}" as "{0,m}". */
2496 *err
= REG_BADBR
; /* <re>{} is invalid. */
2500 if (BE (start
!= -2, 1))
2502 /* We treat "{n}" as "{n,n}". */
2503 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2504 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2505 ? fetch_number (regexp
, token
, syntax
) : -2));
2507 if (BE (start
== -2 || end
== -2, 0))
2509 /* Invalid sequence. */
2510 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2512 if (token
->type
== END_OF_RE
)
2520 /* If the syntax bit is set, rollback. */
2521 re_string_set_index (regexp
, start_idx
);
2522 *token
= start_token
;
2523 token
->type
= CHARACTER
;
2524 /* mb_partial and word_char bits should be already initialized by
2529 if (BE ((end
!= -1 && start
> end
) || token
->type
!= OP_CLOSE_DUP_NUM
, 0))
2531 /* First number greater than second. */
2538 start
= (token
->type
== OP_DUP_PLUS
) ? 1 : 0;
2539 end
= (token
->type
== OP_DUP_QUESTION
) ? 1 : -1;
2542 fetch_token (token
, regexp
, syntax
);
2544 if (BE (elem
== NULL
, 0))
2546 if (BE (start
== 0 && end
== 0, 0))
2548 postorder (elem
, free_tree
, NULL
);
2552 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2553 if (BE (start
> 0, 0))
2556 for (i
= 2; i
<= start
; ++i
)
2558 elem
= duplicate_tree (elem
, dfa
);
2559 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2560 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2561 goto parse_dup_op_espace
;
2567 /* Duplicate ELEM before it is marked optional. */
2568 elem
= duplicate_tree (elem
, dfa
);
2574 if (elem
->token
.type
== SUBEXP
)
2575 postorder (elem
, mark_opt_subexp
, (void *) (long) elem
->token
.opr
.idx
);
2577 tree
= create_tree (dfa
, elem
, NULL
, (end
== -1 ? OP_DUP_ASTERISK
: OP_ALT
));
2578 if (BE (tree
== NULL
, 0))
2579 goto parse_dup_op_espace
;
2581 /* This loop is actually executed only when end != -1,
2582 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2583 already created the start+1-th copy. */
2584 for (i
= start
+ 2; i
<= end
; ++i
)
2586 elem
= duplicate_tree (elem
, dfa
);
2587 tree
= create_tree (dfa
, tree
, elem
, CONCAT
);
2588 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2589 goto parse_dup_op_espace
;
2591 tree
= create_tree (dfa
, tree
, NULL
, OP_ALT
);
2592 if (BE (tree
== NULL
, 0))
2593 goto parse_dup_op_espace
;
2597 tree
= create_tree (dfa
, old_tree
, tree
, CONCAT
);
2601 parse_dup_op_espace
:
2606 /* Size of the names for collating symbol/equivalence_class/character_class.
2607 I'm not sure, but maybe enough. */
2608 #define BRACKET_NAME_BUF_SIZE 32
2611 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2612 Build the range expression which starts from START_ELEM, and ends
2613 at END_ELEM. The result are written to MBCSET and SBCSET.
2614 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2615 mbcset->range_ends, is a pointer argument sinse we may
2618 static reg_errcode_t
2620 # ifdef RE_ENABLE_I18N
2621 build_range_exp (bitset_t sbcset
, re_charset_t
*mbcset
, int *range_alloc
,
2622 bracket_elem_t
*start_elem
, bracket_elem_t
*end_elem
)
2623 # else /* not RE_ENABLE_I18N */
2624 build_range_exp (bitset_t sbcset
, bracket_elem_t
*start_elem
,
2625 bracket_elem_t
*end_elem
)
2626 # endif /* not RE_ENABLE_I18N */
2628 unsigned int start_ch
, end_ch
;
2629 /* Equivalence Classes and Character Classes can't be a range start/end. */
2630 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2631 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2635 /* We can handle no multi character collating elements without libc
2637 if (BE ((start_elem
->type
== COLL_SYM
2638 && strlen ((char *) start_elem
->opr
.name
) > 1)
2639 || (end_elem
->type
== COLL_SYM
2640 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2641 return REG_ECOLLATE
;
2643 # ifdef RE_ENABLE_I18N
2648 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2650 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2651 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2653 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2654 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2656 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2657 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2658 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2659 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2660 if (start_wc
== WEOF
|| end_wc
== WEOF
)
2661 return REG_ECOLLATE
;
2662 cmp_buf
[0] = start_wc
;
2663 cmp_buf
[4] = end_wc
;
2664 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2667 /* Got valid collation sequence values, add them as a new entry.
2668 However, for !_LIBC we have no collation elements: if the
2669 character set is single byte, the single byte character set
2670 that we build below suffices. parse_bracket_exp passes
2671 no MBCSET if dfa->mb_cur_max == 1. */
2674 /* Check the space of the arrays. */
2675 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2677 /* There is not enough space, need realloc. */
2678 wchar_t *new_array_start
, *new_array_end
;
2681 /* +1 in case of mbcset->nranges is 0. */
2682 new_nranges
= 2 * mbcset
->nranges
+ 1;
2683 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2684 are NULL if *range_alloc == 0. */
2685 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2687 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2690 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2693 mbcset
->range_starts
= new_array_start
;
2694 mbcset
->range_ends
= new_array_end
;
2695 *range_alloc
= new_nranges
;
2698 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2699 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2702 /* Build the table for single byte characters. */
2703 for (wc
= 0; wc
< SBC_MAX
; ++wc
)
2706 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2707 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2708 bitset_set (sbcset
, wc
);
2711 # else /* not RE_ENABLE_I18N */
2714 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2715 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2717 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2718 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2720 if (start_ch
> end_ch
)
2722 /* Build the table for single byte characters. */
2723 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
2724 if (start_ch
<= ch
&& ch
<= end_ch
)
2725 bitset_set (sbcset
, ch
);
2727 # endif /* not RE_ENABLE_I18N */
2730 #endif /* not _LIBC */
2733 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2734 Build the collating element which is represented by NAME.
2735 The result are written to MBCSET and SBCSET.
2736 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2737 pointer argument since we may update it. */
2739 static reg_errcode_t
2741 # ifdef RE_ENABLE_I18N
2742 build_collating_symbol (bitset_t sbcset
, re_charset_t
*mbcset
,
2743 int *coll_sym_alloc
, const unsigned char *name
)
2744 # else /* not RE_ENABLE_I18N */
2745 build_collating_symbol (bitset_t sbcset
, const unsigned char *name
)
2746 # endif /* not RE_ENABLE_I18N */
2748 size_t name_len
= strlen ((const char *) name
);
2749 if (BE (name_len
!= 1, 0))
2750 return REG_ECOLLATE
;
2753 bitset_set (sbcset
, name
[0]);
2757 #endif /* not _LIBC */
2759 /* This function parse bracket expression like "[abc]", "[a-c]",
2763 parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
, re_token_t
*token
,
2764 reg_syntax_t syntax
, reg_errcode_t
*err
)
2767 const unsigned char *collseqmb
;
2768 const char *collseqwc
;
2771 const int32_t *symb_table
;
2772 const unsigned char *extra
;
2774 /* Local function for parse_bracket_exp used in _LIBC environement.
2775 Seek the collating symbol entry correspondings to NAME.
2776 Return the index of the symbol in the SYMB_TABLE. */
2779 __attribute ((always_inline
))
2780 seek_collating_symbol_entry (name
, name_len
)
2781 const unsigned char *name
;
2784 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2785 int32_t elem
= hash
% table_size
;
2786 if (symb_table
[2 * elem
] != 0)
2788 int32_t second
= hash
% (table_size
- 2) + 1;
2792 /* First compare the hashing value. */
2793 if (symb_table
[2 * elem
] == hash
2794 /* Compare the length of the name. */
2795 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2796 /* Compare the name. */
2797 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2800 /* Yep, this is the entry. */
2807 while (symb_table
[2 * elem
] != 0);
2812 /* Local function for parse_bracket_exp used in _LIBC environment.
2813 Look up the collation sequence value of BR_ELEM.
2814 Return the value if succeeded, UINT_MAX otherwise. */
2816 auto inline unsigned int
2817 __attribute ((always_inline
))
2818 lookup_collation_sequence_value (br_elem
)
2819 bracket_elem_t
*br_elem
;
2821 if (br_elem
->type
== SB_CHAR
)
2824 if (MB_CUR_MAX == 1)
2827 return collseqmb
[br_elem
->opr
.ch
];
2830 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2831 return __collseq_table_lookup (collseqwc
, wc
);
2834 else if (br_elem
->type
== MB_CHAR
)
2837 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2839 else if (br_elem
->type
== COLL_SYM
)
2841 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2845 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2847 if (symb_table
[2 * elem
] != 0)
2849 /* We found the entry. */
2850 idx
= symb_table
[2 * elem
+ 1];
2851 /* Skip the name of collating element name. */
2852 idx
+= 1 + extra
[idx
];
2853 /* Skip the byte sequence of the collating element. */
2854 idx
+= 1 + extra
[idx
];
2855 /* Adjust for the alignment. */
2856 idx
= (idx
+ 3) & ~3;
2857 /* Skip the multibyte collation sequence value. */
2858 idx
+= sizeof (unsigned int);
2859 /* Skip the wide char sequence of the collating element. */
2860 idx
+= sizeof (unsigned int) *
2861 (1 + *(unsigned int *) (extra
+ idx
));
2862 /* Return the collation sequence value. */
2863 return *(unsigned int *) (extra
+ idx
);
2865 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2867 /* No valid character. Match it as a single byte
2869 return collseqmb
[br_elem
->opr
.name
[0]];
2872 else if (sym_name_len
== 1)
2873 return collseqmb
[br_elem
->opr
.name
[0]];
2878 /* Local function for parse_bracket_exp used in _LIBC environement.
2879 Build the range expression which starts from START_ELEM, and ends
2880 at END_ELEM. The result are written to MBCSET and SBCSET.
2881 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2882 mbcset->range_ends, is a pointer argument sinse we may
2885 auto inline reg_errcode_t
2886 __attribute ((always_inline
))
2887 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2888 re_charset_t
*mbcset
;
2891 bracket_elem_t
*start_elem
, *end_elem
;
2894 uint32_t start_collseq
;
2895 uint32_t end_collseq
;
2897 /* Equivalence Classes and Character Classes can't be a range
2899 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2900 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2904 start_collseq
= lookup_collation_sequence_value (start_elem
);
2905 end_collseq
= lookup_collation_sequence_value (end_elem
);
2906 /* Check start/end collation sequence values. */
2907 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2908 return REG_ECOLLATE
;
2909 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2912 /* Got valid collation sequence values, add them as a new entry.
2913 However, if we have no collation elements, and the character set
2914 is single byte, the single byte character set that we
2915 build below suffices. */
2916 if (nrules
> 0 || dfa
->mb_cur_max
> 1)
2918 /* Check the space of the arrays. */
2919 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2921 /* There is not enough space, need realloc. */
2922 uint32_t *new_array_start
;
2923 uint32_t *new_array_end
;
2926 /* +1 in case of mbcset->nranges is 0. */
2927 new_nranges
= 2 * mbcset
->nranges
+ 1;
2928 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2930 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2933 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2936 mbcset
->range_starts
= new_array_start
;
2937 mbcset
->range_ends
= new_array_end
;
2938 *range_alloc
= new_nranges
;
2941 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2942 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2945 /* Build the table for single byte characters. */
2946 for (ch
= 0; ch
< SBC_MAX
; ch
++)
2948 uint32_t ch_collseq
;
2950 if (MB_CUR_MAX == 1)
2953 ch_collseq
= collseqmb
[ch
];
2955 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2956 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2957 bitset_set (sbcset
, ch
);
2962 /* Local function for parse_bracket_exp used in _LIBC environement.
2963 Build the collating element which is represented by NAME.
2964 The result are written to MBCSET and SBCSET.
2965 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2966 pointer argument sinse we may update it. */
2968 auto inline reg_errcode_t
2969 __attribute ((always_inline
))
2970 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2971 re_charset_t
*mbcset
;
2972 int *coll_sym_alloc
;
2974 const unsigned char *name
;
2977 size_t name_len
= strlen ((const char *) name
);
2980 elem
= seek_collating_symbol_entry (name
, name_len
);
2981 if (symb_table
[2 * elem
] != 0)
2983 /* We found the entry. */
2984 idx
= symb_table
[2 * elem
+ 1];
2985 /* Skip the name of collating element name. */
2986 idx
+= 1 + extra
[idx
];
2988 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2990 /* No valid character, treat it as a normal
2992 bitset_set (sbcset
, name
[0]);
2996 return REG_ECOLLATE
;
2998 /* Got valid collation sequence, add it as a new entry. */
2999 /* Check the space of the arrays. */
3000 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
3002 /* Not enough, realloc it. */
3003 /* +1 in case of mbcset->ncoll_syms is 0. */
3004 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
3005 /* Use realloc since mbcset->coll_syms is NULL
3007 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
3008 new_coll_sym_alloc
);
3009 if (BE (new_coll_syms
== NULL
, 0))
3011 mbcset
->coll_syms
= new_coll_syms
;
3012 *coll_sym_alloc
= new_coll_sym_alloc
;
3014 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
3019 if (BE (name_len
!= 1, 0))
3020 return REG_ECOLLATE
;
3023 bitset_set (sbcset
, name
[0]);
3030 re_token_t br_token
;
3031 re_bitset_ptr_t sbcset
;
3032 #ifdef RE_ENABLE_I18N
3033 re_charset_t
*mbcset
;
3034 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
3035 int equiv_class_alloc
= 0, char_class_alloc
= 0;
3036 #endif /* not RE_ENABLE_I18N */
3038 bin_tree_t
*work_tree
;
3040 int first_round
= 1;
3042 collseqmb
= (const unsigned char *)
3043 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
3044 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3050 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
3051 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
3052 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3053 _NL_COLLATE_SYMB_TABLEMB
);
3054 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3055 _NL_COLLATE_SYMB_EXTRAMB
);
3058 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3059 #ifdef RE_ENABLE_I18N
3060 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3061 #endif /* RE_ENABLE_I18N */
3062 #ifdef RE_ENABLE_I18N
3063 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3065 if (BE (sbcset
== NULL
, 0))
3066 #endif /* RE_ENABLE_I18N */
3069 #ifdef RE_ENABLE_I18N
3076 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3077 if (BE (token
->type
== END_OF_RE
, 0))
3080 goto parse_bracket_exp_free_return
;
3082 if (token
->type
== OP_NON_MATCH_LIST
)
3084 #ifdef RE_ENABLE_I18N
3085 mbcset
->non_match
= 1;
3086 #endif /* not RE_ENABLE_I18N */
3088 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3089 bitset_set (sbcset
, '\n');
3090 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3091 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3092 if (BE (token
->type
== END_OF_RE
, 0))
3095 goto parse_bracket_exp_free_return
;
3099 /* We treat the first ']' as a normal character. */
3100 if (token
->type
== OP_CLOSE_BRACKET
)
3101 token
->type
= CHARACTER
;
3105 bracket_elem_t start_elem
, end_elem
;
3106 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3107 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3109 int token_len2
= 0, is_range_exp
= 0;
3112 start_elem
.opr
.name
= start_name_buf
;
3113 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3114 syntax
, first_round
);
3115 if (BE (ret
!= REG_NOERROR
, 0))
3118 goto parse_bracket_exp_free_return
;
3122 /* Get information about the next token. We need it in any case. */
3123 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3125 /* Do not check for ranges if we know they are not allowed. */
3126 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3128 if (BE (token
->type
== END_OF_RE
, 0))
3131 goto parse_bracket_exp_free_return
;
3133 if (token
->type
== OP_CHARSET_RANGE
)
3135 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3136 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3137 if (BE (token2
.type
== END_OF_RE
, 0))
3140 goto parse_bracket_exp_free_return
;
3142 if (token2
.type
== OP_CLOSE_BRACKET
)
3144 /* We treat the last '-' as a normal character. */
3145 re_string_skip_bytes (regexp
, -token_len
);
3146 token
->type
= CHARACTER
;
3153 if (is_range_exp
== 1)
3155 end_elem
.opr
.name
= end_name_buf
;
3156 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3158 if (BE (ret
!= REG_NOERROR
, 0))
3161 goto parse_bracket_exp_free_return
;
3164 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3167 *err
= build_range_exp (sbcset
, mbcset
, &range_alloc
,
3168 &start_elem
, &end_elem
);
3170 # ifdef RE_ENABLE_I18N
3171 *err
= build_range_exp (sbcset
,
3172 dfa
->mb_cur_max
> 1 ? mbcset
: NULL
,
3173 &range_alloc
, &start_elem
, &end_elem
);
3175 *err
= build_range_exp (sbcset
, &start_elem
, &end_elem
);
3177 #endif /* RE_ENABLE_I18N */
3178 if (BE (*err
!= REG_NOERROR
, 0))
3179 goto parse_bracket_exp_free_return
;
3183 switch (start_elem
.type
)
3186 bitset_set (sbcset
, start_elem
.opr
.ch
);
3188 #ifdef RE_ENABLE_I18N
3190 /* Check whether the array has enough space. */
3191 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3193 wchar_t *new_mbchars
;
3194 /* Not enough, realloc it. */
3195 /* +1 in case of mbcset->nmbchars is 0. */
3196 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3197 /* Use realloc since array is NULL if *alloc == 0. */
3198 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3200 if (BE (new_mbchars
== NULL
, 0))
3201 goto parse_bracket_exp_espace
;
3202 mbcset
->mbchars
= new_mbchars
;
3204 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3206 #endif /* RE_ENABLE_I18N */
3208 *err
= build_equiv_class (sbcset
,
3209 #ifdef RE_ENABLE_I18N
3210 mbcset
, &equiv_class_alloc
,
3211 #endif /* RE_ENABLE_I18N */
3212 start_elem
.opr
.name
);
3213 if (BE (*err
!= REG_NOERROR
, 0))
3214 goto parse_bracket_exp_free_return
;
3217 *err
= build_collating_symbol (sbcset
,
3218 #ifdef RE_ENABLE_I18N
3219 mbcset
, &coll_sym_alloc
,
3220 #endif /* RE_ENABLE_I18N */
3221 start_elem
.opr
.name
);
3222 if (BE (*err
!= REG_NOERROR
, 0))
3223 goto parse_bracket_exp_free_return
;
3226 *err
= build_charclass (regexp
->trans
, sbcset
,
3227 #ifdef RE_ENABLE_I18N
3228 mbcset
, &char_class_alloc
,
3229 #endif /* RE_ENABLE_I18N */
3230 start_elem
.opr
.name
, syntax
);
3231 if (BE (*err
!= REG_NOERROR
, 0))
3232 goto parse_bracket_exp_free_return
;
3239 if (BE (token
->type
== END_OF_RE
, 0))
3242 goto parse_bracket_exp_free_return
;
3244 if (token
->type
== OP_CLOSE_BRACKET
)
3248 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3250 /* If it is non-matching list. */
3252 bitset_not (sbcset
);
3254 #ifdef RE_ENABLE_I18N
3255 /* Ensure only single byte characters are set. */
3256 if (dfa
->mb_cur_max
> 1)
3257 bitset_mask (sbcset
, dfa
->sb_char
);
3259 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3260 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3261 || mbcset
->non_match
)))
3263 bin_tree_t
*mbc_tree
;
3265 /* Build a tree for complex bracket. */
3266 dfa
->has_mb_node
= 1;
3267 br_token
.type
= COMPLEX_BRACKET
;
3268 br_token
.opr
.mbcset
= mbcset
;
3269 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3270 if (BE (mbc_tree
== NULL
, 0))
3271 goto parse_bracket_exp_espace
;
3272 for (sbc_idx
= 0; sbc_idx
< BITSET_WORDS
; ++sbc_idx
)
3273 if (sbcset
[sbc_idx
])
3275 /* If there are no bits set in sbcset, there is no point
3276 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3277 if (sbc_idx
< BITSET_WORDS
)
3279 /* Build a tree for simple bracket. */
3280 br_token
.type
= SIMPLE_BRACKET
;
3281 br_token
.opr
.sbcset
= sbcset
;
3282 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3283 if (BE (work_tree
== NULL
, 0))
3284 goto parse_bracket_exp_espace
;
3286 /* Then join them by ALT node. */
3287 work_tree
= create_tree (dfa
, work_tree
, mbc_tree
, OP_ALT
);
3288 if (BE (work_tree
== NULL
, 0))
3289 goto parse_bracket_exp_espace
;
3294 work_tree
= mbc_tree
;
3298 #endif /* not RE_ENABLE_I18N */
3300 #ifdef RE_ENABLE_I18N
3301 free_charset (mbcset
);
3303 /* Build a tree for simple bracket. */
3304 br_token
.type
= SIMPLE_BRACKET
;
3305 br_token
.opr
.sbcset
= sbcset
;
3306 work_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3307 if (BE (work_tree
== NULL
, 0))
3308 goto parse_bracket_exp_espace
;
3312 parse_bracket_exp_espace
:
3314 parse_bracket_exp_free_return
:
3316 #ifdef RE_ENABLE_I18N
3317 free_charset (mbcset
);
3318 #endif /* RE_ENABLE_I18N */
3322 /* Parse an element in the bracket expression. */
3324 static reg_errcode_t
3325 parse_bracket_element (bracket_elem_t
*elem
, re_string_t
*regexp
,
3326 re_token_t
*token
, int token_len
, re_dfa_t
*dfa
,
3327 reg_syntax_t syntax
, int accept_hyphen
)
3329 #ifdef RE_ENABLE_I18N
3331 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3332 if (cur_char_size
> 1)
3334 elem
->type
= MB_CHAR
;
3335 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3336 re_string_skip_bytes (regexp
, cur_char_size
);
3339 #endif /* RE_ENABLE_I18N */
3340 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3341 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3342 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3343 return parse_bracket_symbol (elem
, regexp
, token
);
3344 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3346 /* A '-' must only appear as anything but a range indicator before
3347 the closing bracket. Everything else is an error. */
3349 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3350 if (token2
.type
!= OP_CLOSE_BRACKET
)
3351 /* The actual error value is not standardized since this whole
3352 case is undefined. But ERANGE makes good sense. */
3355 elem
->type
= SB_CHAR
;
3356 elem
->opr
.ch
= token
->opr
.c
;
3360 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3361 such as [:<character_class>:], [.<collating_element>.], and
3362 [=<equivalent_class>=]. */
3364 static reg_errcode_t
3365 parse_bracket_symbol (bracket_elem_t
*elem
, re_string_t
*regexp
,
3368 unsigned char ch
, delim
= token
->opr
.c
;
3370 if (re_string_eoi(regexp
))
3374 if (i
>= BRACKET_NAME_BUF_SIZE
)
3376 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3377 ch
= re_string_fetch_byte_case (regexp
);
3379 ch
= re_string_fetch_byte (regexp
);
3380 if (re_string_eoi(regexp
))
3382 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3384 elem
->opr
.name
[i
] = ch
;
3386 re_string_skip_bytes (regexp
, 1);
3387 elem
->opr
.name
[i
] = '\0';
3388 switch (token
->type
)
3390 case OP_OPEN_COLL_ELEM
:
3391 elem
->type
= COLL_SYM
;
3393 case OP_OPEN_EQUIV_CLASS
:
3394 elem
->type
= EQUIV_CLASS
;
3396 case OP_OPEN_CHAR_CLASS
:
3397 elem
->type
= CHAR_CLASS
;
3405 /* Helper function for parse_bracket_exp.
3406 Build the equivalence class which is represented by NAME.
3407 The result are written to MBCSET and SBCSET.
3408 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3409 is a pointer argument sinse we may update it. */
3411 static reg_errcode_t
3412 #ifdef RE_ENABLE_I18N
3413 build_equiv_class (bitset_t sbcset
, re_charset_t
*mbcset
,
3414 int *equiv_class_alloc
, const unsigned char *name
)
3415 #else /* not RE_ENABLE_I18N */
3416 build_equiv_class (bitset_t sbcset
, const unsigned char *name
)
3417 #endif /* not RE_ENABLE_I18N */
3420 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3423 const int32_t *table
, *indirect
;
3424 const unsigned char *weights
, *extra
, *cp
;
3425 unsigned char char_buf
[2];
3429 /* This #include defines a local function! */
3430 # include <locale/weight.h>
3431 /* Calculate the index for equivalence class. */
3433 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3434 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3435 _NL_COLLATE_WEIGHTMB
);
3436 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3437 _NL_COLLATE_EXTRAMB
);
3438 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3439 _NL_COLLATE_INDIRECTMB
);
3440 idx1
= findidx (&cp
, -1);
3441 if (BE (idx1
== 0 || *cp
!= '\0', 0))
3442 /* This isn't a valid character. */
3443 return REG_ECOLLATE
;
3445 /* Build single byte matcing table for this equivalence class. */
3446 len
= weights
[idx1
& 0xffffff];
3447 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3451 idx2
= findidx (&cp
, 1);
3456 /* This isn't a valid character. */
3458 /* Compare only if the length matches and the collation rule
3459 index is the same. */
3460 if (len
== weights
[idx2
& 0xffffff] && (idx1
>> 24) == (idx2
>> 24))
3464 while (cnt
<= len
&&
3465 weights
[(idx1
& 0xffffff) + 1 + cnt
]
3466 == weights
[(idx2
& 0xffffff) + 1 + cnt
])
3470 bitset_set (sbcset
, ch
);
3473 /* Check whether the array has enough space. */
3474 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3476 /* Not enough, realloc it. */
3477 /* +1 in case of mbcset->nequiv_classes is 0. */
3478 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3479 /* Use realloc since the array is NULL if *alloc == 0. */
3480 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3482 new_equiv_class_alloc
);
3483 if (BE (new_equiv_classes
== NULL
, 0))
3485 mbcset
->equiv_classes
= new_equiv_classes
;
3486 *equiv_class_alloc
= new_equiv_class_alloc
;
3488 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3493 if (BE (strlen ((const char *) name
) != 1, 0))
3494 return REG_ECOLLATE
;
3495 bitset_set (sbcset
, *name
);
3500 /* Helper function for parse_bracket_exp.
3501 Build the character class which is represented by NAME.
3502 The result are written to MBCSET and SBCSET.
3503 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3504 is a pointer argument sinse we may update it. */
3506 static reg_errcode_t
3507 #ifdef RE_ENABLE_I18N
3508 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3509 re_charset_t
*mbcset
, int *char_class_alloc
,
3510 const unsigned char *class_name
, reg_syntax_t syntax
)
3511 #else /* not RE_ENABLE_I18N */
3512 build_charclass (RE_TRANSLATE_TYPE trans
, bitset_t sbcset
,
3513 const unsigned char *class_name
, reg_syntax_t syntax
)
3514 #endif /* not RE_ENABLE_I18N */
3517 const char *name
= (const char *) class_name
;
3519 /* In case of REG_ICASE "upper" and "lower" match the both of
3520 upper and lower cases. */
3521 if ((syntax
& RE_ICASE
)
3522 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3525 #ifdef RE_ENABLE_I18N
3526 /* Check the space of the arrays. */
3527 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3529 /* Not enough, realloc it. */
3530 /* +1 in case of mbcset->nchar_classes is 0. */
3531 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3532 /* Use realloc since array is NULL if *alloc == 0. */
3533 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3534 new_char_class_alloc
);
3535 if (BE (new_char_classes
== NULL
, 0))
3537 mbcset
->char_classes
= new_char_classes
;
3538 *char_class_alloc
= new_char_class_alloc
;
3540 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3541 #endif /* RE_ENABLE_I18N */
3543 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3545 if (BE (trans != NULL, 0)) \
3547 for (i = 0; i < SBC_MAX; ++i) \
3548 if (ctype_func (i)) \
3549 bitset_set (sbcset, trans[i]); \
3553 for (i = 0; i < SBC_MAX; ++i) \
3554 if (ctype_func (i)) \
3555 bitset_set (sbcset, i); \
3559 if (strcmp (name
, "alnum") == 0)
3560 BUILD_CHARCLASS_LOOP (isalnum
);
3561 else if (strcmp (name
, "cntrl") == 0)
3562 BUILD_CHARCLASS_LOOP (iscntrl
);
3563 else if (strcmp (name
, "lower") == 0)
3564 BUILD_CHARCLASS_LOOP (islower
);
3565 else if (strcmp (name
, "space") == 0)
3566 BUILD_CHARCLASS_LOOP (isspace
);
3567 else if (strcmp (name
, "alpha") == 0)
3568 BUILD_CHARCLASS_LOOP (isalpha
);
3569 else if (strcmp (name
, "digit") == 0)
3570 BUILD_CHARCLASS_LOOP (isdigit
);
3571 else if (strcmp (name
, "print") == 0)
3572 BUILD_CHARCLASS_LOOP (isprint
);
3573 else if (strcmp (name
, "upper") == 0)
3574 BUILD_CHARCLASS_LOOP (isupper
);
3575 else if (strcmp (name
, "blank") == 0)
3576 BUILD_CHARCLASS_LOOP (isblank
);
3577 else if (strcmp (name
, "graph") == 0)
3578 BUILD_CHARCLASS_LOOP (isgraph
);
3579 else if (strcmp (name
, "punct") == 0)
3580 BUILD_CHARCLASS_LOOP (ispunct
);
3581 else if (strcmp (name
, "xdigit") == 0)
3582 BUILD_CHARCLASS_LOOP (isxdigit
);
3590 build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
3591 const unsigned char *class_name
,
3592 const unsigned char *extra
, int non_match
,
3595 re_bitset_ptr_t sbcset
;
3596 #ifdef RE_ENABLE_I18N
3597 re_charset_t
*mbcset
;
3599 #endif /* not RE_ENABLE_I18N */
3601 re_token_t br_token
;
3604 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (bitset_t
), 1);
3605 #ifdef RE_ENABLE_I18N
3606 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3607 #endif /* RE_ENABLE_I18N */
3609 #ifdef RE_ENABLE_I18N
3610 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3611 #else /* not RE_ENABLE_I18N */
3612 if (BE (sbcset
== NULL
, 0))
3613 #endif /* not RE_ENABLE_I18N */
3621 #ifdef RE_ENABLE_I18N
3622 mbcset
->non_match
= 1;
3623 #endif /* not RE_ENABLE_I18N */
3626 /* We don't care the syntax in this case. */
3627 ret
= build_charclass (trans
, sbcset
,
3628 #ifdef RE_ENABLE_I18N
3630 #endif /* RE_ENABLE_I18N */
3633 if (BE (ret
!= REG_NOERROR
, 0))
3636 #ifdef RE_ENABLE_I18N
3637 free_charset (mbcset
);
3638 #endif /* RE_ENABLE_I18N */
3642 /* \w match '_' also. */
3643 for (; *extra
; extra
++)
3644 bitset_set (sbcset
, *extra
);
3646 /* If it is non-matching list. */
3648 bitset_not (sbcset
);
3650 #ifdef RE_ENABLE_I18N
3651 /* Ensure only single byte characters are set. */
3652 if (dfa
->mb_cur_max
> 1)
3653 bitset_mask (sbcset
, dfa
->sb_char
);
3656 /* Build a tree for simple bracket. */
3657 br_token
.type
= SIMPLE_BRACKET
;
3658 br_token
.opr
.sbcset
= sbcset
;
3659 tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3660 if (BE (tree
== NULL
, 0))
3661 goto build_word_op_espace
;
3663 #ifdef RE_ENABLE_I18N
3664 if (dfa
->mb_cur_max
> 1)
3666 bin_tree_t
*mbc_tree
;
3667 /* Build a tree for complex bracket. */
3668 br_token
.type
= COMPLEX_BRACKET
;
3669 br_token
.opr
.mbcset
= mbcset
;
3670 dfa
->has_mb_node
= 1;
3671 mbc_tree
= create_token_tree (dfa
, NULL
, NULL
, &br_token
);
3672 if (BE (mbc_tree
== NULL
, 0))
3673 goto build_word_op_espace
;
3674 /* Then join them by ALT node. */
3675 tree
= create_tree (dfa
, tree
, mbc_tree
, OP_ALT
);
3676 if (BE (mbc_tree
!= NULL
, 1))
3681 free_charset (mbcset
);
3684 #else /* not RE_ENABLE_I18N */
3686 #endif /* not RE_ENABLE_I18N */
3688 build_word_op_espace
:
3690 #ifdef RE_ENABLE_I18N
3691 free_charset (mbcset
);
3692 #endif /* RE_ENABLE_I18N */
3697 /* This is intended for the expressions like "a{1,3}".
3698 Fetch a number from `input', and return the number.
3699 Return -1, if the number field is empty like "{,1}".
3700 Return -2, If an error is occured. */
3703 fetch_number (re_string_t
*input
, re_token_t
*token
, reg_syntax_t syntax
)
3709 fetch_token (token
, input
, syntax
);
3711 if (BE (token
->type
== END_OF_RE
, 0))
3713 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3715 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3716 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3717 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3722 #ifdef RE_ENABLE_I18N
3724 free_charset (re_charset_t
*cset
)
3726 re_free (cset
->mbchars
);
3728 re_free (cset
->coll_syms
);
3729 re_free (cset
->equiv_classes
);
3730 re_free (cset
->range_starts
);
3731 re_free (cset
->range_ends
);
3733 re_free (cset
->char_classes
);
3736 #endif /* RE_ENABLE_I18N */
3738 /* Functions for binary tree operation. */
3740 /* Create a tree node. */
3743 create_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3744 re_token_type_t type
)
3748 return create_token_tree (dfa
, left
, right
, &t
);
3752 create_token_tree (re_dfa_t
*dfa
, bin_tree_t
*left
, bin_tree_t
*right
,
3753 const re_token_t
*token
)
3756 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3758 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3760 if (storage
== NULL
)
3762 storage
->next
= dfa
->str_tree_storage
;
3763 dfa
->str_tree_storage
= storage
;
3764 dfa
->str_tree_storage_idx
= 0;
3766 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3768 tree
->parent
= NULL
;
3770 tree
->right
= right
;
3771 tree
->token
= *token
;
3772 tree
->token
.duplicated
= 0;
3773 tree
->token
.opt_subexp
= 0;
3776 tree
->node_idx
= -1;
3779 left
->parent
= tree
;
3781 right
->parent
= tree
;
3785 /* Mark the tree SRC as an optional subexpression.
3786 To be called from preorder or postorder. */
3788 static reg_errcode_t
3789 mark_opt_subexp (void *extra
, bin_tree_t
*node
)
3791 int idx
= (int) (long) extra
;
3792 if (node
->token
.type
== SUBEXP
&& node
->token
.opr
.idx
== idx
)
3793 node
->token
.opt_subexp
= 1;
3798 /* Free the allocated memory inside NODE. */
3801 free_token (re_token_t
*node
)
3803 #ifdef RE_ENABLE_I18N
3804 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
3805 free_charset (node
->opr
.mbcset
);
3807 #endif /* RE_ENABLE_I18N */
3808 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
3809 re_free (node
->opr
.sbcset
);
3812 /* Worker function for tree walking. Free the allocated memory inside NODE
3813 and its children. */
3815 static reg_errcode_t
3816 free_tree (void *extra
, bin_tree_t
*node
)
3818 free_token (&node
->token
);
3823 /* Duplicate the node SRC, and return new node. This is a preorder
3824 visit similar to the one implemented by the generic visitor, but
3825 we need more infrastructure to maintain two parallel trees --- so,
3826 it's easier to duplicate. */
3829 duplicate_tree (const bin_tree_t
*root
, re_dfa_t
*dfa
)
3831 const bin_tree_t
*node
;
3832 bin_tree_t
*dup_root
;
3833 bin_tree_t
**p_new
= &dup_root
, *dup_node
= root
->parent
;
3835 for (node
= root
; ; )
3837 /* Create a new tree and link it back to the current parent. */
3838 *p_new
= create_token_tree (dfa
, NULL
, NULL
, &node
->token
);
3841 (*p_new
)->parent
= dup_node
;
3842 (*p_new
)->token
.duplicated
= 1;
3845 /* Go to the left node, or up and to the right. */
3849 p_new
= &dup_node
->left
;
3853 const bin_tree_t
*prev
= NULL
;
3854 while (node
->right
== prev
|| node
->right
== NULL
)
3857 node
= node
->parent
;
3858 dup_node
= dup_node
->parent
;
3863 p_new
= &dup_node
->right
;