1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t
re_compile_internal (regex_t
*preg
, const char * pattern
,
22 int length
, reg_syntax_t syntax
);
23 static void re_compile_fastmap_iter (regex_t
*bufp
,
24 const re_dfastate_t
*init_state
,
26 static reg_errcode_t
init_dfa (re_dfa_t
*dfa
, int pat_len
);
27 static reg_errcode_t
init_word_char (re_dfa_t
*dfa
);
29 static void free_charset (re_charset_t
*cset
);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t
*preg
);
32 static reg_errcode_t
create_initial_state (re_dfa_t
*dfa
);
34 static void optimize_utf8 (re_dfa_t
*dfa
);
36 static reg_errcode_t
analyze (re_dfa_t
*dfa
);
37 static reg_errcode_t
analyze_tree (re_dfa_t
*dfa
, bin_tree_t
*node
);
38 static void calc_first (re_dfa_t
*dfa
, bin_tree_t
*node
);
39 static void calc_next (re_dfa_t
*dfa
, bin_tree_t
*node
);
40 static void calc_epsdest (re_dfa_t
*dfa
, bin_tree_t
*node
);
41 static reg_errcode_t
duplicate_node_closure (re_dfa_t
*dfa
, int top_org_node
,
42 int top_clone_node
, int root_node
,
43 unsigned int constraint
);
44 static reg_errcode_t
duplicate_node (int *new_idx
, re_dfa_t
*dfa
, int org_idx
,
45 unsigned int constraint
);
46 static int search_duplicated_node (re_dfa_t
*dfa
, int org_node
,
47 unsigned int constraint
);
48 static reg_errcode_t
calc_eclosure (re_dfa_t
*dfa
);
49 static reg_errcode_t
calc_eclosure_iter (re_node_set
*new_set
, re_dfa_t
*dfa
,
51 static void calc_inveclosure (re_dfa_t
*dfa
);
52 static int fetch_number (re_string_t
*input
, re_token_t
*token
,
54 static void fetch_token (re_token_t
*result
, re_string_t
*input
,
56 static int peek_token (re_token_t
*token
, re_string_t
*input
,
58 static int peek_token_bracket (re_token_t
*token
, re_string_t
*input
,
60 static bin_tree_t
*parse (re_string_t
*regexp
, regex_t
*preg
,
61 reg_syntax_t syntax
, reg_errcode_t
*err
);
62 static bin_tree_t
*parse_reg_exp (re_string_t
*regexp
, regex_t
*preg
,
63 re_token_t
*token
, reg_syntax_t syntax
,
64 int nest
, reg_errcode_t
*err
);
65 static bin_tree_t
*parse_branch (re_string_t
*regexp
, regex_t
*preg
,
66 re_token_t
*token
, reg_syntax_t syntax
,
67 int nest
, reg_errcode_t
*err
);
68 static bin_tree_t
*parse_expression (re_string_t
*regexp
, regex_t
*preg
,
69 re_token_t
*token
, reg_syntax_t syntax
,
70 int nest
, reg_errcode_t
*err
);
71 static bin_tree_t
*parse_sub_exp (re_string_t
*regexp
, regex_t
*preg
,
72 re_token_t
*token
, reg_syntax_t syntax
,
73 int nest
, reg_errcode_t
*err
);
74 static bin_tree_t
*parse_dup_op (bin_tree_t
*dup_elem
, re_string_t
*regexp
,
75 re_dfa_t
*dfa
, re_token_t
*token
,
76 reg_syntax_t syntax
, reg_errcode_t
*err
);
77 static bin_tree_t
*parse_bracket_exp (re_string_t
*regexp
, re_dfa_t
*dfa
,
78 re_token_t
*token
, reg_syntax_t syntax
,
80 static reg_errcode_t
parse_bracket_element (bracket_elem_t
*elem
,
82 re_token_t
*token
, int token_len
,
86 static reg_errcode_t
parse_bracket_symbol (bracket_elem_t
*elem
,
90 # ifdef RE_ENABLE_I18N
91 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
92 re_charset_t
*mbcset
, int *range_alloc
,
93 bracket_elem_t
*start_elem
,
94 bracket_elem_t
*end_elem
);
95 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
98 const unsigned char *name
);
99 # else /* not RE_ENABLE_I18N */
100 static reg_errcode_t
build_range_exp (re_bitset_ptr_t sbcset
,
101 bracket_elem_t
*start_elem
,
102 bracket_elem_t
*end_elem
);
103 static reg_errcode_t
build_collating_symbol (re_bitset_ptr_t sbcset
,
104 const unsigned char *name
);
105 # endif /* not RE_ENABLE_I18N */
106 #endif /* not _LIBC */
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
109 re_charset_t
*mbcset
,
110 int *equiv_class_alloc
,
111 const unsigned char *name
);
112 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
113 re_bitset_ptr_t sbcset
,
114 re_charset_t
*mbcset
,
115 int *char_class_alloc
,
116 const unsigned char *class_name
,
117 reg_syntax_t syntax
);
118 #else /* not RE_ENABLE_I18N */
119 static reg_errcode_t
build_equiv_class (re_bitset_ptr_t sbcset
,
120 const unsigned char *name
);
121 static reg_errcode_t
build_charclass (RE_TRANSLATE_TYPE trans
,
122 re_bitset_ptr_t sbcset
,
123 const unsigned char *class_name
,
124 reg_syntax_t syntax
);
125 #endif /* not RE_ENABLE_I18N */
126 static bin_tree_t
*build_charclass_op (re_dfa_t
*dfa
, RE_TRANSLATE_TYPE trans
,
127 const unsigned char *class_name
,
128 const unsigned char *extra
, int not,
130 static bin_tree_t
*create_tree (re_dfa_t
*dfa
,
131 bin_tree_t
*left
, bin_tree_t
*right
,
132 re_token_type_t type
, int index
);
133 static bin_tree_t
*re_dfa_add_tree_node (re_dfa_t
*dfa
,
134 bin_tree_t
*left
, bin_tree_t
*right
,
135 const re_token_t
*token
)
136 __attribute ((noinline
));
137 static bin_tree_t
*duplicate_tree (const bin_tree_t
*src
, re_dfa_t
*dfa
);
139 /* This table gives an error message for each of the error codes listed
140 in regex.h. Obviously the order here has to be same as there.
141 POSIX doesn't require that we do anything for REG_NOERROR,
142 but why not be nice? */
144 const char __re_error_msgid
[] attribute_hidden
=
146 #define REG_NOERROR_IDX 0
147 gettext_noop ("Success") /* REG_NOERROR */
149 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
150 gettext_noop ("No match") /* REG_NOMATCH */
152 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
153 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
155 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
156 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
158 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
159 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
161 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
162 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
164 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
165 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
167 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
168 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
170 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
171 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
173 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
174 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
176 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
177 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
179 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
180 gettext_noop ("Invalid range end") /* REG_ERANGE */
182 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
183 gettext_noop ("Memory exhausted") /* REG_ESPACE */
185 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
186 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
188 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
189 gettext_noop ("Premature end of regular expression") /* REG_EEND */
191 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
192 gettext_noop ("Regular expression too big") /* REG_ESIZE */
194 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
195 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
198 const size_t __re_error_msgid_idx
[] attribute_hidden
=
219 /* Entry points for GNU code. */
221 /* re_compile_pattern is the GNU regular expression compiler: it
222 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
223 Returns 0 if the pattern was valid, otherwise an error string.
225 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
226 are set in BUFP on entry. */
229 re_compile_pattern (pattern
, length
, bufp
)
232 struct re_pattern_buffer
*bufp
;
236 /* And GNU code determines whether or not to get register information
237 by passing null for the REGS argument to re_match, etc., not by
241 /* Match anchors at newline. */
242 bufp
->newline_anchor
= 1;
244 ret
= re_compile_internal (bufp
, pattern
, length
, re_syntax_options
);
248 return gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
251 weak_alias (__re_compile_pattern
, re_compile_pattern
)
254 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
255 also be assigned to arbitrarily: each pattern buffer stores its own
256 syntax, so it can be changed between regex compilations. */
257 /* This has no initializer because initialized variables in Emacs
258 become read-only after dumping. */
259 reg_syntax_t re_syntax_options
;
262 /* Specify the precise syntax of regexps for compilation. This provides
263 for compatibility for various utilities which historically have
264 different, incompatible syntaxes.
266 The argument SYNTAX is a bit mask comprised of the various bits
267 defined in regex.h. We return the old syntax. */
270 re_set_syntax (syntax
)
273 reg_syntax_t ret
= re_syntax_options
;
275 re_syntax_options
= syntax
;
279 weak_alias (__re_set_syntax
, re_set_syntax
)
283 re_compile_fastmap (bufp
)
284 struct re_pattern_buffer
*bufp
;
286 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
287 char *fastmap
= bufp
->fastmap
;
289 memset (fastmap
, '\0', sizeof (char) * SBC_MAX
);
290 re_compile_fastmap_iter (bufp
, dfa
->init_state
, fastmap
);
291 if (dfa
->init_state
!= dfa
->init_state_word
)
292 re_compile_fastmap_iter (bufp
, dfa
->init_state_word
, fastmap
);
293 if (dfa
->init_state
!= dfa
->init_state_nl
)
294 re_compile_fastmap_iter (bufp
, dfa
->init_state_nl
, fastmap
);
295 if (dfa
->init_state
!= dfa
->init_state_begbuf
)
296 re_compile_fastmap_iter (bufp
, dfa
->init_state_begbuf
, fastmap
);
297 bufp
->fastmap_accurate
= 1;
301 weak_alias (__re_compile_fastmap
, re_compile_fastmap
)
305 __attribute ((always_inline
))
306 re_set_fastmap (char *fastmap
, int icase
, int ch
)
310 fastmap
[tolower (ch
)] = 1;
313 /* Helper function for re_compile_fastmap.
314 Compile fastmap for the initial_state INIT_STATE. */
317 re_compile_fastmap_iter (bufp
, init_state
, fastmap
)
319 const re_dfastate_t
*init_state
;
322 re_dfa_t
*dfa
= (re_dfa_t
*) bufp
->buffer
;
324 int icase
= (dfa
->mb_cur_max
== 1 && (bufp
->syntax
& RE_ICASE
));
325 for (node_cnt
= 0; node_cnt
< init_state
->nodes
.nelem
; ++node_cnt
)
327 int node
= init_state
->nodes
.elems
[node_cnt
];
328 re_token_type_t type
= dfa
->nodes
[node
].type
;
330 if (type
== CHARACTER
)
332 re_set_fastmap (fastmap
, icase
, dfa
->nodes
[node
].opr
.c
);
333 #ifdef RE_ENABLE_I18N
334 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
336 unsigned char *buf
= alloca (dfa
->mb_cur_max
), *p
;
341 *p
++ = dfa
->nodes
[node
].opr
.c
;
342 while (++node
< dfa
->nodes_len
343 && dfa
->nodes
[node
].type
== CHARACTER
344 && dfa
->nodes
[node
].mb_partial
)
345 *p
++ = dfa
->nodes
[node
].opr
.c
;
346 memset (&state
, 0, sizeof (state
));
347 if (mbrtowc (&wc
, (const char *) buf
, p
- buf
,
349 && __wcrtomb ((char *) buf
, towlower (wc
), &state
) > 0)
350 re_set_fastmap (fastmap
, 0, buf
[0]);
354 else if (type
== SIMPLE_BRACKET
)
357 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
358 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
359 if (dfa
->nodes
[node
].opr
.sbcset
[i
] & (1 << j
))
360 re_set_fastmap (fastmap
, icase
, ch
);
362 #ifdef RE_ENABLE_I18N
363 else if (type
== COMPLEX_BRACKET
)
366 re_charset_t
*cset
= dfa
->nodes
[node
].opr
.mbcset
;
367 if (cset
->non_match
|| cset
->ncoll_syms
|| cset
->nequiv_classes
368 || cset
->nranges
|| cset
->nchar_classes
)
371 if (_NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
) != 0)
373 /* In this case we want to catch the bytes which are
374 the first byte of any collation elements.
375 e.g. In da_DK, we want to catch 'a' since "aa"
376 is a valid collation element, and don't catch
377 'b' since 'b' is the only collation element
378 which starts from 'b'. */
380 const int32_t *table
= (const int32_t *)
381 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
382 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
383 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
385 re_set_fastmap (fastmap
, icase
, ch
);
388 if (dfa
->mb_cur_max
> 1)
389 for (i
= 0; i
< SBC_MAX
; ++i
)
390 if (__btowc (i
) == WEOF
)
391 re_set_fastmap (fastmap
, icase
, i
);
392 # endif /* not _LIBC */
394 for (i
= 0; i
< cset
->nmbchars
; ++i
)
398 memset (&state
, '\0', sizeof (state
));
399 __wcrtomb (buf
, cset
->mbchars
[i
], &state
);
400 re_set_fastmap (fastmap
, icase
, *(unsigned char *) buf
);
401 if ((bufp
->syntax
& RE_ICASE
) && dfa
->mb_cur_max
> 1)
403 __wcrtomb (buf
, towlower (cset
->mbchars
[i
]), &state
);
404 re_set_fastmap (fastmap
, 0, *(unsigned char *) buf
);
408 #endif /* RE_ENABLE_I18N */
409 else if (type
== END_OF_RE
|| type
== OP_PERIOD
410 || type
== OP_UTF8_PERIOD
)
412 memset (fastmap
, '\1', sizeof (char) * SBC_MAX
);
413 if (type
== END_OF_RE
)
414 bufp
->can_be_null
= 1;
420 /* Entry point for POSIX code. */
421 /* regcomp takes a regular expression as a string and compiles it.
423 PREG is a regex_t *. We do not expect any fields to be initialized,
424 since POSIX says we shouldn't. Thus, we set
426 `buffer' to the compiled pattern;
427 `used' to the length of the compiled pattern;
428 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
429 REG_EXTENDED bit in CFLAGS is set; otherwise, to
430 RE_SYNTAX_POSIX_BASIC;
431 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
432 `fastmap' to an allocated space for the fastmap;
433 `fastmap_accurate' to zero;
434 `re_nsub' to the number of subexpressions in PATTERN.
436 PATTERN is the address of the pattern string.
438 CFLAGS is a series of bits which affect compilation.
440 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
441 use POSIX basic syntax.
443 If REG_NEWLINE is set, then . and [^...] don't match newline.
444 Also, regexec will try a match beginning after every newline.
446 If REG_ICASE is set, then we considers upper- and lowercase
447 versions of letters to be equivalent when matching.
449 If REG_NOSUB is set, then when PREG is passed to regexec, that
450 routine will report only success or failure, and nothing about the
453 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
454 the return codes and their meanings.) */
457 regcomp (preg
, pattern
, cflags
)
458 regex_t
*__restrict preg
;
459 const char *__restrict pattern
;
463 reg_syntax_t syntax
= ((cflags
& REG_EXTENDED
) ? RE_SYNTAX_POSIX_EXTENDED
464 : RE_SYNTAX_POSIX_BASIC
);
470 /* Try to allocate space for the fastmap. */
471 preg
->fastmap
= re_malloc (char, SBC_MAX
);
472 if (BE (preg
->fastmap
== NULL
, 0))
475 syntax
|= (cflags
& REG_ICASE
) ? RE_ICASE
: 0;
477 /* If REG_NEWLINE is set, newlines are treated differently. */
478 if (cflags
& REG_NEWLINE
)
479 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
480 syntax
&= ~RE_DOT_NEWLINE
;
481 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
482 /* It also changes the matching behavior. */
483 preg
->newline_anchor
= 1;
486 preg
->newline_anchor
= 0;
487 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
488 preg
->translate
= NULL
;
490 ret
= re_compile_internal (preg
, pattern
, strlen (pattern
), syntax
);
492 /* POSIX doesn't distinguish between an unmatched open-group and an
493 unmatched close-group: both are REG_EPAREN. */
494 if (ret
== REG_ERPAREN
)
497 /* We have already checked preg->fastmap != NULL. */
498 if (BE (ret
== REG_NOERROR
, 1))
499 /* Compute the fastmap now, since regexec cannot modify the pattern
500 buffer. This function nevers fails in this implementation. */
501 (void) re_compile_fastmap (preg
);
504 /* Some error occurred while compiling the expression. */
505 re_free (preg
->fastmap
);
506 preg
->fastmap
= NULL
;
512 weak_alias (__regcomp
, regcomp
)
515 /* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
519 regerror (errcode
, preg
, errbuf
, errbuf_size
)
529 || errcode
>= (int) (sizeof (__re_error_msgid_idx
)
530 / sizeof (__re_error_msgid_idx
[0])), 0))
531 /* Only error codes returned by the rest of the code should be passed
532 to this routine. If we are given anything else, or if other regex
533 code generates an invalid error code, then the program has a bug.
534 Dump core so we can fix it. */
537 msg
= gettext (__re_error_msgid
+ __re_error_msgid_idx
[errcode
]);
539 msg_size
= strlen (msg
) + 1; /* Includes the null. */
541 if (BE (errbuf_size
!= 0, 1))
543 if (BE (msg_size
> errbuf_size
, 0))
545 #if defined HAVE_MEMPCPY || defined _LIBC
546 *((char *) __mempcpy (errbuf
, msg
, errbuf_size
- 1)) = '\0';
548 memcpy (errbuf
, msg
, errbuf_size
- 1);
549 errbuf
[errbuf_size
- 1] = 0;
553 memcpy (errbuf
, msg
, msg_size
);
559 weak_alias (__regerror
, regerror
)
564 free_dfa_content (re_dfa_t
*dfa
)
568 re_free (dfa
->subexps
);
571 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
573 re_token_t
*node
= dfa
->nodes
+ i
;
574 #ifdef RE_ENABLE_I18N
575 if (node
->type
== COMPLEX_BRACKET
&& node
->duplicated
== 0)
576 free_charset (node
->opr
.mbcset
);
578 #endif /* RE_ENABLE_I18N */
579 if (node
->type
== SIMPLE_BRACKET
&& node
->duplicated
== 0)
580 re_free (node
->opr
.sbcset
);
582 re_free (dfa
->nexts
);
583 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
585 if (dfa
->eclosures
!= NULL
)
586 re_node_set_free (dfa
->eclosures
+ i
);
587 if (dfa
->inveclosures
!= NULL
)
588 re_node_set_free (dfa
->inveclosures
+ i
);
589 if (dfa
->edests
!= NULL
)
590 re_node_set_free (dfa
->edests
+ i
);
592 re_free (dfa
->edests
);
593 re_free (dfa
->eclosures
);
594 re_free (dfa
->inveclosures
);
595 re_free (dfa
->nodes
);
597 if (dfa
->state_table
)
598 for (i
= 0; i
<= dfa
->state_hash_mask
; ++i
)
600 struct re_state_table_entry
*entry
= dfa
->state_table
+ i
;
601 for (j
= 0; j
< entry
->num
; ++j
)
603 re_dfastate_t
*state
= entry
->array
[j
];
606 re_free (entry
->array
);
608 re_free (dfa
->state_table
);
609 re_free (dfa
->word_char
);
610 #ifdef RE_ENABLE_I18N
611 re_free (dfa
->sb_char
);
614 re_free (dfa
->re_str
);
621 /* Free dynamically allocated space used by PREG. */
627 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
628 if (BE (dfa
!= NULL
, 1))
629 free_dfa_content (dfa
);
631 re_free (preg
->fastmap
);
634 weak_alias (__regfree
, regfree
)
637 /* Entry points compatible with 4.2 BSD regex library. We don't define
638 them unless specifically requested. */
640 #if defined _REGEX_RE_COMP || defined _LIBC
642 /* BSD has one and only one pattern buffer. */
643 static struct re_pattern_buffer re_comp_buf
;
647 /* Make these definitions weak in libc, so POSIX programs can redefine
648 these names if they don't use our functions, and still use
649 regcomp/regexec above without link errors. */
660 if (!re_comp_buf
.buffer
)
661 return gettext ("No previous regular expression");
665 if (re_comp_buf
.buffer
)
667 fastmap
= re_comp_buf
.fastmap
;
668 re_comp_buf
.fastmap
= NULL
;
669 __regfree (&re_comp_buf
);
670 memset (&re_comp_buf
, '\0', sizeof (re_comp_buf
));
671 re_comp_buf
.fastmap
= fastmap
;
674 if (re_comp_buf
.fastmap
== NULL
)
676 re_comp_buf
.fastmap
= (char *) malloc (SBC_MAX
);
677 if (re_comp_buf
.fastmap
== NULL
)
678 return (char *) gettext (__re_error_msgid
679 + __re_error_msgid_idx
[(int) REG_ESPACE
]);
682 /* Since `re_exec' always passes NULL for the `regs' argument, we
683 don't need to initialize the pattern buffer fields which affect it. */
685 /* Match anchors at newlines. */
686 re_comp_buf
.newline_anchor
= 1;
688 ret
= re_compile_internal (&re_comp_buf
, s
, strlen (s
), re_syntax_options
);
693 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
694 return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx
[(int) ret
]);
698 libc_freeres_fn (free_mem
)
700 __regfree (&re_comp_buf
);
704 #endif /* _REGEX_RE_COMP */
706 /* Internal entry point.
707 Compile the regular expression PATTERN, whose length is LENGTH.
708 SYNTAX indicate regular expression's syntax. */
711 re_compile_internal (preg
, pattern
, length
, syntax
)
713 const char * pattern
;
717 reg_errcode_t err
= REG_NOERROR
;
721 /* Initialize the pattern buffer. */
722 preg
->fastmap_accurate
= 0;
723 preg
->syntax
= syntax
;
724 preg
->not_bol
= preg
->not_eol
= 0;
727 preg
->can_be_null
= 0;
728 preg
->regs_allocated
= REGS_UNALLOCATED
;
730 /* Initialize the dfa. */
731 dfa
= (re_dfa_t
*) preg
->buffer
;
732 if (BE (preg
->allocated
< sizeof (re_dfa_t
), 0))
734 /* If zero allocated, but buffer is non-null, try to realloc
735 enough space. This loses if buffer's address is bogus, but
736 that is the user's responsibility. If ->buffer is NULL this
737 is a simple allocation. */
738 dfa
= re_realloc (preg
->buffer
, re_dfa_t
, 1);
741 preg
->allocated
= sizeof (re_dfa_t
);
742 preg
->buffer
= (unsigned char *) dfa
;
744 preg
->used
= sizeof (re_dfa_t
);
746 err
= init_dfa (dfa
, length
);
747 if (BE (err
!= REG_NOERROR
, 0))
749 free_dfa_content (dfa
);
755 dfa
->re_str
= re_malloc (char, length
+ 1);
756 strncpy (dfa
->re_str
, pattern
, length
+ 1);
759 err
= re_string_construct (®exp
, pattern
, length
, preg
->translate
,
760 syntax
& RE_ICASE
, dfa
);
761 if (BE (err
!= REG_NOERROR
, 0))
763 re_compile_internal_free_return
:
764 free_workarea_compile (preg
);
765 re_string_destruct (®exp
);
766 free_dfa_content (dfa
);
772 /* Parse the regular expression, and build a structure tree. */
774 dfa
->str_tree
= parse (®exp
, preg
, syntax
, &err
);
775 if (BE (dfa
->str_tree
== NULL
, 0))
776 goto re_compile_internal_free_return
;
778 #ifdef RE_ENABLE_I18N
779 /* If possible, do searching in single byte encoding to speed things up. */
780 if (dfa
->is_utf8
&& !(syntax
& RE_ICASE
) && preg
->translate
== NULL
)
784 /* Analyze the tree and collect information which is necessary to
787 if (BE (err
!= REG_NOERROR
, 0))
788 goto re_compile_internal_free_return
;
790 /* Then create the initial state of the dfa. */
791 err
= create_initial_state (dfa
);
793 /* Release work areas. */
794 free_workarea_compile (preg
);
795 re_string_destruct (®exp
);
797 if (BE (err
!= REG_NOERROR
, 0))
799 free_dfa_content (dfa
);
807 /* Initialize DFA. We use the length of the regular expression PAT_LEN
808 as the initial length of some arrays. */
811 init_dfa (dfa
, pat_len
)
817 memset (dfa
, '\0', sizeof (re_dfa_t
));
819 /* Force allocation of str_tree_storage the first time. */
820 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
822 dfa
->nodes_alloc
= pat_len
+ 1;
823 dfa
->nodes
= re_malloc (re_token_t
, dfa
->nodes_alloc
);
825 dfa
->states_alloc
= pat_len
+ 1;
827 /* table_size = 2 ^ ceil(log pat_len) */
828 for (table_size
= 1; table_size
> 0; table_size
<<= 1)
829 if (table_size
> pat_len
)
832 dfa
->state_table
= calloc (sizeof (struct re_state_table_entry
), table_size
);
833 dfa
->state_hash_mask
= table_size
- 1;
835 dfa
->subexps_alloc
= 1;
836 dfa
->subexps
= re_malloc (re_subexp_t
, dfa
->subexps_alloc
);
837 /* dfa->word_char = NULL; */
839 dfa
->mb_cur_max
= MB_CUR_MAX
;
841 if (dfa
->mb_cur_max
> 1
842 && strcmp (_NL_CURRENT (LC_CTYPE
, _NL_CTYPE_CODESET_NAME
), "UTF-8") == 0)
844 dfa
->map_notascii
= (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_MAP_TO_NONASCII
)
847 #ifdef RE_ENABLE_I18N
848 if (dfa
->mb_cur_max
> 1)
852 dfa
->sb_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
853 if (BE (dfa
->sb_char
== NULL
, 0))
857 memset (dfa
->sb_char
, 255, sizeof (unsigned int) * BITSET_UINTS
/ 2);
860 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
861 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
862 if (btowc (ch
) != WEOF
)
863 dfa
->sb_char
[i
] |= 1 << j
;
867 if (BE (dfa
->nodes
== NULL
|| dfa
->state_table
== NULL
868 || dfa
->subexps
== NULL
, 0))
873 /* Initialize WORD_CHAR table, which indicate which character is
874 "word". In this case "word" means that it is the word construction
875 character used by some operators like "\<", "\>", etc. */
882 dfa
->word_char
= (re_bitset_ptr_t
) calloc (sizeof (bitset
), 1);
883 if (BE (dfa
->word_char
== NULL
, 0))
885 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
886 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
887 if (isalnum (ch
) || ch
== '_')
888 dfa
->word_char
[i
] |= 1 << j
;
892 /* Free the work area which are only used while compiling. */
895 free_workarea_compile (preg
)
898 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
899 bin_tree_storage_t
*storage
, *next
;
900 for (storage
= dfa
->str_tree_storage
; storage
; storage
= next
)
902 next
= storage
->next
;
905 dfa
->str_tree_storage
= NULL
;
906 dfa
->str_tree_storage_idx
= BIN_TREE_STORAGE_SIZE
;
907 dfa
->str_tree
= NULL
;
908 re_free (dfa
->org_indices
);
909 dfa
->org_indices
= NULL
;
912 /* Create initial states for all contexts. */
915 create_initial_state (dfa
)
920 re_node_set init_nodes
;
922 /* Initial states have the epsilon closure of the node which is
923 the first node of the regular expression. */
924 first
= dfa
->str_tree
->first
;
925 dfa
->init_node
= first
;
926 err
= re_node_set_init_copy (&init_nodes
, dfa
->eclosures
+ first
);
927 if (BE (err
!= REG_NOERROR
, 0))
930 /* The back-references which are in initial states can epsilon transit,
931 since in this case all of the subexpressions can be null.
932 Then we add epsilon closures of the nodes which are the next nodes of
933 the back-references. */
934 if (dfa
->nbackref
> 0)
935 for (i
= 0; i
< init_nodes
.nelem
; ++i
)
937 int node_idx
= init_nodes
.elems
[i
];
938 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
941 if (type
!= OP_BACK_REF
)
943 for (clexp_idx
= 0; clexp_idx
< init_nodes
.nelem
; ++clexp_idx
)
945 re_token_t
*clexp_node
;
946 clexp_node
= dfa
->nodes
+ init_nodes
.elems
[clexp_idx
];
947 if (clexp_node
->type
== OP_CLOSE_SUBEXP
948 && clexp_node
->opr
.idx
+ 1 == dfa
->nodes
[node_idx
].opr
.idx
)
951 if (clexp_idx
== init_nodes
.nelem
)
954 if (type
== OP_BACK_REF
)
956 int dest_idx
= dfa
->edests
[node_idx
].elems
[0];
957 if (!re_node_set_contains (&init_nodes
, dest_idx
))
959 re_node_set_merge (&init_nodes
, dfa
->eclosures
+ dest_idx
);
965 /* It must be the first time to invoke acquire_state. */
966 dfa
->init_state
= re_acquire_state_context (&err
, dfa
, &init_nodes
, 0);
967 /* We don't check ERR here, since the initial state must not be NULL. */
968 if (BE (dfa
->init_state
== NULL
, 0))
970 if (dfa
->init_state
->has_constraint
)
972 dfa
->init_state_word
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
974 dfa
->init_state_nl
= re_acquire_state_context (&err
, dfa
, &init_nodes
,
976 dfa
->init_state_begbuf
= re_acquire_state_context (&err
, dfa
,
980 if (BE (dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
981 || dfa
->init_state_begbuf
== NULL
, 0))
985 dfa
->init_state_word
= dfa
->init_state_nl
986 = dfa
->init_state_begbuf
= dfa
->init_state
;
988 re_node_set_free (&init_nodes
);
992 #ifdef RE_ENABLE_I18N
993 /* If it is possible to do searching in single byte encoding instead of UTF-8
994 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
995 DFA nodes where needed. */
1001 int node
, i
, mb_chars
= 0, has_period
= 0;
1003 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1004 switch (dfa
->nodes
[node
].type
)
1007 if (dfa
->nodes
[node
].opr
.c
>= 0x80)
1011 switch (dfa
->nodes
[node
].opr
.idx
)
1019 /* Word anchors etc. cannot be handled. */
1029 case OP_DUP_ASTERISK
:
1030 case OP_DUP_QUESTION
:
1032 case OP_OPEN_SUBEXP
:
1033 case OP_CLOSE_SUBEXP
:
1035 case SIMPLE_BRACKET
:
1036 /* Just double check. */
1037 for (i
= 0x80 / UINT_BITS
; i
< BITSET_UINTS
; ++i
)
1038 if (dfa
->nodes
[node
].opr
.sbcset
[i
])
1045 if (mb_chars
|| has_period
)
1046 for (node
= 0; node
< dfa
->nodes_len
; ++node
)
1048 if (dfa
->nodes
[node
].type
== CHARACTER
1049 && dfa
->nodes
[node
].opr
.c
>= 0x80)
1050 dfa
->nodes
[node
].mb_partial
= 0;
1051 else if (dfa
->nodes
[node
].type
== OP_PERIOD
)
1052 dfa
->nodes
[node
].type
= OP_UTF8_PERIOD
;
1055 /* The search can be in single byte locale. */
1056 dfa
->mb_cur_max
= 1;
1058 dfa
->has_mb_node
= dfa
->nbackref
> 0 || has_period
;
1062 /* Analyze the structure tree, and calculate "first", "next", "edest",
1063 "eclosure", and "inveclosure". */
1065 static reg_errcode_t
1072 /* Allocate arrays. */
1073 dfa
->nexts
= re_malloc (int, dfa
->nodes_alloc
);
1074 dfa
->org_indices
= re_malloc (int, dfa
->nodes_alloc
);
1075 dfa
->edests
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1076 dfa
->eclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1077 dfa
->inveclosures
= re_malloc (re_node_set
, dfa
->nodes_alloc
);
1078 if (BE (dfa
->nexts
== NULL
|| dfa
->org_indices
== NULL
|| dfa
->edests
== NULL
1079 || dfa
->eclosures
== NULL
|| dfa
->inveclosures
== NULL
, 0))
1081 /* Initialize them. */
1082 for (i
= 0; i
< dfa
->nodes_len
; ++i
)
1085 re_node_set_init_empty (dfa
->edests
+ i
);
1086 re_node_set_init_empty (dfa
->eclosures
+ i
);
1087 re_node_set_init_empty (dfa
->inveclosures
+ i
);
1090 ret
= analyze_tree (dfa
, dfa
->str_tree
);
1091 if (BE (ret
== REG_NOERROR
, 1))
1093 ret
= calc_eclosure (dfa
);
1094 if (ret
== REG_NOERROR
)
1095 calc_inveclosure (dfa
);
1100 /* Helper functions for analyze.
1101 This function calculate "first", "next", and "edest" for the subtree
1102 whose root is NODE. */
1104 static reg_errcode_t
1105 analyze_tree (dfa
, node
)
1110 if (node
->first
== -1)
1111 calc_first (dfa
, node
);
1112 if (node
->next
== -1)
1113 calc_next (dfa
, node
);
1114 if (node
->eclosure
.nelem
== 0)
1115 calc_epsdest (dfa
, node
);
1116 /* Calculate "first" etc. for the left child. */
1117 if (node
->left
!= NULL
)
1119 ret
= analyze_tree (dfa
, node
->left
);
1120 if (BE (ret
!= REG_NOERROR
, 0))
1123 /* Calculate "first" etc. for the right child. */
1124 if (node
->right
!= NULL
)
1126 ret
= analyze_tree (dfa
, node
->right
);
1127 if (BE (ret
!= REG_NOERROR
, 0))
1133 /* Calculate "first" for the node NODE. */
1135 calc_first (dfa
, node
)
1140 idx
= node
->node_idx
;
1141 type
= (node
->type
== 0) ? dfa
->nodes
[idx
].type
: node
->type
;
1146 case OP_OPEN_BRACKET
:
1147 case OP_CLOSE_BRACKET
:
1148 case OP_OPEN_DUP_NUM
:
1149 case OP_CLOSE_DUP_NUM
:
1150 case OP_NON_MATCH_LIST
:
1151 case OP_OPEN_COLL_ELEM
:
1152 case OP_CLOSE_COLL_ELEM
:
1153 case OP_OPEN_EQUIV_CLASS
:
1154 case OP_CLOSE_EQUIV_CLASS
:
1155 case OP_OPEN_CHAR_CLASS
:
1156 case OP_CLOSE_CHAR_CLASS
:
1157 /* These must not be appeared here. */
1163 case OP_DUP_ASTERISK
:
1164 case OP_DUP_QUESTION
:
1165 #ifdef RE_ENABLE_I18N
1166 case OP_UTF8_PERIOD
:
1167 case COMPLEX_BRACKET
:
1168 #endif /* RE_ENABLE_I18N */
1169 case SIMPLE_BRACKET
:
1172 case OP_OPEN_SUBEXP
:
1173 case OP_CLOSE_SUBEXP
:
1178 assert (node
->left
!= NULL
);
1180 if (node
->left
->first
== -1)
1181 calc_first (dfa
, node
->left
);
1182 node
->first
= node
->left
->first
;
1187 /* else fall through */
1190 assert (node
->left
!= NULL
);
1192 if (node
->left
->first
== -1)
1193 calc_first (dfa
, node
->left
);
1194 node
->first
= node
->left
->first
;
1199 /* Calculate "next" for the node NODE. */
1202 calc_next (dfa
, node
)
1207 bin_tree_t
*parent
= node
->parent
;
1211 idx
= node
->node_idx
;
1212 if (node
->type
== 0)
1213 dfa
->nexts
[idx
] = node
->next
;
1217 idx
= parent
->node_idx
;
1218 type
= (parent
->type
== 0) ? dfa
->nodes
[idx
].type
: parent
->type
;
1222 case OP_DUP_ASTERISK
:
1227 if (parent
->left
== node
)
1229 if (parent
->right
->first
== -1)
1230 calc_first (dfa
, parent
->right
);
1231 node
->next
= parent
->right
->first
;
1234 /* else fall through */
1236 if (parent
->next
== -1)
1237 calc_next (dfa
, parent
);
1238 node
->next
= parent
->next
;
1241 idx
= node
->node_idx
;
1242 if (node
->type
== 0)
1243 dfa
->nexts
[idx
] = node
->next
;
1246 /* Calculate "edest" for the node NODE. */
1249 calc_epsdest (dfa
, node
)
1254 idx
= node
->node_idx
;
1255 if (node
->type
== 0)
1257 if (dfa
->nodes
[idx
].type
== OP_DUP_ASTERISK
1258 || dfa
->nodes
[idx
].type
== OP_DUP_PLUS
1259 || dfa
->nodes
[idx
].type
== OP_DUP_QUESTION
)
1261 if (node
->left
->first
== -1)
1262 calc_first (dfa
, node
->left
);
1263 if (node
->next
== -1)
1264 calc_next (dfa
, node
);
1265 re_node_set_init_2 (dfa
->edests
+ idx
, node
->left
->first
,
1268 else if (dfa
->nodes
[idx
].type
== OP_ALT
)
1271 if (node
->left
!= NULL
)
1273 if (node
->left
->first
== -1)
1274 calc_first (dfa
, node
->left
);
1275 left
= node
->left
->first
;
1279 if (node
->next
== -1)
1280 calc_next (dfa
, node
);
1283 if (node
->right
!= NULL
)
1285 if (node
->right
->first
== -1)
1286 calc_first (dfa
, node
->right
);
1287 right
= node
->right
->first
;
1291 if (node
->next
== -1)
1292 calc_next (dfa
, node
);
1295 re_node_set_init_2 (dfa
->edests
+ idx
, left
, right
);
1297 else if (dfa
->nodes
[idx
].type
== ANCHOR
1298 || dfa
->nodes
[idx
].type
== OP_OPEN_SUBEXP
1299 || dfa
->nodes
[idx
].type
== OP_CLOSE_SUBEXP
1300 || dfa
->nodes
[idx
].type
== OP_BACK_REF
)
1301 re_node_set_init_1 (dfa
->edests
+ idx
, node
->next
);
1303 assert (!IS_EPSILON_NODE (dfa
->nodes
[idx
].type
));
1307 /* Duplicate the epsilon closure of the node ROOT_NODE.
1308 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1309 to their own constraint. */
1311 static reg_errcode_t
1312 duplicate_node_closure (dfa
, top_org_node
, top_clone_node
, root_node
,
1315 int top_org_node
, top_clone_node
, root_node
;
1316 unsigned int init_constraint
;
1319 int org_node
, clone_node
, ret
;
1320 unsigned int constraint
= init_constraint
;
1321 for (org_node
= top_org_node
, clone_node
= top_clone_node
;;)
1323 int org_dest
, clone_dest
;
1324 if (dfa
->nodes
[org_node
].type
== OP_BACK_REF
)
1326 /* If the back reference epsilon-transit, its destination must
1327 also have the constraint. Then duplicate the epsilon closure
1328 of the destination of the back reference, and store it in
1329 edests of the back reference. */
1330 org_dest
= dfa
->nexts
[org_node
];
1331 re_node_set_empty (dfa
->edests
+ clone_node
);
1332 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1333 if (BE (err
!= REG_NOERROR
, 0))
1335 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1336 if (clone_dest
== -1)
1338 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1339 if (BE (ret
< 0, 0))
1342 else if (dfa
->edests
[org_node
].nelem
== 0)
1344 /* In case of the node can't epsilon-transit, don't duplicate the
1345 destination and store the original destination as the
1346 destination of the node. */
1347 dfa
->nexts
[clone_node
] = dfa
->nexts
[org_node
];
1350 else if (dfa
->edests
[org_node
].nelem
== 1)
1352 /* In case of the node can epsilon-transit, and it has only one
1354 org_dest
= dfa
->edests
[org_node
].elems
[0];
1355 re_node_set_empty (dfa
->edests
+ clone_node
);
1356 if (dfa
->nodes
[org_node
].type
== ANCHOR
)
1358 /* In case of the node has another constraint, append it. */
1359 if (org_node
== root_node
&& clone_node
!= org_node
)
1361 /* ...but if the node is root_node itself, it means the
1362 epsilon closure have a loop, then tie it to the
1363 destination of the root_node. */
1364 ret
= re_node_set_insert (dfa
->edests
+ clone_node
,
1366 if (BE (ret
< 0, 0))
1370 constraint
|= dfa
->nodes
[org_node
].opr
.ctx_type
;
1372 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1373 if (BE (err
!= REG_NOERROR
, 0))
1375 if (clone_dest
== -1)
1377 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1378 if (BE (ret
< 0, 0))
1381 else /* dfa->edests[org_node].nelem == 2 */
1383 /* In case of the node can epsilon-transit, and it has two
1384 destinations. E.g. '|', '*', '+', '?'. */
1385 org_dest
= dfa
->edests
[org_node
].elems
[0];
1386 re_node_set_empty (dfa
->edests
+ clone_node
);
1387 /* Search for a duplicated node which satisfies the constraint. */
1388 clone_dest
= search_duplicated_node (dfa
, org_dest
, constraint
);
1389 if (clone_dest
== -1)
1391 /* There are no such a duplicated node, create a new one. */
1392 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1393 if (BE (err
!= REG_NOERROR
, 0))
1395 if (clone_dest
!= -1)
1397 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1398 if (BE (ret
< 0, 0))
1400 err
= duplicate_node_closure (dfa
, org_dest
, clone_dest
,
1401 root_node
, constraint
);
1402 if (BE (err
!= REG_NOERROR
, 0))
1408 /* There are a duplicated node which satisfy the constraint,
1409 use it to avoid infinite loop. */
1410 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1411 if (BE (ret
< 0, 0))
1415 org_dest
= dfa
->edests
[org_node
].elems
[1];
1416 err
= duplicate_node (&clone_dest
, dfa
, org_dest
, constraint
);
1417 if (BE (err
!= REG_NOERROR
, 0))
1419 if (clone_dest
== -1)
1421 ret
= re_node_set_insert (dfa
->edests
+ clone_node
, clone_dest
);
1422 if (BE (ret
< 0, 0))
1425 org_node
= org_dest
;
1426 clone_node
= clone_dest
;
1431 /* Search for a node which is duplicated from the node ORG_NODE, and
1432 satisfies the constraint CONSTRAINT. */
1435 search_duplicated_node (dfa
, org_node
, constraint
)
1438 unsigned int constraint
;
1441 for (idx
= dfa
->nodes_len
- 1; dfa
->nodes
[idx
].duplicated
&& idx
> 0; --idx
)
1443 if (org_node
== dfa
->org_indices
[idx
]
1444 && constraint
== dfa
->nodes
[idx
].constraint
)
1445 return idx
; /* Found. */
1447 return -1; /* Not found. */
1450 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1451 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1452 otherwise return the error code. */
1454 static reg_errcode_t
1455 duplicate_node (new_idx
, dfa
, org_idx
, constraint
)
1457 int *new_idx
, org_idx
;
1458 unsigned int constraint
;
1462 if (dfa
->nodes
[org_idx
].type
== CHARACTER
1463 && (((constraint
& NEXT_WORD_CONSTRAINT
)
1464 && !dfa
->nodes
[org_idx
].word_char
)
1465 || ((constraint
& NEXT_NOTWORD_CONSTRAINT
)
1466 && dfa
->nodes
[org_idx
].word_char
)))
1468 /* \<!, \>W etc. can never match. Don't duplicate them, instead
1469 tell the caller they shouldn't be added to edests. */
1474 dup_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[org_idx
], 1);
1475 if (BE (dup_idx
== -1, 0))
1477 dfa
->nodes
[dup_idx
].constraint
= constraint
;
1478 if (dfa
->nodes
[org_idx
].type
== ANCHOR
)
1479 dfa
->nodes
[dup_idx
].constraint
|= dfa
->nodes
[org_idx
].opr
.ctx_type
;
1480 dfa
->nodes
[dup_idx
].duplicated
= 1;
1481 re_node_set_init_empty (dfa
->edests
+ dup_idx
);
1482 re_node_set_init_empty (dfa
->eclosures
+ dup_idx
);
1483 re_node_set_init_empty (dfa
->inveclosures
+ dup_idx
);
1485 /* Store the index of the original node. */
1486 dfa
->org_indices
[dup_idx
] = org_idx
;
1492 calc_inveclosure (dfa
)
1496 for (src
= 0; src
< dfa
->nodes_len
; ++src
)
1498 for (idx
= 0; idx
< dfa
->eclosures
[src
].nelem
; ++idx
)
1500 dest
= dfa
->eclosures
[src
].elems
[idx
];
1501 re_node_set_insert (dfa
->inveclosures
+ dest
, src
);
1506 /* Calculate "eclosure" for all the node in DFA. */
1508 static reg_errcode_t
1512 int node_idx
, incomplete
;
1514 assert (dfa
->nodes_len
> 0);
1517 /* For each nodes, calculate epsilon closure. */
1518 for (node_idx
= 0; ; ++node_idx
)
1521 re_node_set eclosure_elem
;
1522 if (node_idx
== dfa
->nodes_len
)
1531 assert (dfa
->eclosures
[node_idx
].nelem
!= -1);
1533 /* If we have already calculated, skip it. */
1534 if (dfa
->eclosures
[node_idx
].nelem
!= 0)
1536 /* Calculate epsilon closure of `node_idx'. */
1537 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, node_idx
, 1);
1538 if (BE (err
!= REG_NOERROR
, 0))
1541 if (dfa
->eclosures
[node_idx
].nelem
== 0)
1544 re_node_set_free (&eclosure_elem
);
1550 /* Calculate epsilon closure of NODE. */
1552 static reg_errcode_t
1553 calc_eclosure_iter (new_set
, dfa
, node
, root
)
1554 re_node_set
*new_set
;
1559 unsigned int constraint
;
1561 re_node_set eclosure
;
1563 err
= re_node_set_alloc (&eclosure
, dfa
->edests
[node
].nelem
+ 1);
1564 if (BE (err
!= REG_NOERROR
, 0))
1567 /* This indicates that we are calculating this node now.
1568 We reference this value to avoid infinite loop. */
1569 dfa
->eclosures
[node
].nelem
= -1;
1571 constraint
= ((dfa
->nodes
[node
].type
== ANCHOR
)
1572 ? dfa
->nodes
[node
].opr
.ctx_type
: 0);
1573 /* If the current node has constraints, duplicate all nodes.
1574 Since they must inherit the constraints. */
1575 if (constraint
&& !dfa
->nodes
[dfa
->edests
[node
].elems
[0]].duplicated
)
1577 int org_node
, cur_node
;
1578 org_node
= cur_node
= node
;
1579 err
= duplicate_node_closure (dfa
, node
, node
, node
, constraint
);
1580 if (BE (err
!= REG_NOERROR
, 0))
1584 /* Expand each epsilon destination nodes. */
1585 if (IS_EPSILON_NODE(dfa
->nodes
[node
].type
))
1586 for (i
= 0; i
< dfa
->edests
[node
].nelem
; ++i
)
1588 re_node_set eclosure_elem
;
1589 int edest
= dfa
->edests
[node
].elems
[i
];
1590 /* If calculating the epsilon closure of `edest' is in progress,
1591 return intermediate result. */
1592 if (dfa
->eclosures
[edest
].nelem
== -1)
1597 /* If we haven't calculated the epsilon closure of `edest' yet,
1598 calculate now. Otherwise use calculated epsilon closure. */
1599 if (dfa
->eclosures
[edest
].nelem
== 0)
1601 err
= calc_eclosure_iter (&eclosure_elem
, dfa
, edest
, 0);
1602 if (BE (err
!= REG_NOERROR
, 0))
1606 eclosure_elem
= dfa
->eclosures
[edest
];
1607 /* Merge the epsilon closure of `edest'. */
1608 re_node_set_merge (&eclosure
, &eclosure_elem
);
1609 /* If the epsilon closure of `edest' is incomplete,
1610 the epsilon closure of this node is also incomplete. */
1611 if (dfa
->eclosures
[edest
].nelem
== 0)
1614 re_node_set_free (&eclosure_elem
);
1618 /* Epsilon closures include itself. */
1619 re_node_set_insert (&eclosure
, node
);
1620 if (incomplete
&& !root
)
1621 dfa
->eclosures
[node
].nelem
= 0;
1623 dfa
->eclosures
[node
] = eclosure
;
1624 *new_set
= eclosure
;
1628 /* Functions for token which are used in the parser. */
1630 /* Fetch a token from INPUT.
1631 We must not use this function inside bracket expressions. */
1634 fetch_token (result
, input
, syntax
)
1637 reg_syntax_t syntax
;
1639 re_string_skip_bytes (input
, peek_token (result
, input
, syntax
));
1642 /* Peek a token from INPUT, and return the length of the token.
1643 We must not use this function inside bracket expressions. */
1646 peek_token (token
, input
, syntax
)
1649 reg_syntax_t syntax
;
1653 if (re_string_eoi (input
))
1655 token
->type
= END_OF_RE
;
1659 c
= re_string_peek_byte (input
, 0);
1662 token
->word_char
= 0;
1663 #ifdef RE_ENABLE_I18N
1664 token
->mb_partial
= 0;
1665 if (input
->mb_cur_max
> 1 &&
1666 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1668 token
->type
= CHARACTER
;
1669 token
->mb_partial
= 1;
1676 if (re_string_cur_idx (input
) + 1 >= re_string_length (input
))
1678 token
->type
= BACK_SLASH
;
1682 c2
= re_string_peek_byte_case (input
, 1);
1684 token
->type
= CHARACTER
;
1685 #ifdef RE_ENABLE_I18N
1686 if (input
->mb_cur_max
> 1)
1688 wint_t wc
= re_string_wchar_at (input
,
1689 re_string_cur_idx (input
) + 1);
1690 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1694 token
->word_char
= IS_WORD_CHAR (c2
) != 0;
1699 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_NO_BK_VBAR
))
1700 token
->type
= OP_ALT
;
1702 case '1': case '2': case '3': case '4': case '5':
1703 case '6': case '7': case '8': case '9':
1704 if (!(syntax
& RE_NO_BK_REFS
))
1706 token
->type
= OP_BACK_REF
;
1707 token
->opr
.idx
= c2
- '0';
1711 if (!(syntax
& RE_NO_GNU_OPS
))
1713 token
->type
= ANCHOR
;
1714 token
->opr
.idx
= WORD_FIRST
;
1718 if (!(syntax
& RE_NO_GNU_OPS
))
1720 token
->type
= ANCHOR
;
1721 token
->opr
.idx
= WORD_LAST
;
1725 if (!(syntax
& RE_NO_GNU_OPS
))
1727 token
->type
= ANCHOR
;
1728 token
->opr
.idx
= WORD_DELIM
;
1732 if (!(syntax
& RE_NO_GNU_OPS
))
1734 token
->type
= ANCHOR
;
1735 token
->opr
.idx
= INSIDE_WORD
;
1739 if (!(syntax
& RE_NO_GNU_OPS
))
1740 token
->type
= OP_WORD
;
1743 if (!(syntax
& RE_NO_GNU_OPS
))
1744 token
->type
= OP_NOTWORD
;
1747 if (!(syntax
& RE_NO_GNU_OPS
))
1748 token
->type
= OP_SPACE
;
1751 if (!(syntax
& RE_NO_GNU_OPS
))
1752 token
->type
= OP_NOTSPACE
;
1755 if (!(syntax
& RE_NO_GNU_OPS
))
1757 token
->type
= ANCHOR
;
1758 token
->opr
.idx
= BUF_FIRST
;
1762 if (!(syntax
& RE_NO_GNU_OPS
))
1764 token
->type
= ANCHOR
;
1765 token
->opr
.idx
= BUF_LAST
;
1769 if (!(syntax
& RE_NO_BK_PARENS
))
1770 token
->type
= OP_OPEN_SUBEXP
;
1773 if (!(syntax
& RE_NO_BK_PARENS
))
1774 token
->type
= OP_CLOSE_SUBEXP
;
1777 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1778 token
->type
= OP_DUP_PLUS
;
1781 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_BK_PLUS_QM
))
1782 token
->type
= OP_DUP_QUESTION
;
1785 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1786 token
->type
= OP_OPEN_DUP_NUM
;
1789 if ((syntax
& RE_INTERVALS
) && (!(syntax
& RE_NO_BK_BRACES
)))
1790 token
->type
= OP_CLOSE_DUP_NUM
;
1798 token
->type
= CHARACTER
;
1799 #ifdef RE_ENABLE_I18N
1800 if (input
->mb_cur_max
> 1)
1802 wint_t wc
= re_string_wchar_at (input
, re_string_cur_idx (input
));
1803 token
->word_char
= IS_WIDE_WORD_CHAR (wc
) != 0;
1807 token
->word_char
= IS_WORD_CHAR (token
->opr
.c
);
1812 if (syntax
& RE_NEWLINE_ALT
)
1813 token
->type
= OP_ALT
;
1816 if (!(syntax
& RE_LIMITED_OPS
) && (syntax
& RE_NO_BK_VBAR
))
1817 token
->type
= OP_ALT
;
1820 token
->type
= OP_DUP_ASTERISK
;
1823 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1824 token
->type
= OP_DUP_PLUS
;
1827 if (!(syntax
& RE_LIMITED_OPS
) && !(syntax
& RE_BK_PLUS_QM
))
1828 token
->type
= OP_DUP_QUESTION
;
1831 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1832 token
->type
= OP_OPEN_DUP_NUM
;
1835 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
1836 token
->type
= OP_CLOSE_DUP_NUM
;
1839 if (syntax
& RE_NO_BK_PARENS
)
1840 token
->type
= OP_OPEN_SUBEXP
;
1843 if (syntax
& RE_NO_BK_PARENS
)
1844 token
->type
= OP_CLOSE_SUBEXP
;
1847 token
->type
= OP_OPEN_BRACKET
;
1850 token
->type
= OP_PERIOD
;
1853 if (!(syntax
& (RE_CONTEXT_INDEP_ANCHORS
| RE_CARET_ANCHORS_HERE
)) &&
1854 re_string_cur_idx (input
) != 0)
1856 char prev
= re_string_peek_byte (input
, -1);
1857 if (!(syntax
& RE_NEWLINE_ALT
) || prev
!= '\n')
1860 token
->type
= ANCHOR
;
1861 token
->opr
.idx
= LINE_FIRST
;
1864 if (!(syntax
& RE_CONTEXT_INDEP_ANCHORS
) &&
1865 re_string_cur_idx (input
) + 1 != re_string_length (input
))
1868 re_string_skip_bytes (input
, 1);
1869 peek_token (&next
, input
, syntax
);
1870 re_string_skip_bytes (input
, -1);
1871 if (next
.type
!= OP_ALT
&& next
.type
!= OP_CLOSE_SUBEXP
)
1874 token
->type
= ANCHOR
;
1875 token
->opr
.idx
= LINE_LAST
;
1883 /* Peek a token from INPUT, and return the length of the token.
1884 We must not use this function out of bracket expressions. */
1887 peek_token_bracket (token
, input
, syntax
)
1890 reg_syntax_t syntax
;
1893 if (re_string_eoi (input
))
1895 token
->type
= END_OF_RE
;
1898 c
= re_string_peek_byte (input
, 0);
1901 #ifdef RE_ENABLE_I18N
1902 if (input
->mb_cur_max
> 1 &&
1903 !re_string_first_byte (input
, re_string_cur_idx (input
)))
1905 token
->type
= CHARACTER
;
1908 #endif /* RE_ENABLE_I18N */
1910 if (c
== '\\' && (syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1912 /* In this case, '\' escape a character. */
1914 re_string_skip_bytes (input
, 1);
1915 c2
= re_string_peek_byte (input
, 0);
1917 token
->type
= CHARACTER
;
1920 if (c
== '[') /* '[' is a special char in a bracket exps. */
1924 c2
= re_string_peek_byte (input
, 1);
1930 token
->type
= OP_OPEN_COLL_ELEM
;
1933 token
->type
= OP_OPEN_EQUIV_CLASS
;
1936 if (syntax
& RE_CHAR_CLASSES
)
1938 token
->type
= OP_OPEN_CHAR_CLASS
;
1941 /* else fall through. */
1943 token
->type
= CHARACTER
;
1953 token
->type
= OP_CHARSET_RANGE
;
1956 token
->type
= OP_CLOSE_BRACKET
;
1959 token
->type
= OP_NON_MATCH_LIST
;
1962 token
->type
= CHARACTER
;
1967 /* Functions for parser. */
1969 /* Entry point of the parser.
1970 Parse the regular expression REGEXP and return the structure tree.
1971 If an error is occured, ERR is set by error code, and return NULL.
1972 This function build the following tree, from regular expression <reg_exp>:
1978 CAT means concatenation.
1979 EOR means end of regular expression. */
1982 parse (regexp
, preg
, syntax
, err
)
1983 re_string_t
*regexp
;
1985 reg_syntax_t syntax
;
1988 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1989 bin_tree_t
*tree
, *eor
, *root
;
1990 re_token_t current_token
;
1991 fetch_token (¤t_token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
1992 tree
= parse_reg_exp (regexp
, preg
, ¤t_token
, syntax
, 0, err
);
1993 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
1995 eor
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, ¤t_token
);
1997 root
= create_tree (dfa
, tree
, eor
, CONCAT
, 0);
2000 if (BE (eor
== NULL
|| root
== NULL
, 0))
2008 /* This function build the following tree, from regular expression
2009 <branch1>|<branch2>:
2015 ALT means alternative, which represents the operator `|'. */
2018 parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2019 re_string_t
*regexp
;
2022 reg_syntax_t syntax
;
2026 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2027 bin_tree_t
*tree
, *branch
= NULL
;
2028 tree
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2029 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2032 while (token
->type
== OP_ALT
)
2034 re_token_t alt_token
= *token
;
2035 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2036 if (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2037 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2039 branch
= parse_branch (regexp
, preg
, token
, syntax
, nest
, err
);
2040 if (BE (*err
!= REG_NOERROR
&& branch
== NULL
, 0))
2045 tree
= re_dfa_add_tree_node (dfa
, tree
, branch
, &alt_token
);
2046 if (BE (tree
== NULL
, 0))
2051 dfa
->has_plural_match
= 1;
2056 /* This function build the following tree, from regular expression
2063 CAT means concatenation. */
2066 parse_branch (regexp
, preg
, token
, syntax
, nest
, err
)
2067 re_string_t
*regexp
;
2070 reg_syntax_t syntax
;
2074 bin_tree_t
*tree
, *exp
;
2075 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2076 tree
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2077 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2080 while (token
->type
!= OP_ALT
&& token
->type
!= END_OF_RE
2081 && (nest
== 0 || token
->type
!= OP_CLOSE_SUBEXP
))
2083 exp
= parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2084 if (BE (*err
!= REG_NOERROR
&& exp
== NULL
, 0))
2088 if (tree
!= NULL
&& exp
!= NULL
)
2090 tree
= create_tree (dfa
, tree
, exp
, CONCAT
, 0);
2097 else if (tree
== NULL
)
2099 /* Otherwise exp == NULL, we don't need to create new tree. */
2104 /* This function build the following tree, from regular expression a*:
2111 parse_expression (regexp
, preg
, token
, syntax
, nest
, err
)
2112 re_string_t
*regexp
;
2115 reg_syntax_t syntax
;
2119 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2121 switch (token
->type
)
2124 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2125 if (BE (tree
== NULL
, 0))
2130 #ifdef RE_ENABLE_I18N
2131 if (dfa
->mb_cur_max
> 1)
2133 while (!re_string_eoi (regexp
)
2134 && !re_string_first_byte (regexp
, re_string_cur_idx (regexp
)))
2136 bin_tree_t
*mbc_remain
;
2137 fetch_token (token
, regexp
, syntax
);
2138 mbc_remain
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2139 tree
= create_tree (dfa
, tree
, mbc_remain
, CONCAT
, 0);
2140 if (BE (mbc_remain
== NULL
|| tree
== NULL
, 0))
2149 case OP_OPEN_SUBEXP
:
2150 tree
= parse_sub_exp (regexp
, preg
, token
, syntax
, nest
+ 1, err
);
2151 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2154 case OP_OPEN_BRACKET
:
2155 tree
= parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
);
2156 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2160 if (BE (preg
->re_nsub
< token
->opr
.idx
2161 || dfa
->subexps
[token
->opr
.idx
- 1].end
== -1, 0))
2166 dfa
->used_bkref_map
|= 1 << (token
->opr
.idx
- 1);
2167 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2168 if (BE (tree
== NULL
, 0))
2174 dfa
->has_mb_node
= 1;
2176 case OP_OPEN_DUP_NUM
:
2177 if (syntax
& RE_CONTEXT_INVALID_DUP
)
2183 case OP_DUP_ASTERISK
:
2185 case OP_DUP_QUESTION
:
2186 if (syntax
& RE_CONTEXT_INVALID_OPS
)
2191 else if (syntax
& RE_CONTEXT_INDEP_OPS
)
2193 fetch_token (token
, regexp
, syntax
);
2194 return parse_expression (regexp
, preg
, token
, syntax
, nest
, err
);
2196 /* else fall through */
2197 case OP_CLOSE_SUBEXP
:
2198 if ((token
->type
== OP_CLOSE_SUBEXP
) &&
2199 !(syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
))
2204 /* else fall through */
2205 case OP_CLOSE_DUP_NUM
:
2206 /* We treat it as a normal character. */
2208 /* Then we can these characters as normal characters. */
2209 token
->type
= CHARACTER
;
2210 /* mb_partial and word_char bits should be initialized already
2212 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2213 if (BE (tree
== NULL
, 0))
2220 if (dfa
->word_char
== NULL
)
2222 *err
= init_word_char (dfa
);
2223 if (BE (*err
!= REG_NOERROR
, 0))
2226 if (token
->opr
.ctx_type
== WORD_DELIM
)
2228 bin_tree_t
*tree_first
, *tree_last
;
2229 token
->opr
.ctx_type
= WORD_FIRST
;
2230 tree_first
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2231 token
->opr
.ctx_type
= WORD_LAST
;
2232 tree_last
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2233 token
->type
= OP_ALT
;
2234 tree
= re_dfa_add_tree_node (dfa
, tree_first
, tree_last
, token
);
2235 if (BE (tree_first
== NULL
|| tree_last
== NULL
|| tree
== NULL
, 0))
2243 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2244 if (BE (tree
== NULL
, 0))
2250 /* We must return here, since ANCHORs can't be followed
2251 by repetition operators.
2252 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2253 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2254 fetch_token (token
, regexp
, syntax
);
2257 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2258 if (BE (tree
== NULL
, 0))
2263 if (dfa
->mb_cur_max
> 1)
2264 dfa
->has_mb_node
= 1;
2267 tree
= build_charclass_op (dfa
, regexp
->trans
, "alnum", "_", 0, err
);
2268 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2272 tree
= build_charclass_op (dfa
, regexp
->trans
, "alnum", "_", 1, err
);
2273 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2277 tree
= build_charclass_op (dfa
, regexp
->trans
, "space", "", 0, err
);
2278 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2282 tree
= build_charclass_op (dfa
, regexp
->trans
, "space", "", 1, err
);
2283 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2293 /* Must not happen? */
2299 fetch_token (token
, regexp
, syntax
);
2301 while (token
->type
== OP_DUP_ASTERISK
|| token
->type
== OP_DUP_PLUS
2302 || token
->type
== OP_DUP_QUESTION
|| token
->type
== OP_OPEN_DUP_NUM
)
2304 tree
= parse_dup_op (tree
, regexp
, dfa
, token
, syntax
, err
);
2305 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2307 /* In BRE consecutive duplications are not allowed. */
2308 if ((syntax
& RE_CONTEXT_INVALID_DUP
)
2309 && (token
->type
== OP_DUP_ASTERISK
2310 || token
->type
== OP_OPEN_DUP_NUM
))
2315 dfa
->has_plural_match
= 1;
2321 /* This function build the following tree, from regular expression
2329 parse_sub_exp (regexp
, preg
, token
, syntax
, nest
, err
)
2330 re_string_t
*regexp
;
2333 reg_syntax_t syntax
;
2337 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2338 bin_tree_t
*tree
, *left_par
, *right_par
;
2340 cur_nsub
= preg
->re_nsub
++;
2341 if (BE (dfa
->subexps_alloc
< preg
->re_nsub
, 0))
2343 re_subexp_t
*new_array
;
2344 dfa
->subexps_alloc
*= 2;
2345 new_array
= re_realloc (dfa
->subexps
, re_subexp_t
, dfa
->subexps_alloc
);
2346 if (BE (new_array
== NULL
, 0))
2348 dfa
->subexps_alloc
/= 2;
2352 dfa
->subexps
= new_array
;
2354 dfa
->subexps
[cur_nsub
].start
= dfa
->nodes_len
;
2355 dfa
->subexps
[cur_nsub
].end
= -1;
2357 left_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2358 if (BE (left_par
== NULL
, 0))
2363 dfa
->nodes
[left_par
->node_idx
].opr
.idx
= cur_nsub
;
2364 fetch_token (token
, regexp
, syntax
| RE_CARET_ANCHORS_HERE
);
2366 /* The subexpression may be a null string. */
2367 if (token
->type
== OP_CLOSE_SUBEXP
)
2371 tree
= parse_reg_exp (regexp
, preg
, token
, syntax
, nest
, err
);
2372 if (BE (*err
!= REG_NOERROR
&& tree
== NULL
, 0))
2375 if (BE (token
->type
!= OP_CLOSE_SUBEXP
, 0))
2380 right_par
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, token
);
2381 dfa
->subexps
[cur_nsub
].end
= dfa
->nodes_len
;
2382 tree
= ((tree
== NULL
) ? right_par
2383 : create_tree (dfa
, tree
, right_par
, CONCAT
, 0));
2384 tree
= create_tree (dfa
, left_par
, tree
, CONCAT
, 0);
2385 if (BE (right_par
== NULL
|| tree
== NULL
, 0))
2390 dfa
->nodes
[right_par
->node_idx
].opr
.idx
= cur_nsub
;
2395 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2398 parse_dup_op (dup_elem
, regexp
, dfa
, token
, syntax
, err
)
2399 bin_tree_t
*dup_elem
;
2400 re_string_t
*regexp
;
2403 reg_syntax_t syntax
;
2406 re_token_t dup_token
;
2407 bin_tree_t
*tree
= dup_elem
, *work_tree
;
2408 int start_idx
= re_string_cur_idx (regexp
);
2409 re_token_t start_token
= *token
;
2410 if (token
->type
== OP_OPEN_DUP_NUM
)
2414 int start
= fetch_number (regexp
, token
, syntax
);
2418 if (token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2419 start
= 0; /* We treat "{,m}" as "{0,m}". */
2422 *err
= REG_BADBR
; /* <re>{} is invalid. */
2426 if (BE (start
!= -2, 1))
2428 /* We treat "{n}" as "{n,n}". */
2429 end
= ((token
->type
== OP_CLOSE_DUP_NUM
) ? start
2430 : ((token
->type
== CHARACTER
&& token
->opr
.c
== ',')
2431 ? fetch_number (regexp
, token
, syntax
) : -2));
2433 if (BE (start
== -2 || end
== -2, 0))
2435 /* Invalid sequence. */
2436 if (token
->type
== OP_CLOSE_DUP_NUM
)
2437 goto parse_dup_op_invalid_interval
;
2439 goto parse_dup_op_ebrace
;
2441 if (BE (start
== 0 && end
== 0, 0))
2443 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2444 fetch_token (token
, regexp
, syntax
);
2448 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2450 for (i
= 0; i
< start
; ++i
)
2453 work_tree
= duplicate_tree (elem
, dfa
);
2454 tree
= create_tree (dfa
, tree
, work_tree
, CONCAT
, 0);
2455 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2456 goto parse_dup_op_espace
;
2461 /* We treat "<re>{0,}" as "<re>*". */
2462 dup_token
.type
= OP_DUP_ASTERISK
;
2465 elem
= duplicate_tree (elem
, dfa
);
2466 work_tree
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2467 tree
= create_tree (dfa
, tree
, work_tree
, CONCAT
, 0);
2468 if (BE (elem
== NULL
|| work_tree
== NULL
|| tree
== NULL
, 0))
2469 goto parse_dup_op_espace
;
2473 tree
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2474 if (BE (tree
== NULL
, 0))
2475 goto parse_dup_op_espace
;
2478 else if (BE (start
> end
, 0))
2480 /* First number greater than first. */
2484 else if (end
- start
> 0)
2486 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2487 dup_token
.type
= OP_DUP_QUESTION
;
2490 elem
= duplicate_tree (elem
, dfa
);
2491 elem
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2492 tree
= create_tree (dfa
, tree
, elem
, CONCAT
, 0);
2493 if (BE (elem
== NULL
|| tree
== NULL
, 0))
2494 goto parse_dup_op_espace
;
2498 tree
= elem
= re_dfa_add_tree_node (dfa
, elem
, NULL
, &dup_token
);
2499 if (BE (tree
== NULL
, 0))
2500 goto parse_dup_op_espace
;
2502 for (i
= 1; i
< end
- start
; ++i
)
2504 work_tree
= duplicate_tree (elem
, dfa
);
2505 tree
= create_tree (dfa
, tree
, work_tree
, CONCAT
, 0);
2506 if (BE (work_tree
== NULL
|| tree
== NULL
, 0))
2516 tree
= re_dfa_add_tree_node (dfa
, tree
, NULL
, token
);
2517 if (BE (tree
== NULL
, 0))
2523 fetch_token (token
, regexp
, syntax
);
2526 parse_dup_op_espace
:
2530 parse_dup_op_ebrace
:
2531 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2536 goto parse_dup_op_rollback
;
2537 parse_dup_op_invalid_interval
:
2538 if (BE (!(syntax
& RE_INVALID_INTERVAL_ORD
), 0))
2543 parse_dup_op_rollback
:
2544 re_string_set_index (regexp
, start_idx
);
2545 *token
= start_token
;
2546 token
->type
= CHARACTER
;
2547 /* mb_partial and word_char bits should be already initialized by
2552 /* Size of the names for collating symbol/equivalence_class/character_class.
2553 I'm not sure, but maybe enough. */
2554 #define BRACKET_NAME_BUF_SIZE 32
2557 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2558 Build the range expression which starts from START_ELEM, and ends
2559 at END_ELEM. The result are written to MBCSET and SBCSET.
2560 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2561 mbcset->range_ends, is a pointer argument sinse we may
2564 static reg_errcode_t
2565 # ifdef RE_ENABLE_I18N
2566 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2567 re_charset_t
*mbcset
;
2569 # else /* not RE_ENABLE_I18N */
2570 build_range_exp (sbcset
, start_elem
, end_elem
)
2571 # endif /* not RE_ENABLE_I18N */
2572 re_bitset_ptr_t sbcset
;
2573 bracket_elem_t
*start_elem
, *end_elem
;
2575 unsigned int start_ch
, end_ch
;
2576 /* Equivalence Classes and Character Classes can't be a range start/end. */
2577 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2578 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2582 /* We can handle no multi character collating elements without libc
2584 if (BE ((start_elem
->type
== COLL_SYM
2585 && strlen ((char *) start_elem
->opr
.name
) > 1)
2586 || (end_elem
->type
== COLL_SYM
2587 && strlen ((char *) end_elem
->opr
.name
) > 1), 0))
2588 return REG_ECOLLATE
;
2590 # ifdef RE_ENABLE_I18N
2592 wchar_t wc
, start_wc
, end_wc
;
2593 wchar_t cmp_buf
[6] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2595 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2596 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2598 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2599 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2601 start_wc
= ((start_elem
->type
== SB_CHAR
|| start_elem
->type
== COLL_SYM
)
2602 ? __btowc (start_ch
) : start_elem
->opr
.wch
);
2603 end_wc
= ((end_elem
->type
== SB_CHAR
|| end_elem
->type
== COLL_SYM
)
2604 ? __btowc (end_ch
) : end_elem
->opr
.wch
);
2605 cmp_buf
[0] = start_wc
;
2606 cmp_buf
[4] = end_wc
;
2607 if (wcscoll (cmp_buf
, cmp_buf
+ 4) > 0)
2610 /* Check the space of the arrays. */
2611 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2613 /* There are not enough space, need realloc. */
2614 wchar_t *new_array_start
, *new_array_end
;
2617 /* +1 in case of mbcset->nranges is 0. */
2618 new_nranges
= 2 * mbcset
->nranges
+ 1;
2619 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2620 are NULL if *range_alloc == 0. */
2621 new_array_start
= re_realloc (mbcset
->range_starts
, wchar_t,
2623 new_array_end
= re_realloc (mbcset
->range_ends
, wchar_t,
2626 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2629 mbcset
->range_starts
= new_array_start
;
2630 mbcset
->range_ends
= new_array_end
;
2631 *range_alloc
= new_nranges
;
2634 mbcset
->range_starts
[mbcset
->nranges
] = start_wc
;
2635 mbcset
->range_ends
[mbcset
->nranges
++] = end_wc
;
2637 /* Build the table for single byte characters. */
2638 for (wc
= 0; wc
<= SBC_MAX
; ++wc
)
2641 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2642 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2643 bitset_set (sbcset
, wc
);
2646 # else /* not RE_ENABLE_I18N */
2649 start_ch
= ((start_elem
->type
== SB_CHAR
) ? start_elem
->opr
.ch
2650 : ((start_elem
->type
== COLL_SYM
) ? start_elem
->opr
.name
[0]
2652 end_ch
= ((end_elem
->type
== SB_CHAR
) ? end_elem
->opr
.ch
2653 : ((end_elem
->type
== COLL_SYM
) ? end_elem
->opr
.name
[0]
2655 if (start_ch
> end_ch
)
2657 /* Build the table for single byte characters. */
2658 for (ch
= 0; ch
<= SBC_MAX
; ++ch
)
2659 if (start_ch
<= ch
&& ch
<= end_ch
)
2660 bitset_set (sbcset
, ch
);
2662 # endif /* not RE_ENABLE_I18N */
2665 #endif /* not _LIBC */
2668 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2669 Build the collating element which is represented by NAME.
2670 The result are written to MBCSET and SBCSET.
2671 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2672 pointer argument since we may update it. */
2674 static reg_errcode_t
2675 # ifdef RE_ENABLE_I18N
2676 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2677 re_charset_t
*mbcset
;
2678 int *coll_sym_alloc
;
2679 # else /* not RE_ENABLE_I18N */
2680 build_collating_symbol (sbcset
, name
)
2681 # endif /* not RE_ENABLE_I18N */
2682 re_bitset_ptr_t sbcset
;
2683 const unsigned char *name
;
2685 size_t name_len
= strlen ((const char *) name
);
2686 if (BE (name_len
!= 1, 0))
2687 return REG_ECOLLATE
;
2690 bitset_set (sbcset
, name
[0]);
2694 #endif /* not _LIBC */
2696 /* This function parse bracket expression like "[abc]", "[a-c]",
2700 parse_bracket_exp (regexp
, dfa
, token
, syntax
, err
)
2701 re_string_t
*regexp
;
2704 reg_syntax_t syntax
;
2708 const unsigned char *collseqmb
;
2709 const char *collseqwc
;
2712 const int32_t *symb_table
;
2713 const unsigned char *extra
;
2715 /* Local function for parse_bracket_exp used in _LIBC environement.
2716 Seek the collating symbol entry correspondings to NAME.
2717 Return the index of the symbol in the SYMB_TABLE. */
2719 static inline int32_t
2720 __attribute ((always_inline
))
2721 seek_collating_symbol_entry (name
, name_len
)
2722 const unsigned char *name
;
2725 int32_t hash
= elem_hash ((const char *) name
, name_len
);
2726 int32_t elem
= hash
% table_size
;
2727 int32_t second
= hash
% (table_size
- 2);
2728 while (symb_table
[2 * elem
] != 0)
2730 /* First compare the hashing value. */
2731 if (symb_table
[2 * elem
] == hash
2732 /* Compare the length of the name. */
2733 && name_len
== extra
[symb_table
[2 * elem
+ 1]]
2734 /* Compare the name. */
2735 && memcmp (name
, &extra
[symb_table
[2 * elem
+ 1] + 1],
2738 /* Yep, this is the entry. */
2748 /* Local function for parse_bracket_exp used in _LIBC environement.
2749 Look up the collation sequence value of BR_ELEM.
2750 Return the value if succeeded, UINT_MAX otherwise. */
2752 static inline unsigned int
2753 __attribute ((always_inline
))
2754 lookup_collation_sequence_value (br_elem
)
2755 bracket_elem_t
*br_elem
;
2757 if (br_elem
->type
== SB_CHAR
)
2760 if (MB_CUR_MAX == 1)
2763 return collseqmb
[br_elem
->opr
.ch
];
2766 wint_t wc
= __btowc (br_elem
->opr
.ch
);
2767 return __collseq_table_lookup (collseqwc
, wc
);
2770 else if (br_elem
->type
== MB_CHAR
)
2772 return __collseq_table_lookup (collseqwc
, br_elem
->opr
.wch
);
2774 else if (br_elem
->type
== COLL_SYM
)
2776 size_t sym_name_len
= strlen ((char *) br_elem
->opr
.name
);
2780 elem
= seek_collating_symbol_entry (br_elem
->opr
.name
,
2782 if (symb_table
[2 * elem
] != 0)
2784 /* We found the entry. */
2785 idx
= symb_table
[2 * elem
+ 1];
2786 /* Skip the name of collating element name. */
2787 idx
+= 1 + extra
[idx
];
2788 /* Skip the byte sequence of the collating element. */
2789 idx
+= 1 + extra
[idx
];
2790 /* Adjust for the alignment. */
2791 idx
= (idx
+ 3) & ~3;
2792 /* Skip the multibyte collation sequence value. */
2793 idx
+= sizeof (unsigned int);
2794 /* Skip the wide char sequence of the collating element. */
2795 idx
+= sizeof (unsigned int) *
2796 (1 + *(unsigned int *) (extra
+ idx
));
2797 /* Return the collation sequence value. */
2798 return *(unsigned int *) (extra
+ idx
);
2800 else if (symb_table
[2 * elem
] == 0 && sym_name_len
== 1)
2802 /* No valid character. Match it as a single byte
2804 return collseqmb
[br_elem
->opr
.name
[0]];
2807 else if (sym_name_len
== 1)
2808 return collseqmb
[br_elem
->opr
.name
[0]];
2813 /* Local function for parse_bracket_exp used in _LIBC environement.
2814 Build the range expression which starts from START_ELEM, and ends
2815 at END_ELEM. The result are written to MBCSET and SBCSET.
2816 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2817 mbcset->range_ends, is a pointer argument sinse we may
2820 static inline reg_errcode_t
2821 __attribute ((always_inline
))
2822 # ifdef RE_ENABLE_I18N
2823 build_range_exp (sbcset
, mbcset
, range_alloc
, start_elem
, end_elem
)
2824 re_charset_t
*mbcset
;
2826 # else /* not RE_ENABLE_I18N */
2827 build_range_exp (sbcset
, start_elem
, end_elem
)
2828 # endif /* not RE_ENABLE_I18N */
2829 re_bitset_ptr_t sbcset
;
2830 bracket_elem_t
*start_elem
, *end_elem
;
2833 uint32_t start_collseq
;
2834 uint32_t end_collseq
;
2836 # ifdef RE_ENABLE_I18N
2837 /* Check the space of the arrays. */
2838 if (BE (*range_alloc
== mbcset
->nranges
, 0))
2840 /* There are not enough space, need realloc. */
2841 uint32_t *new_array_start
;
2842 uint32_t *new_array_end
;
2845 /* +1 in case of mbcset->nranges is 0. */
2846 new_nranges
= 2 * mbcset
->nranges
+ 1;
2847 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2848 are NULL if *range_alloc == 0. */
2849 new_array_start
= re_realloc (mbcset
->range_starts
, uint32_t,
2851 new_array_end
= re_realloc (mbcset
->range_ends
, uint32_t,
2854 if (BE (new_array_start
== NULL
|| new_array_end
== NULL
, 0))
2857 mbcset
->range_starts
= new_array_start
;
2858 mbcset
->range_ends
= new_array_end
;
2859 *range_alloc
= new_nranges
;
2861 # endif /* RE_ENABLE_I18N */
2863 /* Equivalence Classes and Character Classes can't be a range
2865 if (BE (start_elem
->type
== EQUIV_CLASS
|| start_elem
->type
== CHAR_CLASS
2866 || end_elem
->type
== EQUIV_CLASS
|| end_elem
->type
== CHAR_CLASS
,
2870 start_collseq
= lookup_collation_sequence_value (start_elem
);
2871 end_collseq
= lookup_collation_sequence_value (end_elem
);
2872 /* Check start/end collation sequence values. */
2873 if (BE (start_collseq
== UINT_MAX
|| end_collseq
== UINT_MAX
, 0))
2874 return REG_ECOLLATE
;
2875 if (BE ((syntax
& RE_NO_EMPTY_RANGES
) && start_collseq
> end_collseq
, 0))
2878 # ifdef RE_ENABLE_I18N
2879 /* Got valid collation sequence values, add them as a new entry. */
2880 mbcset
->range_starts
[mbcset
->nranges
] = start_collseq
;
2881 mbcset
->range_ends
[mbcset
->nranges
++] = end_collseq
;
2882 # endif /* RE_ENABLE_I18N */
2884 /* Build the table for single byte characters. */
2885 for (ch
= 0; ch
<= SBC_MAX
; ch
++)
2887 uint32_t ch_collseq
;
2889 if (MB_CUR_MAX == 1)
2892 ch_collseq
= collseqmb
[ch
];
2894 ch_collseq
= __collseq_table_lookup (collseqwc
, __btowc (ch
));
2895 if (start_collseq
<= ch_collseq
&& ch_collseq
<= end_collseq
)
2896 bitset_set (sbcset
, ch
);
2901 /* Local function for parse_bracket_exp used in _LIBC environement.
2902 Build the collating element which is represented by NAME.
2903 The result are written to MBCSET and SBCSET.
2904 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2905 pointer argument sinse we may update it. */
2907 static inline reg_errcode_t
2908 __attribute ((always_inline
))
2909 # ifdef RE_ENABLE_I18N
2910 build_collating_symbol (sbcset
, mbcset
, coll_sym_alloc
, name
)
2911 re_charset_t
*mbcset
;
2912 int *coll_sym_alloc
;
2913 # else /* not RE_ENABLE_I18N */
2914 build_collating_symbol (sbcset
, name
)
2915 # endif /* not RE_ENABLE_I18N */
2916 re_bitset_ptr_t sbcset
;
2917 const unsigned char *name
;
2920 size_t name_len
= strlen ((const char *) name
);
2923 elem
= seek_collating_symbol_entry (name
, name_len
);
2924 if (symb_table
[2 * elem
] != 0)
2926 /* We found the entry. */
2927 idx
= symb_table
[2 * elem
+ 1];
2928 /* Skip the name of collating element name. */
2929 idx
+= 1 + extra
[idx
];
2931 else if (symb_table
[2 * elem
] == 0 && name_len
== 1)
2933 /* No valid character, treat it as a normal
2935 bitset_set (sbcset
, name
[0]);
2939 return REG_ECOLLATE
;
2941 # ifdef RE_ENABLE_I18N
2942 /* Got valid collation sequence, add it as a new entry. */
2943 /* Check the space of the arrays. */
2944 if (BE (*coll_sym_alloc
== mbcset
->ncoll_syms
, 0))
2946 /* Not enough, realloc it. */
2947 /* +1 in case of mbcset->ncoll_syms is 0. */
2948 int new_coll_sym_alloc
= 2 * mbcset
->ncoll_syms
+ 1;
2949 /* Use realloc since mbcset->coll_syms is NULL
2951 int32_t *new_coll_syms
= re_realloc (mbcset
->coll_syms
, int32_t,
2952 new_coll_sym_alloc
);
2953 if (BE (new_coll_syms
== NULL
, 0))
2955 mbcset
->coll_syms
= new_coll_syms
;
2956 *coll_sym_alloc
= new_coll_sym_alloc
;
2958 mbcset
->coll_syms
[mbcset
->ncoll_syms
++] = idx
;
2959 # endif /* RE_ENABLE_I18N */
2964 if (BE (name_len
!= 1, 0))
2965 return REG_ECOLLATE
;
2968 bitset_set (sbcset
, name
[0]);
2975 re_token_t br_token
;
2976 re_bitset_ptr_t sbcset
;
2977 #ifdef RE_ENABLE_I18N
2978 re_charset_t
*mbcset
;
2979 int coll_sym_alloc
= 0, range_alloc
= 0, mbchar_alloc
= 0;
2980 int equiv_class_alloc
= 0, char_class_alloc
= 0;
2981 #else /* not RE_ENABLE_I18N */
2983 #endif /* not RE_ENABLE_I18N */
2984 bin_tree_t
*work_tree
;
2987 collseqmb
= (const unsigned char *)
2988 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2989 nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2995 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2996 table_size
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_SYMB_HASH_SIZEMB
);
2997 symb_table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
2998 _NL_COLLATE_SYMB_TABLEMB
);
2999 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3000 _NL_COLLATE_SYMB_EXTRAMB
);
3003 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3004 #ifdef RE_ENABLE_I18N
3005 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3006 #endif /* RE_ENABLE_I18N */
3007 #ifdef RE_ENABLE_I18N
3008 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3010 if (BE (sbcset
== NULL
, 0))
3011 #endif /* RE_ENABLE_I18N */
3017 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3018 if (BE (token
->type
== END_OF_RE
, 0))
3021 goto parse_bracket_exp_free_return
;
3023 if (token
->type
== OP_NON_MATCH_LIST
)
3025 #ifdef RE_ENABLE_I18N
3026 mbcset
->non_match
= 1;
3027 #else /* not RE_ENABLE_I18N */
3029 #endif /* not RE_ENABLE_I18N */
3030 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
3031 bitset_set (sbcset
, '\0');
3032 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3033 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3034 if (BE (token
->type
== END_OF_RE
, 0))
3037 goto parse_bracket_exp_free_return
;
3041 /* We treat the first ']' as a normal character. */
3042 if (token
->type
== OP_CLOSE_BRACKET
)
3043 token
->type
= CHARACTER
;
3045 int first_round
= 1;
3048 bracket_elem_t start_elem
, end_elem
;
3049 unsigned char start_name_buf
[BRACKET_NAME_BUF_SIZE
];
3050 unsigned char end_name_buf
[BRACKET_NAME_BUF_SIZE
];
3052 int token_len2
= 0, is_range_exp
= 0;
3055 start_elem
.opr
.name
= start_name_buf
;
3056 ret
= parse_bracket_element (&start_elem
, regexp
, token
, token_len
, dfa
,
3057 syntax
, first_round
);
3058 if (BE (ret
!= REG_NOERROR
, 0))
3061 goto parse_bracket_exp_free_return
;
3065 /* Get information about the next token. We need it in any case. */
3066 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3068 /* Do not check for ranges if we know they are not allowed. */
3069 if (start_elem
.type
!= CHAR_CLASS
&& start_elem
.type
!= EQUIV_CLASS
)
3071 if (BE (token
->type
== END_OF_RE
, 0))
3074 goto parse_bracket_exp_free_return
;
3076 if (token
->type
== OP_CHARSET_RANGE
)
3078 re_string_skip_bytes (regexp
, token_len
); /* Skip '-'. */
3079 token_len2
= peek_token_bracket (&token2
, regexp
, syntax
);
3080 if (BE (token2
.type
== END_OF_RE
, 0))
3083 goto parse_bracket_exp_free_return
;
3085 if (token2
.type
== OP_CLOSE_BRACKET
)
3087 /* We treat the last '-' as a normal character. */
3088 re_string_skip_bytes (regexp
, -token_len
);
3089 token
->type
= CHARACTER
;
3096 if (is_range_exp
== 1)
3098 end_elem
.opr
.name
= end_name_buf
;
3099 ret
= parse_bracket_element (&end_elem
, regexp
, &token2
, token_len2
,
3101 if (BE (ret
!= REG_NOERROR
, 0))
3104 goto parse_bracket_exp_free_return
;
3107 token_len
= peek_token_bracket (token
, regexp
, syntax
);
3109 *err
= build_range_exp (sbcset
,
3110 #ifdef RE_ENABLE_I18N
3111 mbcset
, &range_alloc
,
3112 #endif /* RE_ENABLE_I18N */
3113 &start_elem
, &end_elem
);
3114 if (BE (*err
!= REG_NOERROR
, 0))
3115 goto parse_bracket_exp_free_return
;
3119 switch (start_elem
.type
)
3122 bitset_set (sbcset
, start_elem
.opr
.ch
);
3124 #ifdef RE_ENABLE_I18N
3126 /* Check whether the array has enough space. */
3127 if (BE (mbchar_alloc
== mbcset
->nmbchars
, 0))
3129 wchar_t *new_mbchars
;
3130 /* Not enough, realloc it. */
3131 /* +1 in case of mbcset->nmbchars is 0. */
3132 mbchar_alloc
= 2 * mbcset
->nmbchars
+ 1;
3133 /* Use realloc since array is NULL if *alloc == 0. */
3134 new_mbchars
= re_realloc (mbcset
->mbchars
, wchar_t,
3136 if (BE (new_mbchars
== NULL
, 0))
3137 goto parse_bracket_exp_espace
;
3138 mbcset
->mbchars
= new_mbchars
;
3140 mbcset
->mbchars
[mbcset
->nmbchars
++] = start_elem
.opr
.wch
;
3142 #endif /* RE_ENABLE_I18N */
3144 *err
= build_equiv_class (sbcset
,
3145 #ifdef RE_ENABLE_I18N
3146 mbcset
, &equiv_class_alloc
,
3147 #endif /* RE_ENABLE_I18N */
3148 start_elem
.opr
.name
);
3149 if (BE (*err
!= REG_NOERROR
, 0))
3150 goto parse_bracket_exp_free_return
;
3153 *err
= build_collating_symbol (sbcset
,
3154 #ifdef RE_ENABLE_I18N
3155 mbcset
, &coll_sym_alloc
,
3156 #endif /* RE_ENABLE_I18N */
3157 start_elem
.opr
.name
);
3158 if (BE (*err
!= REG_NOERROR
, 0))
3159 goto parse_bracket_exp_free_return
;
3162 *err
= build_charclass (regexp
->trans
, sbcset
,
3163 #ifdef RE_ENABLE_I18N
3164 mbcset
, &char_class_alloc
,
3165 #endif /* RE_ENABLE_I18N */
3166 start_elem
.opr
.name
, syntax
);
3167 if (BE (*err
!= REG_NOERROR
, 0))
3168 goto parse_bracket_exp_free_return
;
3175 if (BE (token
->type
== END_OF_RE
, 0))
3178 goto parse_bracket_exp_free_return
;
3180 if (token
->type
== OP_CLOSE_BRACKET
)
3184 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3186 /* If it is non-matching list. */
3187 #ifdef RE_ENABLE_I18N
3188 if (mbcset
->non_match
)
3189 #else /* not RE_ENABLE_I18N */
3191 #endif /* not RE_ENABLE_I18N */
3192 bitset_not (sbcset
);
3193 #ifdef RE_ENABLE_I18N
3194 /* Ensure only single byte characters are set. */
3195 if (dfa
->mb_cur_max
> 1)
3196 bitset_mask (sbcset
, dfa
->sb_char
);
3197 #endif /* RE_ENABLE_I18N */
3199 /* Build a tree for simple bracket. */
3200 br_token
.type
= SIMPLE_BRACKET
;
3201 br_token
.opr
.sbcset
= sbcset
;
3202 work_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3203 if (BE (work_tree
== NULL
, 0))
3204 goto parse_bracket_exp_espace
;
3206 #ifdef RE_ENABLE_I18N
3207 if (mbcset
->nmbchars
|| mbcset
->ncoll_syms
|| mbcset
->nequiv_classes
3208 || mbcset
->nranges
|| (dfa
->mb_cur_max
> 1 && (mbcset
->nchar_classes
3209 || mbcset
->non_match
)))
3211 re_token_t alt_token
;
3212 bin_tree_t
*mbc_tree
;
3214 /* Build a tree for complex bracket. */
3215 dfa
->has_mb_node
= 1;
3216 dfa
->has_plural_match
= 1;
3217 for (sbc_idx
= 0; sbc_idx
< BITSET_UINTS
; ++sbc_idx
)
3218 if (sbcset
[sbc_idx
])
3220 /* If there are no bits set in sbcset, there is no point
3221 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3222 if (sbc_idx
== BITSET_UINTS
)
3225 dfa
->nodes
[work_tree
->node_idx
].type
= COMPLEX_BRACKET
;
3226 dfa
->nodes
[work_tree
->node_idx
].opr
.mbcset
= mbcset
;
3229 br_token
.type
= COMPLEX_BRACKET
;
3230 br_token
.opr
.mbcset
= mbcset
;
3231 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3232 if (BE (mbc_tree
== NULL
, 0))
3233 goto parse_bracket_exp_espace
;
3234 /* Then join them by ALT node. */
3235 alt_token
.type
= OP_ALT
;
3236 work_tree
= re_dfa_add_tree_node (dfa
, work_tree
, mbc_tree
, &alt_token
);
3237 if (BE (mbc_tree
!= NULL
, 1))
3242 free_charset (mbcset
);
3245 #else /* not RE_ENABLE_I18N */
3247 #endif /* not RE_ENABLE_I18N */
3249 parse_bracket_exp_espace
:
3251 parse_bracket_exp_free_return
:
3253 #ifdef RE_ENABLE_I18N
3254 free_charset (mbcset
);
3255 #endif /* RE_ENABLE_I18N */
3259 /* Parse an element in the bracket expression. */
3261 static reg_errcode_t
3262 parse_bracket_element (elem
, regexp
, token
, token_len
, dfa
, syntax
,
3264 bracket_elem_t
*elem
;
3265 re_string_t
*regexp
;
3269 reg_syntax_t syntax
;
3272 #ifdef RE_ENABLE_I18N
3274 cur_char_size
= re_string_char_size_at (regexp
, re_string_cur_idx (regexp
));
3275 if (cur_char_size
> 1)
3277 elem
->type
= MB_CHAR
;
3278 elem
->opr
.wch
= re_string_wchar_at (regexp
, re_string_cur_idx (regexp
));
3279 re_string_skip_bytes (regexp
, cur_char_size
);
3282 #endif /* RE_ENABLE_I18N */
3283 re_string_skip_bytes (regexp
, token_len
); /* Skip a token. */
3284 if (token
->type
== OP_OPEN_COLL_ELEM
|| token
->type
== OP_OPEN_CHAR_CLASS
3285 || token
->type
== OP_OPEN_EQUIV_CLASS
)
3286 return parse_bracket_symbol (elem
, regexp
, token
);
3287 if (BE (token
->type
== OP_CHARSET_RANGE
, 0) && !accept_hyphen
)
3289 /* A '-' must only appear as anything but a range indicator before
3290 the closing bracket. Everything else is an error. */
3292 (void) peek_token_bracket (&token2
, regexp
, syntax
);
3293 if (token2
.type
!= OP_CLOSE_BRACKET
)
3294 /* The actual error value is not standardized since this whole
3295 case is undefined. But ERANGE makes good sense. */
3298 elem
->type
= SB_CHAR
;
3299 elem
->opr
.ch
= token
->opr
.c
;
3303 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3304 such as [:<character_class>:], [.<collating_element>.], and
3305 [=<equivalent_class>=]. */
3307 static reg_errcode_t
3308 parse_bracket_symbol (elem
, regexp
, token
)
3309 bracket_elem_t
*elem
;
3310 re_string_t
*regexp
;
3313 unsigned char ch
, delim
= token
->opr
.c
;
3317 if (re_string_eoi(regexp
) || i
>= BRACKET_NAME_BUF_SIZE
)
3319 if (token
->type
== OP_OPEN_CHAR_CLASS
)
3320 ch
= re_string_fetch_byte_case (regexp
);
3322 ch
= re_string_fetch_byte (regexp
);
3323 if (ch
== delim
&& re_string_peek_byte (regexp
, 0) == ']')
3325 elem
->opr
.name
[i
] = ch
;
3327 re_string_skip_bytes (regexp
, 1);
3328 elem
->opr
.name
[i
] = '\0';
3329 switch (token
->type
)
3331 case OP_OPEN_COLL_ELEM
:
3332 elem
->type
= COLL_SYM
;
3334 case OP_OPEN_EQUIV_CLASS
:
3335 elem
->type
= EQUIV_CLASS
;
3337 case OP_OPEN_CHAR_CLASS
:
3338 elem
->type
= CHAR_CLASS
;
3346 /* Helper function for parse_bracket_exp.
3347 Build the equivalence class which is represented by NAME.
3348 The result are written to MBCSET and SBCSET.
3349 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3350 is a pointer argument sinse we may update it. */
3352 static reg_errcode_t
3353 #ifdef RE_ENABLE_I18N
3354 build_equiv_class (sbcset
, mbcset
, equiv_class_alloc
, name
)
3355 re_charset_t
*mbcset
;
3356 int *equiv_class_alloc
;
3357 #else /* not RE_ENABLE_I18N */
3358 build_equiv_class (sbcset
, name
)
3359 #endif /* not RE_ENABLE_I18N */
3360 re_bitset_ptr_t sbcset
;
3361 const unsigned char *name
;
3363 #if defined _LIBC && defined RE_ENABLE_I18N
3364 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
3367 const int32_t *table
, *indirect
;
3368 const unsigned char *weights
, *extra
, *cp
;
3369 unsigned char char_buf
[2];
3373 /* This #include defines a local function! */
3374 # include <locale/weight.h>
3375 /* Calculate the index for equivalence class. */
3377 table
= (const int32_t *) _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
3378 weights
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3379 _NL_COLLATE_WEIGHTMB
);
3380 extra
= (const unsigned char *) _NL_CURRENT (LC_COLLATE
,
3381 _NL_COLLATE_EXTRAMB
);
3382 indirect
= (const int32_t *) _NL_CURRENT (LC_COLLATE
,
3383 _NL_COLLATE_INDIRECTMB
);
3384 idx1
= findidx (&cp
);
3385 if (BE (idx1
== 0 || cp
< name
+ strlen ((const char *) name
), 0))
3386 /* This isn't a valid character. */
3387 return REG_ECOLLATE
;
3389 /* Build single byte matcing table for this equivalence class. */
3390 char_buf
[1] = (unsigned char) '\0';
3391 len
= weights
[idx1
];
3392 for (ch
= 0; ch
< SBC_MAX
; ++ch
)
3396 idx2
= findidx (&cp
);
3401 /* This isn't a valid character. */
3403 if (len
== weights
[idx2
])
3406 while (cnt
<= len
&&
3407 weights
[idx1
+ 1 + cnt
] == weights
[idx2
+ 1 + cnt
])
3411 bitset_set (sbcset
, ch
);
3414 /* Check whether the array has enough space. */
3415 if (BE (*equiv_class_alloc
== mbcset
->nequiv_classes
, 0))
3417 /* Not enough, realloc it. */
3418 /* +1 in case of mbcset->nequiv_classes is 0. */
3419 int new_equiv_class_alloc
= 2 * mbcset
->nequiv_classes
+ 1;
3420 /* Use realloc since the array is NULL if *alloc == 0. */
3421 int32_t *new_equiv_classes
= re_realloc (mbcset
->equiv_classes
,
3423 new_equiv_class_alloc
);
3424 if (BE (new_equiv_classes
== NULL
, 0))
3426 mbcset
->equiv_classes
= new_equiv_classes
;
3427 *equiv_class_alloc
= new_equiv_class_alloc
;
3429 mbcset
->equiv_classes
[mbcset
->nequiv_classes
++] = idx1
;
3432 #endif /* _LIBC && RE_ENABLE_I18N */
3434 if (BE (strlen ((const char *) name
) != 1, 0))
3435 return REG_ECOLLATE
;
3436 bitset_set (sbcset
, *name
);
3441 /* Helper function for parse_bracket_exp.
3442 Build the character class which is represented by NAME.
3443 The result are written to MBCSET and SBCSET.
3444 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3445 is a pointer argument sinse we may update it. */
3447 static reg_errcode_t
3448 #ifdef RE_ENABLE_I18N
3449 build_charclass (trans
, sbcset
, mbcset
, char_class_alloc
, class_name
, syntax
)
3450 re_charset_t
*mbcset
;
3451 int *char_class_alloc
;
3452 #else /* not RE_ENABLE_I18N */
3453 build_charclass (trans
, sbcset
, class_name
, syntax
)
3454 #endif /* not RE_ENABLE_I18N */
3455 RE_TRANSLATE_TYPE trans
;
3456 re_bitset_ptr_t sbcset
;
3457 const unsigned char *class_name
;
3458 reg_syntax_t syntax
;
3461 const char *name
= (const char *) class_name
;
3463 /* In case of REG_ICASE "upper" and "lower" match the both of
3464 upper and lower cases. */
3465 if ((syntax
& RE_ICASE
)
3466 && (strcmp (name
, "upper") == 0 || strcmp (name
, "lower") == 0))
3469 #ifdef RE_ENABLE_I18N
3470 /* Check the space of the arrays. */
3471 if (BE (*char_class_alloc
== mbcset
->nchar_classes
, 0))
3473 /* Not enough, realloc it. */
3474 /* +1 in case of mbcset->nchar_classes is 0. */
3475 int new_char_class_alloc
= 2 * mbcset
->nchar_classes
+ 1;
3476 /* Use realloc since array is NULL if *alloc == 0. */
3477 wctype_t *new_char_classes
= re_realloc (mbcset
->char_classes
, wctype_t,
3478 new_char_class_alloc
);
3479 if (BE (new_char_classes
== NULL
, 0))
3481 mbcset
->char_classes
= new_char_classes
;
3482 *char_class_alloc
= new_char_class_alloc
;
3484 mbcset
->char_classes
[mbcset
->nchar_classes
++] = __wctype (name
);
3485 #endif /* RE_ENABLE_I18N */
3487 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3488 for (i = 0; i < SBC_MAX; ++i) \
3490 if (ctype_func (i)) \
3492 int ch = trans ? trans[i] : i; \
3493 bitset_set (sbcset, ch); \
3497 if (strcmp (name
, "alnum") == 0)
3498 BUILD_CHARCLASS_LOOP (isalnum
)
3499 else if (strcmp (name
, "cntrl") == 0)
3500 BUILD_CHARCLASS_LOOP (iscntrl
)
3501 else if (strcmp (name
, "lower") == 0)
3502 BUILD_CHARCLASS_LOOP (islower
)
3503 else if (strcmp (name
, "space") == 0)
3504 BUILD_CHARCLASS_LOOP (isspace
)
3505 else if (strcmp (name
, "alpha") == 0)
3506 BUILD_CHARCLASS_LOOP (isalpha
)
3507 else if (strcmp (name
, "digit") == 0)
3508 BUILD_CHARCLASS_LOOP (isdigit
)
3509 else if (strcmp (name
, "print") == 0)
3510 BUILD_CHARCLASS_LOOP (isprint
)
3511 else if (strcmp (name
, "upper") == 0)
3512 BUILD_CHARCLASS_LOOP (isupper
)
3513 else if (strcmp (name
, "blank") == 0)
3514 BUILD_CHARCLASS_LOOP (isblank
)
3515 else if (strcmp (name
, "graph") == 0)
3516 BUILD_CHARCLASS_LOOP (isgraph
)
3517 else if (strcmp (name
, "punct") == 0)
3518 BUILD_CHARCLASS_LOOP (ispunct
)
3519 else if (strcmp (name
, "xdigit") == 0)
3520 BUILD_CHARCLASS_LOOP (isxdigit
)
3528 build_charclass_op (dfa
, trans
, class_name
, extra
, not, err
)
3530 RE_TRANSLATE_TYPE trans
;
3531 const unsigned char *class_name
;
3532 const unsigned char *extra
;
3536 re_bitset_ptr_t sbcset
;
3537 #ifdef RE_ENABLE_I18N
3538 re_charset_t
*mbcset
;
3540 #else /* not RE_ENABLE_I18N */
3542 #endif /* not RE_ENABLE_I18N */
3544 re_token_t br_token
;
3547 sbcset
= (re_bitset_ptr_t
) calloc (sizeof (unsigned int), BITSET_UINTS
);
3548 #ifdef RE_ENABLE_I18N
3549 mbcset
= (re_charset_t
*) calloc (sizeof (re_charset_t
), 1);
3550 #endif /* RE_ENABLE_I18N */
3552 #ifdef RE_ENABLE_I18N
3553 if (BE (sbcset
== NULL
|| mbcset
== NULL
, 0))
3554 #else /* not RE_ENABLE_I18N */
3555 if (BE (sbcset
== NULL
, 0))
3556 #endif /* not RE_ENABLE_I18N */
3564 #ifdef RE_ENABLE_I18N
3566 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3567 bitset_set(cset->sbcset, '\0');
3569 mbcset
->non_match
= 1;
3570 #else /* not RE_ENABLE_I18N */
3572 #endif /* not RE_ENABLE_I18N */
3575 /* We don't care the syntax in this case. */
3576 ret
= build_charclass (trans
, sbcset
,
3577 #ifdef RE_ENABLE_I18N
3579 #endif /* RE_ENABLE_I18N */
3582 if (BE (ret
!= REG_NOERROR
, 0))
3585 #ifdef RE_ENABLE_I18N
3586 free_charset (mbcset
);
3587 #endif /* RE_ENABLE_I18N */
3591 /* \w match '_' also. */
3592 for (; *extra
; extra
++)
3593 bitset_set (sbcset
, *extra
);
3595 /* If it is non-matching list. */
3596 #ifdef RE_ENABLE_I18N
3597 if (mbcset
->non_match
)
3598 #else /* not RE_ENABLE_I18N */
3600 #endif /* not RE_ENABLE_I18N */
3601 bitset_not (sbcset
);
3603 #ifdef RE_ENABLE_I18N
3604 /* Ensure only single byte characters are set. */
3605 if (dfa
->mb_cur_max
> 1)
3606 bitset_mask (sbcset
, dfa
->sb_char
);
3609 /* Build a tree for simple bracket. */
3610 br_token
.type
= SIMPLE_BRACKET
;
3611 br_token
.opr
.sbcset
= sbcset
;
3612 tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3613 if (BE (tree
== NULL
, 0))
3614 goto build_word_op_espace
;
3616 #ifdef RE_ENABLE_I18N
3617 if (dfa
->mb_cur_max
> 1)
3619 re_token_t alt_token
;
3620 bin_tree_t
*mbc_tree
;
3621 /* Build a tree for complex bracket. */
3622 br_token
.type
= COMPLEX_BRACKET
;
3623 br_token
.opr
.mbcset
= mbcset
;
3624 dfa
->has_mb_node
= 1;
3625 mbc_tree
= re_dfa_add_tree_node (dfa
, NULL
, NULL
, &br_token
);
3626 if (BE (mbc_tree
== NULL
, 0))
3627 goto build_word_op_espace
;
3628 /* Then join them by ALT node. */
3629 alt_token
.type
= OP_ALT
;
3630 tree
= re_dfa_add_tree_node (dfa
, tree
, mbc_tree
, &alt_token
);
3631 if (BE (mbc_tree
!= NULL
, 1))
3636 free_charset (mbcset
);
3639 #else /* not RE_ENABLE_I18N */
3641 #endif /* not RE_ENABLE_I18N */
3643 build_word_op_espace
:
3645 #ifdef RE_ENABLE_I18N
3646 free_charset (mbcset
);
3647 #endif /* RE_ENABLE_I18N */
3652 /* This is intended for the expressions like "a{1,3}".
3653 Fetch a number from `input', and return the number.
3654 Return -1, if the number field is empty like "{,1}".
3655 Return -2, If an error is occured. */
3658 fetch_number (input
, token
, syntax
)
3661 reg_syntax_t syntax
;
3667 fetch_token (token
, input
, syntax
);
3669 if (BE (token
->type
== END_OF_RE
, 0))
3671 if (token
->type
== OP_CLOSE_DUP_NUM
|| c
== ',')
3673 num
= ((token
->type
!= CHARACTER
|| c
< '0' || '9' < c
|| num
== -2)
3674 ? -2 : ((num
== -1) ? c
- '0' : num
* 10 + c
- '0'));
3675 num
= (num
> RE_DUP_MAX
) ? -2 : num
;
3680 #ifdef RE_ENABLE_I18N
3682 free_charset (re_charset_t
*cset
)
3684 re_free (cset
->mbchars
);
3686 re_free (cset
->coll_syms
);
3687 re_free (cset
->equiv_classes
);
3688 re_free (cset
->range_starts
);
3689 re_free (cset
->range_ends
);
3691 re_free (cset
->char_classes
);
3694 #endif /* RE_ENABLE_I18N */
3696 /* Functions for binary tree operation. */
3698 /* Create a tree node. */
3701 create_tree (dfa
, left
, right
, type
, index
)
3705 re_token_type_t type
;
3709 if (BE (dfa
->str_tree_storage_idx
== BIN_TREE_STORAGE_SIZE
, 0))
3711 bin_tree_storage_t
*storage
= re_malloc (bin_tree_storage_t
, 1);
3713 if (storage
== NULL
)
3715 storage
->next
= dfa
->str_tree_storage
;
3716 dfa
->str_tree_storage
= storage
;
3717 dfa
->str_tree_storage_idx
= 0;
3719 tree
= &dfa
->str_tree_storage
->data
[dfa
->str_tree_storage_idx
++];
3721 tree
->parent
= NULL
;
3723 tree
->right
= right
;
3725 tree
->node_idx
= index
;
3728 re_node_set_init_empty (&tree
->eclosure
);
3731 left
->parent
= tree
;
3733 right
->parent
= tree
;
3737 /* Create both a DFA node and a tree for it. */
3740 re_dfa_add_tree_node (dfa
, left
, right
, token
)
3744 const re_token_t
*token
;
3746 int new_idx
= re_dfa_add_node (dfa
, *token
, 0);
3751 return create_tree (dfa
, left
, right
, 0, new_idx
);
3755 /* Duplicate the node SRC, and return new node. */
3758 duplicate_tree (src
, dfa
)
3759 const bin_tree_t
*src
;
3762 bin_tree_t
*left
= NULL
, *right
= NULL
, *new_tree
;
3764 /* Since node indies must be according to Post-order of the tree,
3765 we must duplicate the left at first. */
3766 if (src
->left
!= NULL
)
3768 left
= duplicate_tree (src
->left
, dfa
);
3773 /* Secondaly, duplicate the right. */
3774 if (src
->right
!= NULL
)
3776 right
= duplicate_tree (src
->right
, dfa
);
3781 /* At last, duplicate itself. */
3782 if (src
->type
== NON_TYPE
)
3784 new_node_idx
= re_dfa_add_node (dfa
, dfa
->nodes
[src
->node_idx
], 0);
3785 dfa
->nodes
[new_node_idx
].duplicated
= 1;
3786 if (BE (new_node_idx
== -1, 0))
3790 new_node_idx
= src
->type
;
3792 new_tree
= create_tree (dfa
, left
, right
, src
->type
, new_node_idx
);