]>
Commit | Line | Data |
---|---|---|
3b0bdc72 | 1 | /* Extended regular expression matching and search library. |
ca3c505e | 2 | Copyright (C) 2002, 2003 Free Software Foundation, Inc. |
3b0bdc72 UD |
3 | This file is part of the GNU C Library. |
4 | Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
17 | License along with the GNU C Library; if not, write to the Free | |
18 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
19 | 02111-1307 USA. */ | |
20 | ||
3b0bdc72 | 21 | static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, |
15a7d175 | 22 | int length, reg_syntax_t syntax); |
3b0bdc72 | 23 | static void re_compile_fastmap_iter (regex_t *bufp, |
15a7d175 UD |
24 | const re_dfastate_t *init_state, |
25 | char *fastmap); | |
3b0bdc72 | 26 | static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len); |
a9388965 | 27 | static reg_errcode_t init_word_char (re_dfa_t *dfa); |
c0a0f9a3 | 28 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 29 | static void free_charset (re_charset_t *cset); |
c0a0f9a3 | 30 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 UD |
31 | static void free_workarea_compile (regex_t *preg); |
32 | static reg_errcode_t create_initial_state (re_dfa_t *dfa); | |
14744156 UD |
33 | #ifdef RE_ENABLE_I18N |
34 | static void optimize_utf8 (re_dfa_t *dfa); | |
35 | #endif | |
3b0bdc72 UD |
36 | static reg_errcode_t analyze (re_dfa_t *dfa); |
37 | static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node); | |
38 | static void calc_first (re_dfa_t *dfa, bin_tree_t *node); | |
39 | static void calc_next (re_dfa_t *dfa, bin_tree_t *node); | |
40 | static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node); | |
485d775d | 41 | static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node, |
15a7d175 UD |
42 | int top_clone_node, int root_node, |
43 | unsigned int constraint); | |
a9388965 | 44 | static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx, |
15a7d175 | 45 | unsigned int constraint); |
a7d5c291 UD |
46 | static int search_duplicated_node (re_dfa_t *dfa, int org_node, |
47 | unsigned int constraint); | |
3b0bdc72 | 48 | static reg_errcode_t calc_eclosure (re_dfa_t *dfa); |
a9388965 | 49 | static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, |
15a7d175 | 50 | int node, int root); |
3b0bdc72 UD |
51 | static void calc_inveclosure (re_dfa_t *dfa); |
52 | static int fetch_number (re_string_t *input, re_token_t *token, | |
15a7d175 | 53 | reg_syntax_t syntax); |
3b0bdc72 UD |
54 | static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax); |
55 | static int peek_token (re_token_t *token, re_string_t *input, | |
15a7d175 | 56 | reg_syntax_t syntax); |
3b0bdc72 | 57 | static int peek_token_bracket (re_token_t *token, re_string_t *input, |
15a7d175 | 58 | reg_syntax_t syntax); |
3b0bdc72 | 59 | static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, |
15a7d175 | 60 | reg_syntax_t syntax, reg_errcode_t *err); |
3b0bdc72 | 61 | static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, |
15a7d175 UD |
62 | re_token_t *token, reg_syntax_t syntax, |
63 | int nest, reg_errcode_t *err); | |
3b0bdc72 | 64 | static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, |
15a7d175 UD |
65 | re_token_t *token, reg_syntax_t syntax, |
66 | int nest, reg_errcode_t *err); | |
3b0bdc72 | 67 | static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, |
15a7d175 UD |
68 | re_token_t *token, reg_syntax_t syntax, |
69 | int nest, reg_errcode_t *err); | |
3b0bdc72 | 70 | static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, |
15a7d175 UD |
71 | re_token_t *token, reg_syntax_t syntax, |
72 | int nest, reg_errcode_t *err); | |
3b0bdc72 | 73 | static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, |
15a7d175 UD |
74 | re_dfa_t *dfa, re_token_t *token, |
75 | reg_syntax_t syntax, reg_errcode_t *err); | |
3b0bdc72 | 76 | static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, |
15a7d175 UD |
77 | re_token_t *token, reg_syntax_t syntax, |
78 | reg_errcode_t *err); | |
3b0bdc72 | 79 | static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, |
15a7d175 UD |
80 | re_string_t *regexp, |
81 | re_token_t *token, int token_len, | |
82 | re_dfa_t *dfa, | |
83 | reg_syntax_t syntax); | |
3b0bdc72 | 84 | static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, |
15a7d175 UD |
85 | re_string_t *regexp, |
86 | re_token_t *token); | |
434d3784 | 87 | #ifndef _LIBC |
c0a0f9a3 UD |
88 | # ifdef RE_ENABLE_I18N |
89 | static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, | |
15a7d175 UD |
90 | re_charset_t *mbcset, int *range_alloc, |
91 | bracket_elem_t *start_elem, | |
92 | bracket_elem_t *end_elem); | |
c0a0f9a3 | 93 | static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, |
15a7d175 UD |
94 | re_charset_t *mbcset, |
95 | int *coll_sym_alloc, | |
96 | const unsigned char *name); | |
c0a0f9a3 UD |
97 | # else /* not RE_ENABLE_I18N */ |
98 | static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, | |
15a7d175 UD |
99 | bracket_elem_t *start_elem, |
100 | bracket_elem_t *end_elem); | |
c0a0f9a3 | 101 | static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, |
15a7d175 | 102 | const unsigned char *name); |
c0a0f9a3 | 103 | # endif /* not RE_ENABLE_I18N */ |
434d3784 | 104 | #endif /* not _LIBC */ |
c0a0f9a3 UD |
105 | #ifdef RE_ENABLE_I18N |
106 | static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, | |
15a7d175 UD |
107 | re_charset_t *mbcset, |
108 | int *equiv_class_alloc, | |
109 | const unsigned char *name); | |
66b110e8 UD |
110 | static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, |
111 | re_bitset_ptr_t sbcset, | |
15a7d175 UD |
112 | re_charset_t *mbcset, |
113 | int *char_class_alloc, | |
114 | const unsigned char *class_name, | |
115 | reg_syntax_t syntax); | |
c0a0f9a3 UD |
116 | #else /* not RE_ENABLE_I18N */ |
117 | static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, | |
15a7d175 | 118 | const unsigned char *name); |
66b110e8 UD |
119 | static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, |
120 | re_bitset_ptr_t sbcset, | |
15a7d175 UD |
121 | const unsigned char *class_name, |
122 | reg_syntax_t syntax); | |
c0a0f9a3 | 123 | #endif /* not RE_ENABLE_I18N */ |
e2b6bfa3 | 124 | static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, |
06e8303a | 125 | const unsigned char *class_name, |
e2b6bfa3 UD |
126 | const unsigned char *extra, int not, |
127 | reg_errcode_t *err); | |
3b0bdc72 UD |
128 | static void free_bin_tree (bin_tree_t *tree); |
129 | static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right, | |
15a7d175 | 130 | re_token_type_t type, int index); |
3b0bdc72 UD |
131 | static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); |
132 | \f | |
133 | /* This table gives an error message for each of the error codes listed | |
134 | in regex.h. Obviously the order here has to be same as there. | |
135 | POSIX doesn't require that we do anything for REG_NOERROR, | |
136 | but why not be nice? */ | |
137 | ||
6455d255 | 138 | const char __re_error_msgid[] attribute_hidden = |
3b0bdc72 UD |
139 | { |
140 | #define REG_NOERROR_IDX 0 | |
141 | gettext_noop ("Success") /* REG_NOERROR */ | |
142 | "\0" | |
143 | #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") | |
144 | gettext_noop ("No match") /* REG_NOMATCH */ | |
145 | "\0" | |
146 | #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") | |
147 | gettext_noop ("Invalid regular expression") /* REG_BADPAT */ | |
148 | "\0" | |
149 | #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") | |
150 | gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ | |
151 | "\0" | |
152 | #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") | |
153 | gettext_noop ("Invalid character class name") /* REG_ECTYPE */ | |
154 | "\0" | |
155 | #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") | |
156 | gettext_noop ("Trailing backslash") /* REG_EESCAPE */ | |
157 | "\0" | |
158 | #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") | |
159 | gettext_noop ("Invalid back reference") /* REG_ESUBREG */ | |
160 | "\0" | |
161 | #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") | |
162 | gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ | |
163 | "\0" | |
164 | #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") | |
165 | gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ | |
166 | "\0" | |
167 | #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") | |
168 | gettext_noop ("Unmatched \\{") /* REG_EBRACE */ | |
169 | "\0" | |
170 | #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") | |
171 | gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ | |
172 | "\0" | |
173 | #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") | |
174 | gettext_noop ("Invalid range end") /* REG_ERANGE */ | |
175 | "\0" | |
176 | #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") | |
177 | gettext_noop ("Memory exhausted") /* REG_ESPACE */ | |
178 | "\0" | |
179 | #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") | |
180 | gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ | |
181 | "\0" | |
182 | #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") | |
183 | gettext_noop ("Premature end of regular expression") /* REG_EEND */ | |
184 | "\0" | |
185 | #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") | |
186 | gettext_noop ("Regular expression too big") /* REG_ESIZE */ | |
187 | "\0" | |
188 | #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") | |
189 | gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ | |
190 | }; | |
191 | ||
6455d255 | 192 | const size_t __re_error_msgid_idx[] attribute_hidden = |
3b0bdc72 UD |
193 | { |
194 | REG_NOERROR_IDX, | |
195 | REG_NOMATCH_IDX, | |
196 | REG_BADPAT_IDX, | |
197 | REG_ECOLLATE_IDX, | |
198 | REG_ECTYPE_IDX, | |
199 | REG_EESCAPE_IDX, | |
200 | REG_ESUBREG_IDX, | |
201 | REG_EBRACK_IDX, | |
202 | REG_EPAREN_IDX, | |
203 | REG_EBRACE_IDX, | |
204 | REG_BADBR_IDX, | |
205 | REG_ERANGE_IDX, | |
206 | REG_ESPACE_IDX, | |
207 | REG_BADRPT_IDX, | |
208 | REG_EEND_IDX, | |
209 | REG_ESIZE_IDX, | |
210 | REG_ERPAREN_IDX | |
211 | }; | |
212 | \f | |
213 | /* Entry points for GNU code. */ | |
214 | ||
215 | /* re_compile_pattern is the GNU regular expression compiler: it | |
ac3d553b | 216 | compiles PATTERN (of length LENGTH) and puts the result in BUFP. |
3b0bdc72 UD |
217 | Returns 0 if the pattern was valid, otherwise an error string. |
218 | ||
219 | Assumes the `allocated' (and perhaps `buffer') and `translate' fields | |
220 | are set in BUFP on entry. */ | |
221 | ||
222 | const char * | |
223 | re_compile_pattern (pattern, length, bufp) | |
224 | const char *pattern; | |
225 | size_t length; | |
226 | struct re_pattern_buffer *bufp; | |
227 | { | |
228 | reg_errcode_t ret; | |
229 | ||
3b0bdc72 UD |
230 | /* And GNU code determines whether or not to get register information |
231 | by passing null for the REGS argument to re_match, etc., not by | |
232 | setting no_sub. */ | |
233 | bufp->no_sub = 0; | |
234 | ||
235 | /* Match anchors at newline. */ | |
236 | bufp->newline_anchor = 1; | |
237 | ||
75e4a282 | 238 | ret = re_compile_internal (bufp, pattern, length, re_syntax_options); |
3b0bdc72 UD |
239 | |
240 | if (!ret) | |
241 | return NULL; | |
6455d255 | 242 | return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); |
3b0bdc72 UD |
243 | } |
244 | #ifdef _LIBC | |
245 | weak_alias (__re_compile_pattern, re_compile_pattern) | |
246 | #endif | |
247 | ||
248 | /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can | |
249 | also be assigned to arbitrarily: each pattern buffer stores its own | |
250 | syntax, so it can be changed between regex compilations. */ | |
251 | /* This has no initializer because initialized variables in Emacs | |
252 | become read-only after dumping. */ | |
253 | reg_syntax_t re_syntax_options; | |
254 | ||
255 | ||
256 | /* Specify the precise syntax of regexps for compilation. This provides | |
257 | for compatibility for various utilities which historically have | |
258 | different, incompatible syntaxes. | |
259 | ||
260 | The argument SYNTAX is a bit mask comprised of the various bits | |
261 | defined in regex.h. We return the old syntax. */ | |
262 | ||
263 | reg_syntax_t | |
264 | re_set_syntax (syntax) | |
265 | reg_syntax_t syntax; | |
266 | { | |
267 | reg_syntax_t ret = re_syntax_options; | |
268 | ||
269 | re_syntax_options = syntax; | |
270 | return ret; | |
271 | } | |
272 | #ifdef _LIBC | |
273 | weak_alias (__re_set_syntax, re_set_syntax) | |
274 | #endif | |
275 | ||
276 | int | |
277 | re_compile_fastmap (bufp) | |
278 | struct re_pattern_buffer *bufp; | |
279 | { | |
280 | re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; | |
281 | char *fastmap = bufp->fastmap; | |
282 | ||
283 | memset (fastmap, '\0', sizeof (char) * SBC_MAX); | |
284 | re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); | |
285 | if (dfa->init_state != dfa->init_state_word) | |
286 | re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); | |
287 | if (dfa->init_state != dfa->init_state_nl) | |
288 | re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); | |
289 | if (dfa->init_state != dfa->init_state_begbuf) | |
290 | re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); | |
291 | bufp->fastmap_accurate = 1; | |
292 | return 0; | |
293 | } | |
294 | #ifdef _LIBC | |
295 | weak_alias (__re_compile_fastmap, re_compile_fastmap) | |
296 | #endif | |
297 | ||
1b2c2628 | 298 | static inline void |
25337753 | 299 | __attribute ((always_inline)) |
1b2c2628 UD |
300 | re_set_fastmap (char *fastmap, int icase, int ch) |
301 | { | |
302 | fastmap[ch] = 1; | |
303 | if (icase) | |
304 | fastmap[tolower (ch)] = 1; | |
305 | } | |
306 | ||
3b0bdc72 UD |
307 | /* Helper function for re_compile_fastmap. |
308 | Compile fastmap for the initial_state INIT_STATE. */ | |
309 | ||
310 | static void | |
311 | re_compile_fastmap_iter (bufp, init_state, fastmap) | |
312 | regex_t *bufp; | |
313 | const re_dfastate_t *init_state; | |
314 | char *fastmap; | |
315 | { | |
316 | re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; | |
317 | int node_cnt; | |
3c0fb574 | 318 | int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); |
3b0bdc72 UD |
319 | for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) |
320 | { | |
321 | int node = init_state->nodes.elems[node_cnt]; | |
322 | re_token_type_t type = dfa->nodes[node].type; | |
3b0bdc72 UD |
323 | |
324 | if (type == CHARACTER) | |
74e12fbc UD |
325 | { |
326 | re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); | |
327 | #ifdef RE_ENABLE_I18N | |
14744156 | 328 | if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) |
74e12fbc | 329 | { |
3c0fb574 | 330 | unsigned char *buf = alloca (dfa->mb_cur_max), *p; |
74e12fbc UD |
331 | wchar_t wc; |
332 | mbstate_t state; | |
333 | ||
334 | p = buf; | |
335 | *p++ = dfa->nodes[node].opr.c; | |
336 | while (++node < dfa->nodes_len | |
337 | && dfa->nodes[node].type == CHARACTER | |
338 | && dfa->nodes[node].mb_partial) | |
339 | *p++ = dfa->nodes[node].opr.c; | |
340 | memset (&state, 0, sizeof (state)); | |
341 | if (mbrtowc (&wc, (const char *) buf, p - buf, | |
342 | &state) == p - buf | |
343 | && __wcrtomb ((char *) buf, towlower (wc), &state) > 0) | |
344 | re_set_fastmap (fastmap, 0, buf[0]); | |
345 | } | |
346 | #endif | |
347 | } | |
3b0bdc72 | 348 | else if (type == SIMPLE_BRACKET) |
15a7d175 UD |
349 | { |
350 | int i, j, ch; | |
351 | for (i = 0, ch = 0; i < BITSET_UINTS; ++i) | |
352 | for (j = 0; j < UINT_BITS; ++j, ++ch) | |
353 | if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) | |
354 | re_set_fastmap (fastmap, icase, ch); | |
355 | } | |
c0a0f9a3 | 356 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 357 | else if (type == COMPLEX_BRACKET) |
15a7d175 UD |
358 | { |
359 | int i; | |
360 | re_charset_t *cset = dfa->nodes[node].opr.mbcset; | |
361 | if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes | |
362 | || cset->nranges || cset->nchar_classes) | |
363 | { | |
c0a0f9a3 | 364 | # ifdef _LIBC |
15a7d175 UD |
365 | if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) |
366 | { | |
367 | /* In this case we want to catch the bytes which are | |
368 | the first byte of any collation elements. | |
369 | e.g. In da_DK, we want to catch 'a' since "aa" | |
370 | is a valid collation element, and don't catch | |
371 | 'b' since 'b' is the only collation element | |
372 | which starts from 'b'. */ | |
373 | int j, ch; | |
374 | const int32_t *table = (const int32_t *) | |
375 | _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); | |
376 | for (i = 0, ch = 0; i < BITSET_UINTS; ++i) | |
377 | for (j = 0; j < UINT_BITS; ++j, ++ch) | |
378 | if (table[ch] < 0) | |
379 | re_set_fastmap (fastmap, icase, ch); | |
380 | } | |
c0a0f9a3 | 381 | # else |
3c0fb574 | 382 | if (dfa->mb_cur_max > 1) |
15a7d175 UD |
383 | for (i = 0; i < SBC_MAX; ++i) |
384 | if (__btowc (i) == WEOF) | |
385 | re_set_fastmap (fastmap, icase, i); | |
c0a0f9a3 | 386 | # endif /* not _LIBC */ |
15a7d175 UD |
387 | } |
388 | for (i = 0; i < cset->nmbchars; ++i) | |
389 | { | |
390 | char buf[256]; | |
391 | mbstate_t state; | |
392 | memset (&state, '\0', sizeof (state)); | |
393 | __wcrtomb (buf, cset->mbchars[i], &state); | |
394 | re_set_fastmap (fastmap, icase, *(unsigned char *) buf); | |
14744156 | 395 | if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) |
74e12fbc UD |
396 | { |
397 | __wcrtomb (buf, towlower (cset->mbchars[i]), &state); | |
398 | re_set_fastmap (fastmap, 0, *(unsigned char *) buf); | |
399 | } | |
15a7d175 UD |
400 | } |
401 | } | |
c0a0f9a3 | 402 | #endif /* RE_ENABLE_I18N */ |
1b2c2628 | 403 | else if (type == END_OF_RE || type == OP_PERIOD) |
15a7d175 UD |
404 | { |
405 | memset (fastmap, '\1', sizeof (char) * SBC_MAX); | |
406 | if (type == END_OF_RE) | |
407 | bufp->can_be_null = 1; | |
408 | return; | |
409 | } | |
3b0bdc72 UD |
410 | } |
411 | } | |
412 | \f | |
413 | /* Entry point for POSIX code. */ | |
414 | /* regcomp takes a regular expression as a string and compiles it. | |
415 | ||
416 | PREG is a regex_t *. We do not expect any fields to be initialized, | |
417 | since POSIX says we shouldn't. Thus, we set | |
418 | ||
419 | `buffer' to the compiled pattern; | |
420 | `used' to the length of the compiled pattern; | |
421 | `syntax' to RE_SYNTAX_POSIX_EXTENDED if the | |
422 | REG_EXTENDED bit in CFLAGS is set; otherwise, to | |
423 | RE_SYNTAX_POSIX_BASIC; | |
424 | `newline_anchor' to REG_NEWLINE being set in CFLAGS; | |
425 | `fastmap' to an allocated space for the fastmap; | |
426 | `fastmap_accurate' to zero; | |
427 | `re_nsub' to the number of subexpressions in PATTERN. | |
428 | ||
429 | PATTERN is the address of the pattern string. | |
430 | ||
431 | CFLAGS is a series of bits which affect compilation. | |
432 | ||
433 | If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we | |
434 | use POSIX basic syntax. | |
435 | ||
436 | If REG_NEWLINE is set, then . and [^...] don't match newline. | |
437 | Also, regexec will try a match beginning after every newline. | |
438 | ||
439 | If REG_ICASE is set, then we considers upper- and lowercase | |
440 | versions of letters to be equivalent when matching. | |
441 | ||
442 | If REG_NOSUB is set, then when PREG is passed to regexec, that | |
443 | routine will report only success or failure, and nothing about the | |
444 | registers. | |
445 | ||
446 | It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for | |
447 | the return codes and their meanings.) */ | |
448 | ||
449 | int | |
450 | regcomp (preg, pattern, cflags) | |
75e4a282 UD |
451 | regex_t *__restrict preg; |
452 | const char *__restrict pattern; | |
3b0bdc72 UD |
453 | int cflags; |
454 | { | |
455 | reg_errcode_t ret; | |
456 | reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED | |
15a7d175 | 457 | : RE_SYNTAX_POSIX_BASIC); |
3b0bdc72 UD |
458 | |
459 | preg->buffer = NULL; | |
460 | preg->allocated = 0; | |
461 | preg->used = 0; | |
462 | ||
463 | /* Try to allocate space for the fastmap. */ | |
464 | preg->fastmap = re_malloc (char, SBC_MAX); | |
bc15410e | 465 | if (BE (preg->fastmap == NULL, 0)) |
3b0bdc72 UD |
466 | return REG_ESPACE; |
467 | ||
468 | syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; | |
469 | ||
470 | /* If REG_NEWLINE is set, newlines are treated differently. */ | |
471 | if (cflags & REG_NEWLINE) | |
472 | { /* REG_NEWLINE implies neither . nor [^...] match newline. */ | |
473 | syntax &= ~RE_DOT_NEWLINE; | |
474 | syntax |= RE_HAT_LISTS_NOT_NEWLINE; | |
475 | /* It also changes the matching behavior. */ | |
476 | preg->newline_anchor = 1; | |
477 | } | |
478 | else | |
479 | preg->newline_anchor = 0; | |
480 | preg->no_sub = !!(cflags & REG_NOSUB); | |
481 | preg->translate = NULL; | |
482 | ||
483 | ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); | |
484 | ||
485 | /* POSIX doesn't distinguish between an unmatched open-group and an | |
486 | unmatched close-group: both are REG_EPAREN. */ | |
487 | if (ret == REG_ERPAREN) | |
488 | ret = REG_EPAREN; | |
489 | ||
a9388965 | 490 | /* We have already checked preg->fastmap != NULL. */ |
bc15410e | 491 | if (BE (ret == REG_NOERROR, 1)) |
71ccd330 UD |
492 | /* Compute the fastmap now, since regexec cannot modify the pattern |
493 | buffer. This function nevers fails in this implementation. */ | |
83b038f2 | 494 | (void) re_compile_fastmap (preg); |
71ccd330 | 495 | else |
3b0bdc72 | 496 | { |
71ccd330 UD |
497 | /* Some error occurred while compiling the expression. */ |
498 | re_free (preg->fastmap); | |
499 | preg->fastmap = NULL; | |
3b0bdc72 UD |
500 | } |
501 | ||
502 | return (int) ret; | |
503 | } | |
504 | #ifdef _LIBC | |
505 | weak_alias (__regcomp, regcomp) | |
506 | #endif | |
507 | ||
508 | /* Returns a message corresponding to an error code, ERRCODE, returned | |
509 | from either regcomp or regexec. We don't use PREG here. */ | |
510 | ||
511 | size_t | |
512 | regerror (errcode, preg, errbuf, errbuf_size) | |
513 | int errcode; | |
514 | const regex_t *preg; | |
515 | char *errbuf; | |
516 | size_t errbuf_size; | |
517 | { | |
518 | const char *msg; | |
519 | size_t msg_size; | |
520 | ||
bc15410e | 521 | if (BE (errcode < 0 |
15a7d175 UD |
522 | || errcode >= (int) (sizeof (__re_error_msgid_idx) |
523 | / sizeof (__re_error_msgid_idx[0])), 0)) | |
3b0bdc72 UD |
524 | /* Only error codes returned by the rest of the code should be passed |
525 | to this routine. If we are given anything else, or if other regex | |
526 | code generates an invalid error code, then the program has a bug. | |
527 | Dump core so we can fix it. */ | |
528 | abort (); | |
529 | ||
6455d255 | 530 | msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); |
3b0bdc72 UD |
531 | |
532 | msg_size = strlen (msg) + 1; /* Includes the null. */ | |
533 | ||
bc15410e | 534 | if (BE (errbuf_size != 0, 1)) |
3b0bdc72 | 535 | { |
bc15410e | 536 | if (BE (msg_size > errbuf_size, 0)) |
15a7d175 | 537 | { |
3b0bdc72 UD |
538 | #if defined HAVE_MEMPCPY || defined _LIBC |
539 | *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; | |
540 | #else | |
15a7d175 UD |
541 | memcpy (errbuf, msg, errbuf_size - 1); |
542 | errbuf[errbuf_size - 1] = 0; | |
3b0bdc72 | 543 | #endif |
15a7d175 | 544 | } |
3b0bdc72 | 545 | else |
15a7d175 | 546 | memcpy (errbuf, msg, msg_size); |
3b0bdc72 UD |
547 | } |
548 | ||
549 | return msg_size; | |
550 | } | |
551 | #ifdef _LIBC | |
552 | weak_alias (__regerror, regerror) | |
553 | #endif | |
554 | ||
3b0bdc72 | 555 | |
71ccd330 UD |
556 | static void |
557 | free_dfa_content (re_dfa_t *dfa) | |
3b0bdc72 UD |
558 | { |
559 | int i, j; | |
3b0bdc72 | 560 | |
71ccd330 UD |
561 | re_free (dfa->subexps); |
562 | ||
563 | for (i = 0; i < dfa->nodes_len; ++i) | |
564 | { | |
565 | re_token_t *node = dfa->nodes + i; | |
c0a0f9a3 | 566 | #ifdef RE_ENABLE_I18N |
71ccd330 UD |
567 | if (node->type == COMPLEX_BRACKET && node->duplicated == 0) |
568 | free_charset (node->opr.mbcset); | |
569 | else | |
c0a0f9a3 | 570 | #endif /* RE_ENABLE_I18N */ |
71ccd330 UD |
571 | if (node->type == SIMPLE_BRACKET && node->duplicated == 0) |
572 | re_free (node->opr.sbcset); | |
573 | } | |
574 | re_free (dfa->nexts); | |
575 | for (i = 0; i < dfa->nodes_len; ++i) | |
576 | { | |
577 | if (dfa->eclosures != NULL) | |
578 | re_node_set_free (dfa->eclosures + i); | |
579 | if (dfa->inveclosures != NULL) | |
580 | re_node_set_free (dfa->inveclosures + i); | |
581 | if (dfa->edests != NULL) | |
582 | re_node_set_free (dfa->edests + i); | |
583 | } | |
584 | re_free (dfa->edests); | |
585 | re_free (dfa->eclosures); | |
586 | re_free (dfa->inveclosures); | |
587 | re_free (dfa->nodes); | |
3b0bdc72 | 588 | |
71ccd330 UD |
589 | for (i = 0; i <= dfa->state_hash_mask; ++i) |
590 | { | |
591 | struct re_state_table_entry *entry = dfa->state_table + i; | |
592 | for (j = 0; j < entry->num; ++j) | |
593 | { | |
594 | re_dfastate_t *state = entry->array[j]; | |
b14993a2 | 595 | free_state (state); |
71ccd330 UD |
596 | } |
597 | re_free (entry->array); | |
598 | } | |
599 | re_free (dfa->state_table); | |
3b0bdc72 | 600 | |
71ccd330 UD |
601 | if (dfa->word_char != NULL) |
602 | re_free (dfa->word_char); | |
0742e48e | 603 | #ifdef DEBUG |
71ccd330 | 604 | re_free (dfa->re_str); |
0742e48e | 605 | #endif |
71ccd330 UD |
606 | |
607 | re_free (dfa); | |
608 | } | |
609 | ||
610 | ||
611 | /* Free dynamically allocated space used by PREG. */ | |
612 | ||
613 | void | |
614 | regfree (preg) | |
615 | regex_t *preg; | |
616 | { | |
617 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | |
618 | if (BE (dfa != NULL, 1)) | |
619 | free_dfa_content (dfa); | |
620 | ||
3b0bdc72 UD |
621 | re_free (preg->fastmap); |
622 | } | |
623 | #ifdef _LIBC | |
624 | weak_alias (__regfree, regfree) | |
625 | #endif | |
626 | \f | |
627 | /* Entry points compatible with 4.2 BSD regex library. We don't define | |
628 | them unless specifically requested. */ | |
629 | ||
630 | #if defined _REGEX_RE_COMP || defined _LIBC | |
631 | ||
632 | /* BSD has one and only one pattern buffer. */ | |
633 | static struct re_pattern_buffer re_comp_buf; | |
634 | ||
635 | char * | |
636 | # ifdef _LIBC | |
637 | /* Make these definitions weak in libc, so POSIX programs can redefine | |
638 | these names if they don't use our functions, and still use | |
639 | regcomp/regexec above without link errors. */ | |
640 | weak_function | |
641 | # endif | |
642 | re_comp (s) | |
643 | const char *s; | |
644 | { | |
645 | reg_errcode_t ret; | |
240e87c2 | 646 | char *fastmap; |
3b0bdc72 UD |
647 | |
648 | if (!s) | |
649 | { | |
650 | if (!re_comp_buf.buffer) | |
651 | return gettext ("No previous regular expression"); | |
652 | return 0; | |
653 | } | |
654 | ||
240e87c2 RM |
655 | if (re_comp_buf.buffer) |
656 | { | |
657 | fastmap = re_comp_buf.fastmap; | |
658 | re_comp_buf.fastmap = NULL; | |
659 | __regfree (&re_comp_buf); | |
1b2c2628 | 660 | memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); |
240e87c2 RM |
661 | re_comp_buf.fastmap = fastmap; |
662 | } | |
663 | ||
664 | if (re_comp_buf.fastmap == NULL) | |
3b0bdc72 UD |
665 | { |
666 | re_comp_buf.fastmap = (char *) malloc (SBC_MAX); | |
667 | if (re_comp_buf.fastmap == NULL) | |
6455d255 UD |
668 | return (char *) gettext (__re_error_msgid |
669 | + __re_error_msgid_idx[(int) REG_ESPACE]); | |
3b0bdc72 UD |
670 | } |
671 | ||
672 | /* Since `re_exec' always passes NULL for the `regs' argument, we | |
673 | don't need to initialize the pattern buffer fields which affect it. */ | |
674 | ||
675 | /* Match anchors at newlines. */ | |
676 | re_comp_buf.newline_anchor = 1; | |
677 | ||
678 | ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); | |
679 | ||
680 | if (!ret) | |
681 | return NULL; | |
682 | ||
683 | /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ | |
6455d255 | 684 | return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); |
3b0bdc72 | 685 | } |
240e87c2 RM |
686 | |
687 | #ifdef _LIBC | |
c877418f | 688 | libc_freeres_fn (free_mem) |
240e87c2 RM |
689 | { |
690 | __regfree (&re_comp_buf); | |
691 | } | |
240e87c2 RM |
692 | #endif |
693 | ||
3b0bdc72 UD |
694 | #endif /* _REGEX_RE_COMP */ |
695 | \f | |
696 | /* Internal entry point. | |
697 | Compile the regular expression PATTERN, whose length is LENGTH. | |
698 | SYNTAX indicate regular expression's syntax. */ | |
699 | ||
700 | static reg_errcode_t | |
701 | re_compile_internal (preg, pattern, length, syntax) | |
702 | regex_t *preg; | |
703 | const char * pattern; | |
704 | int length; | |
705 | reg_syntax_t syntax; | |
706 | { | |
707 | reg_errcode_t err = REG_NOERROR; | |
708 | re_dfa_t *dfa; | |
709 | re_string_t regexp; | |
710 | ||
711 | /* Initialize the pattern buffer. */ | |
712 | preg->fastmap_accurate = 0; | |
713 | preg->syntax = syntax; | |
714 | preg->not_bol = preg->not_eol = 0; | |
715 | preg->used = 0; | |
716 | preg->re_nsub = 0; | |
1b2c2628 UD |
717 | preg->can_be_null = 0; |
718 | preg->regs_allocated = REGS_UNALLOCATED; | |
3b0bdc72 UD |
719 | |
720 | /* Initialize the dfa. */ | |
721 | dfa = (re_dfa_t *) preg->buffer; | |
722 | if (preg->allocated < sizeof (re_dfa_t)) | |
723 | { | |
724 | /* If zero allocated, but buffer is non-null, try to realloc | |
725 | enough space. This loses if buffer's address is bogus, but | |
726 | that is the user's responsibility. If ->buffer is NULL this | |
727 | is a simple allocation. */ | |
728 | dfa = re_realloc (preg->buffer, re_dfa_t, 1); | |
729 | if (dfa == NULL) | |
730 | return REG_ESPACE; | |
3b0bdc72 UD |
731 | preg->allocated = sizeof (re_dfa_t); |
732 | } | |
733 | preg->buffer = (unsigned char *) dfa; | |
734 | preg->used = sizeof (re_dfa_t); | |
735 | ||
736 | err = init_dfa (dfa, length); | |
bc15410e | 737 | if (BE (err != REG_NOERROR, 0)) |
3b0bdc72 UD |
738 | { |
739 | re_free (dfa); | |
740 | preg->buffer = NULL; | |
ca3c505e | 741 | preg->allocated = 0; |
3b0bdc72 UD |
742 | return err; |
743 | } | |
0742e48e UD |
744 | #ifdef DEBUG |
745 | dfa->re_str = re_malloc (char, length + 1); | |
746 | strncpy (dfa->re_str, pattern, length + 1); | |
747 | #endif | |
3b0bdc72 | 748 | |
612546c6 | 749 | err = re_string_construct (®exp, pattern, length, preg->translate, |
3c0fb574 UD |
750 | syntax & RE_ICASE, dfa->mb_cur_max, |
751 | dfa->is_utf8); | |
bc15410e | 752 | if (BE (err != REG_NOERROR, 0)) |
3b0bdc72 UD |
753 | { |
754 | re_free (dfa); | |
755 | preg->buffer = NULL; | |
ca3c505e | 756 | preg->allocated = 0; |
3b0bdc72 UD |
757 | return err; |
758 | } | |
759 | ||
760 | /* Parse the regular expression, and build a structure tree. */ | |
761 | preg->re_nsub = 0; | |
762 | dfa->str_tree = parse (®exp, preg, syntax, &err); | |
bc15410e | 763 | if (BE (dfa->str_tree == NULL, 0)) |
3b0bdc72 UD |
764 | goto re_compile_internal_free_return; |
765 | ||
14744156 UD |
766 | #ifdef RE_ENABLE_I18N |
767 | /* If possible, do searching in single byte encoding to speed things up. */ | |
768 | if (dfa->is_utf8 && !(syntax & RE_ICASE)) | |
769 | optimize_utf8 (dfa); | |
770 | #endif | |
771 | ||
3b0bdc72 UD |
772 | /* Analyze the tree and collect information which is necessary to |
773 | create the dfa. */ | |
774 | err = analyze (dfa); | |
bc15410e | 775 | if (BE (err != REG_NOERROR, 0)) |
3b0bdc72 UD |
776 | goto re_compile_internal_free_return; |
777 | ||
778 | /* Then create the initial state of the dfa. */ | |
779 | err = create_initial_state (dfa); | |
3b0bdc72 | 780 | |
3b0bdc72 UD |
781 | /* Release work areas. */ |
782 | free_workarea_compile (preg); | |
783 | re_string_destruct (®exp); | |
784 | ||
71ccd330 UD |
785 | if (BE (err != REG_NOERROR, 0)) |
786 | { | |
787 | re_compile_internal_free_return: | |
788 | free_dfa_content (dfa); | |
789 | preg->buffer = NULL; | |
ca3c505e | 790 | preg->allocated = 0; |
71ccd330 UD |
791 | } |
792 | ||
3b0bdc72 UD |
793 | return err; |
794 | } | |
795 | ||
796 | /* Initialize DFA. We use the length of the regular expression PAT_LEN | |
797 | as the initial length of some arrays. */ | |
798 | ||
799 | static reg_errcode_t | |
800 | init_dfa (dfa, pat_len) | |
801 | re_dfa_t *dfa; | |
802 | int pat_len; | |
803 | { | |
804 | int table_size; | |
81c64d40 UD |
805 | |
806 | memset (dfa, '\0', sizeof (re_dfa_t)); | |
807 | ||
3b0bdc72 UD |
808 | dfa->nodes_alloc = pat_len + 1; |
809 | dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); | |
810 | ||
811 | dfa->states_alloc = pat_len + 1; | |
812 | ||
813 | /* table_size = 2 ^ ceil(log pat_len) */ | |
814 | for (table_size = 1; table_size > 0; table_size <<= 1) | |
815 | if (table_size > pat_len) | |
816 | break; | |
817 | ||
818 | dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); | |
819 | dfa->state_hash_mask = table_size - 1; | |
820 | ||
821 | dfa->subexps_alloc = 1; | |
822 | dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc); | |
823 | dfa->word_char = NULL; | |
824 | ||
3c0fb574 UD |
825 | dfa->mb_cur_max = MB_CUR_MAX; |
826 | #ifdef _LIBC | |
827 | if (dfa->mb_cur_max > 1 | |
828 | && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) | |
829 | dfa->is_utf8 = 1; | |
830 | #endif | |
831 | ||
bc15410e | 832 | if (BE (dfa->nodes == NULL || dfa->state_table == NULL |
15a7d175 | 833 | || dfa->subexps == NULL, 0)) |
3b0bdc72 UD |
834 | { |
835 | /* We don't bother to free anything which was allocated. Very | |
836 | soon the process will go down anyway. */ | |
837 | dfa->subexps = NULL; | |
838 | dfa->state_table = NULL; | |
839 | dfa->nodes = NULL; | |
840 | return REG_ESPACE; | |
841 | } | |
842 | return REG_NOERROR; | |
843 | } | |
844 | ||
845 | /* Initialize WORD_CHAR table, which indicate which character is | |
846 | "word". In this case "word" means that it is the word construction | |
847 | character used by some operators like "\<", "\>", etc. */ | |
848 | ||
a9388965 | 849 | static reg_errcode_t |
3b0bdc72 UD |
850 | init_word_char (dfa) |
851 | re_dfa_t *dfa; | |
852 | { | |
853 | int i, j, ch; | |
854 | dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); | |
bc15410e | 855 | if (BE (dfa->word_char == NULL, 0)) |
a9388965 | 856 | return REG_ESPACE; |
3b0bdc72 UD |
857 | for (i = 0, ch = 0; i < BITSET_UINTS; ++i) |
858 | for (j = 0; j < UINT_BITS; ++j, ++ch) | |
859 | if (isalnum (ch) || ch == '_') | |
15a7d175 | 860 | dfa->word_char[i] |= 1 << j; |
a9388965 | 861 | return REG_NOERROR; |
3b0bdc72 UD |
862 | } |
863 | ||
864 | /* Free the work area which are only used while compiling. */ | |
865 | ||
866 | static void | |
867 | free_workarea_compile (preg) | |
868 | regex_t *preg; | |
869 | { | |
870 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | |
871 | free_bin_tree (dfa->str_tree); | |
872 | dfa->str_tree = NULL; | |
a7d5c291 UD |
873 | re_free (dfa->org_indices); |
874 | dfa->org_indices = NULL; | |
3b0bdc72 UD |
875 | } |
876 | ||
877 | /* Create initial states for all contexts. */ | |
878 | ||
879 | static reg_errcode_t | |
880 | create_initial_state (dfa) | |
881 | re_dfa_t *dfa; | |
882 | { | |
883 | int first, i; | |
884 | reg_errcode_t err; | |
885 | re_node_set init_nodes; | |
886 | ||
887 | /* Initial states have the epsilon closure of the node which is | |
888 | the first node of the regular expression. */ | |
889 | first = dfa->str_tree->first; | |
890 | dfa->init_node = first; | |
891 | err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); | |
bc15410e | 892 | if (BE (err != REG_NOERROR, 0)) |
3b0bdc72 UD |
893 | return err; |
894 | ||
895 | /* The back-references which are in initial states can epsilon transit, | |
896 | since in this case all of the subexpressions can be null. | |
897 | Then we add epsilon closures of the nodes which are the next nodes of | |
898 | the back-references. */ | |
899 | if (dfa->nbackref > 0) | |
900 | for (i = 0; i < init_nodes.nelem; ++i) | |
901 | { | |
15a7d175 UD |
902 | int node_idx = init_nodes.elems[i]; |
903 | re_token_type_t type = dfa->nodes[node_idx].type; | |
904 | ||
905 | int clexp_idx; | |
906 | if (type != OP_BACK_REF) | |
907 | continue; | |
908 | for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) | |
909 | { | |
910 | re_token_t *clexp_node; | |
911 | clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; | |
912 | if (clexp_node->type == OP_CLOSE_SUBEXP | |
913 | && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx) | |
914 | break; | |
915 | } | |
916 | if (clexp_idx == init_nodes.nelem) | |
917 | continue; | |
918 | ||
919 | if (type == OP_BACK_REF) | |
920 | { | |
921 | int dest_idx = dfa->edests[node_idx].elems[0]; | |
922 | if (!re_node_set_contains (&init_nodes, dest_idx)) | |
923 | { | |
924 | re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); | |
925 | i = 0; | |
926 | } | |
927 | } | |
3b0bdc72 UD |
928 | } |
929 | ||
930 | /* It must be the first time to invoke acquire_state. */ | |
a9388965 UD |
931 | dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); |
932 | /* We don't check ERR here, since the initial state must not be NULL. */ | |
bc15410e | 933 | if (BE (dfa->init_state == NULL, 0)) |
a9388965 | 934 | return err; |
3b0bdc72 UD |
935 | if (dfa->init_state->has_constraint) |
936 | { | |
a9388965 | 937 | dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, |
15a7d175 | 938 | CONTEXT_WORD); |
a9388965 | 939 | dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, |
15a7d175 | 940 | CONTEXT_NEWLINE); |
a9388965 | 941 | dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, |
15a7d175 UD |
942 | &init_nodes, |
943 | CONTEXT_NEWLINE | |
944 | | CONTEXT_BEGBUF); | |
bc15410e | 945 | if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL |
15a7d175 UD |
946 | || dfa->init_state_begbuf == NULL, 0)) |
947 | return err; | |
3b0bdc72 UD |
948 | } |
949 | else | |
950 | dfa->init_state_word = dfa->init_state_nl | |
951 | = dfa->init_state_begbuf = dfa->init_state; | |
952 | ||
3b0bdc72 UD |
953 | re_node_set_free (&init_nodes); |
954 | return REG_NOERROR; | |
955 | } | |
956 | \f | |
14744156 UD |
957 | #ifdef RE_ENABLE_I18N |
958 | /* If it is possible to do searching in single byte encoding instead of UTF-8 | |
959 | to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change | |
960 | DFA nodes where needed. */ | |
961 | ||
962 | static void | |
963 | optimize_utf8 (dfa) | |
964 | re_dfa_t *dfa; | |
965 | { | |
966 | int node; | |
967 | ||
968 | for (node = 0; node < dfa->nodes_len; ++node) | |
969 | switch (dfa->nodes[node].type) | |
970 | { | |
971 | case CHARACTER: | |
972 | /* Chars >= 0x80 are optimizable in some cases (e.g. when not | |
973 | followed by DUP operator, not in bracket etc.). | |
974 | For now punt on them all. */ | |
975 | if (dfa->nodes[node].opr.c >= 0x80) | |
976 | return; | |
977 | break; | |
978 | case ANCHOR: | |
979 | switch (dfa->nodes[node].opr.idx) | |
980 | { | |
981 | case LINE_FIRST: | |
982 | case LINE_LAST: | |
983 | case BUF_FIRST: | |
984 | case BUF_LAST: | |
985 | break; | |
986 | default: | |
987 | /* Word anchors etc. cannot be handled. */ | |
988 | return; | |
989 | } | |
990 | break; | |
991 | case OP_BACK_REF: | |
992 | case OP_ALT: | |
993 | case END_OF_RE: | |
994 | case BACK_SLASH: | |
995 | case OP_DUP_ASTERISK: | |
996 | case OP_DUP_QUESTION: | |
997 | case OP_DUP_PLUS: | |
998 | case OP_OPEN_SUBEXP: | |
999 | case OP_CLOSE_SUBEXP: | |
1000 | break; | |
1001 | default: | |
1002 | return; | |
1003 | } | |
1004 | ||
1005 | /* The search can be in single byte locale. */ | |
1006 | dfa->mb_cur_max = 1; | |
1007 | dfa->is_utf8 = 0; | |
1008 | dfa->has_mb_node = dfa->nbackref > 0; | |
1009 | } | |
1010 | #endif | |
1011 | \f | |
3b0bdc72 UD |
1012 | /* Analyze the structure tree, and calculate "first", "next", "edest", |
1013 | "eclosure", and "inveclosure". */ | |
1014 | ||
1015 | static reg_errcode_t | |
1016 | analyze (dfa) | |
1017 | re_dfa_t *dfa; | |
1018 | { | |
1019 | int i; | |
1020 | reg_errcode_t ret; | |
1021 | ||
1022 | /* Allocate arrays. */ | |
3b0bdc72 | 1023 | dfa->nexts = re_malloc (int, dfa->nodes_alloc); |
a7d5c291 | 1024 | dfa->org_indices = re_malloc (int, dfa->nodes_alloc); |
3b0bdc72 UD |
1025 | dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); |
1026 | dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); | |
1027 | dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc); | |
a7d5c291 | 1028 | if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL |
15a7d175 | 1029 | || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0)) |
3b0bdc72 UD |
1030 | return REG_ESPACE; |
1031 | /* Initialize them. */ | |
1032 | for (i = 0; i < dfa->nodes_len; ++i) | |
1033 | { | |
3b0bdc72 UD |
1034 | dfa->nexts[i] = -1; |
1035 | re_node_set_init_empty (dfa->edests + i); | |
1036 | re_node_set_init_empty (dfa->eclosures + i); | |
1037 | re_node_set_init_empty (dfa->inveclosures + i); | |
1038 | } | |
1039 | ||
1040 | ret = analyze_tree (dfa, dfa->str_tree); | |
bc15410e | 1041 | if (BE (ret == REG_NOERROR, 1)) |
3b0bdc72 UD |
1042 | { |
1043 | ret = calc_eclosure (dfa); | |
1044 | if (ret == REG_NOERROR) | |
1045 | calc_inveclosure (dfa); | |
1046 | } | |
1047 | return ret; | |
1048 | } | |
1049 | ||
1050 | /* Helper functions for analyze. | |
1051 | This function calculate "first", "next", and "edest" for the subtree | |
1052 | whose root is NODE. */ | |
1053 | ||
1054 | static reg_errcode_t | |
1055 | analyze_tree (dfa, node) | |
1056 | re_dfa_t *dfa; | |
1057 | bin_tree_t *node; | |
1058 | { | |
1059 | reg_errcode_t ret; | |
1060 | if (node->first == -1) | |
1061 | calc_first (dfa, node); | |
1062 | if (node->next == -1) | |
1063 | calc_next (dfa, node); | |
1064 | if (node->eclosure.nelem == 0) | |
1065 | calc_epsdest (dfa, node); | |
1066 | /* Calculate "first" etc. for the left child. */ | |
1067 | if (node->left != NULL) | |
1068 | { | |
1069 | ret = analyze_tree (dfa, node->left); | |
bc15410e | 1070 | if (BE (ret != REG_NOERROR, 0)) |
15a7d175 | 1071 | return ret; |
3b0bdc72 UD |
1072 | } |
1073 | /* Calculate "first" etc. for the right child. */ | |
1074 | if (node->right != NULL) | |
1075 | { | |
1076 | ret = analyze_tree (dfa, node->right); | |
bc15410e | 1077 | if (BE (ret != REG_NOERROR, 0)) |
15a7d175 | 1078 | return ret; |
3b0bdc72 UD |
1079 | } |
1080 | return REG_NOERROR; | |
1081 | } | |
1082 | ||
1083 | /* Calculate "first" for the node NODE. */ | |
1084 | static void | |
1085 | calc_first (dfa, node) | |
1086 | re_dfa_t *dfa; | |
1087 | bin_tree_t *node; | |
1088 | { | |
1089 | int idx, type; | |
1090 | idx = node->node_idx; | |
1091 | type = (node->type == 0) ? dfa->nodes[idx].type : node->type; | |
1092 | ||
1093 | switch (type) | |
1094 | { | |
1095 | #ifdef DEBUG | |
3b0bdc72 UD |
1096 | case OP_OPEN_BRACKET: |
1097 | case OP_CLOSE_BRACKET: | |
1098 | case OP_OPEN_DUP_NUM: | |
1099 | case OP_CLOSE_DUP_NUM: | |
1100 | case OP_NON_MATCH_LIST: | |
1101 | case OP_OPEN_COLL_ELEM: | |
1102 | case OP_CLOSE_COLL_ELEM: | |
1103 | case OP_OPEN_EQUIV_CLASS: | |
1104 | case OP_CLOSE_EQUIV_CLASS: | |
1105 | case OP_OPEN_CHAR_CLASS: | |
1106 | case OP_CLOSE_CHAR_CLASS: | |
1107 | /* These must not be appeared here. */ | |
1108 | assert (0); | |
1109 | #endif | |
1110 | case END_OF_RE: | |
1111 | case CHARACTER: | |
1112 | case OP_PERIOD: | |
1113 | case OP_DUP_ASTERISK: | |
1114 | case OP_DUP_QUESTION: | |
c0a0f9a3 | 1115 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 1116 | case COMPLEX_BRACKET: |
c0a0f9a3 | 1117 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 UD |
1118 | case SIMPLE_BRACKET: |
1119 | case OP_BACK_REF: | |
1120 | case ANCHOR: | |
81c64d40 UD |
1121 | case OP_OPEN_SUBEXP: |
1122 | case OP_CLOSE_SUBEXP: | |
3b0bdc72 UD |
1123 | node->first = idx; |
1124 | break; | |
1125 | case OP_DUP_PLUS: | |
1126 | #ifdef DEBUG | |
1127 | assert (node->left != NULL); | |
1128 | #endif | |
1129 | if (node->left->first == -1) | |
15a7d175 | 1130 | calc_first (dfa, node->left); |
3b0bdc72 UD |
1131 | node->first = node->left->first; |
1132 | break; | |
1133 | case OP_ALT: | |
1134 | node->first = idx; | |
1135 | break; | |
3b0bdc72 UD |
1136 | /* else fall through */ |
1137 | default: | |
1138 | #ifdef DEBUG | |
1139 | assert (node->left != NULL); | |
1140 | #endif | |
1141 | if (node->left->first == -1) | |
15a7d175 | 1142 | calc_first (dfa, node->left); |
3b0bdc72 UD |
1143 | node->first = node->left->first; |
1144 | break; | |
1145 | } | |
3b0bdc72 UD |
1146 | } |
1147 | ||
1148 | /* Calculate "next" for the node NODE. */ | |
1149 | ||
1150 | static void | |
1151 | calc_next (dfa, node) | |
1152 | re_dfa_t *dfa; | |
1153 | bin_tree_t *node; | |
1154 | { | |
1155 | int idx, type; | |
1156 | bin_tree_t *parent = node->parent; | |
1157 | if (parent == NULL) | |
1158 | { | |
1159 | node->next = -1; | |
1160 | idx = node->node_idx; | |
1161 | if (node->type == 0) | |
15a7d175 | 1162 | dfa->nexts[idx] = node->next; |
3b0bdc72 UD |
1163 | return; |
1164 | } | |
1165 | ||
1166 | idx = parent->node_idx; | |
1167 | type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type; | |
1168 | ||
1169 | switch (type) | |
1170 | { | |
1171 | case OP_DUP_ASTERISK: | |
1172 | case OP_DUP_PLUS: | |
1173 | node->next = idx; | |
1174 | break; | |
1175 | case CONCAT: | |
1176 | if (parent->left == node) | |
15a7d175 UD |
1177 | { |
1178 | if (parent->right->first == -1) | |
1179 | calc_first (dfa, parent->right); | |
1180 | node->next = parent->right->first; | |
1181 | break; | |
1182 | } | |
3b0bdc72 UD |
1183 | /* else fall through */ |
1184 | default: | |
1185 | if (parent->next == -1) | |
15a7d175 | 1186 | calc_next (dfa, parent); |
3b0bdc72 UD |
1187 | node->next = parent->next; |
1188 | break; | |
1189 | } | |
1190 | idx = node->node_idx; | |
1191 | if (node->type == 0) | |
1192 | dfa->nexts[idx] = node->next; | |
1193 | } | |
1194 | ||
1195 | /* Calculate "edest" for the node NODE. */ | |
1196 | ||
1197 | static void | |
1198 | calc_epsdest (dfa, node) | |
1199 | re_dfa_t *dfa; | |
1200 | bin_tree_t *node; | |
1201 | { | |
1202 | int idx; | |
1203 | idx = node->node_idx; | |
1204 | if (node->type == 0) | |
1205 | { | |
1206 | if (dfa->nodes[idx].type == OP_DUP_ASTERISK | |
15a7d175 UD |
1207 | || dfa->nodes[idx].type == OP_DUP_PLUS |
1208 | || dfa->nodes[idx].type == OP_DUP_QUESTION) | |
1209 | { | |
1210 | if (node->left->first == -1) | |
1211 | calc_first (dfa, node->left); | |
1212 | if (node->next == -1) | |
1213 | calc_next (dfa, node); | |
1214 | re_node_set_init_2 (dfa->edests + idx, node->left->first, | |
1215 | node->next); | |
1216 | } | |
3b0bdc72 | 1217 | else if (dfa->nodes[idx].type == OP_ALT) |
15a7d175 UD |
1218 | { |
1219 | int left, right; | |
1220 | if (node->left != NULL) | |
1221 | { | |
1222 | if (node->left->first == -1) | |
1223 | calc_first (dfa, node->left); | |
1224 | left = node->left->first; | |
1225 | } | |
1226 | else | |
1227 | { | |
1228 | if (node->next == -1) | |
1229 | calc_next (dfa, node); | |
1230 | left = node->next; | |
1231 | } | |
1232 | if (node->right != NULL) | |
1233 | { | |
1234 | if (node->right->first == -1) | |
1235 | calc_first (dfa, node->right); | |
1236 | right = node->right->first; | |
1237 | } | |
1238 | else | |
1239 | { | |
1240 | if (node->next == -1) | |
1241 | calc_next (dfa, node); | |
1242 | right = node->next; | |
1243 | } | |
1244 | re_node_set_init_2 (dfa->edests + idx, left, right); | |
1245 | } | |
81c64d40 | 1246 | else if (dfa->nodes[idx].type == ANCHOR |
15a7d175 UD |
1247 | || dfa->nodes[idx].type == OP_OPEN_SUBEXP |
1248 | || dfa->nodes[idx].type == OP_CLOSE_SUBEXP | |
1249 | || dfa->nodes[idx].type == OP_BACK_REF) | |
1250 | re_node_set_init_1 (dfa->edests + idx, node->next); | |
1769a73f UD |
1251 | else |
1252 | assert (!IS_EPSILON_NODE (dfa->nodes[idx].type)); | |
3b0bdc72 UD |
1253 | } |
1254 | } | |
1255 | ||
485d775d UD |
1256 | /* Duplicate the epsilon closure of the node ROOT_NODE. |
1257 | Note that duplicated nodes have constraint INIT_CONSTRAINT in addition | |
1258 | to their own constraint. */ | |
1259 | ||
1260 | static reg_errcode_t | |
1261 | duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, | |
15a7d175 | 1262 | init_constraint) |
485d775d UD |
1263 | re_dfa_t *dfa; |
1264 | int top_org_node, top_clone_node, root_node; | |
1265 | unsigned int init_constraint; | |
1266 | { | |
1267 | reg_errcode_t err; | |
1268 | int org_node, clone_node, ret; | |
1269 | unsigned int constraint = init_constraint; | |
1270 | for (org_node = top_org_node, clone_node = top_clone_node;;) | |
1271 | { | |
1272 | int org_dest, clone_dest; | |
1273 | if (dfa->nodes[org_node].type == OP_BACK_REF) | |
15a7d175 | 1274 | { |
485d775d UD |
1275 | /* If the back reference epsilon-transit, its destination must |
1276 | also have the constraint. Then duplicate the epsilon closure | |
1277 | of the destination of the back reference, and store it in | |
1278 | edests of the back reference. */ | |
15a7d175 UD |
1279 | org_dest = dfa->nexts[org_node]; |
1280 | re_node_set_empty (dfa->edests + clone_node); | |
1281 | err = duplicate_node (&clone_dest, dfa, org_dest, constraint); | |
1282 | if (BE (err != REG_NOERROR, 0)) | |
1283 | return err; | |
1284 | dfa->nexts[clone_node] = dfa->nexts[org_node]; | |
1285 | ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); | |
1286 | if (BE (ret < 0, 0)) | |
1287 | return REG_ESPACE; | |
1288 | } | |
485d775d | 1289 | else if (dfa->edests[org_node].nelem == 0) |
15a7d175 | 1290 | { |
485d775d UD |
1291 | /* In case of the node can't epsilon-transit, don't duplicate the |
1292 | destination and store the original destination as the | |
1293 | destination of the node. */ | |
15a7d175 UD |
1294 | dfa->nexts[clone_node] = dfa->nexts[org_node]; |
1295 | break; | |
1296 | } | |
485d775d | 1297 | else if (dfa->edests[org_node].nelem == 1) |
15a7d175 | 1298 | { |
485d775d UD |
1299 | /* In case of the node can epsilon-transit, and it has only one |
1300 | destination. */ | |
15a7d175 UD |
1301 | org_dest = dfa->edests[org_node].elems[0]; |
1302 | re_node_set_empty (dfa->edests + clone_node); | |
1303 | if (dfa->nodes[org_node].type == ANCHOR) | |
1304 | { | |
485d775d | 1305 | /* In case of the node has another constraint, append it. */ |
15a7d175 UD |
1306 | if (org_node == root_node && clone_node != org_node) |
1307 | { | |
485d775d UD |
1308 | /* ...but if the node is root_node itself, it means the |
1309 | epsilon closure have a loop, then tie it to the | |
1310 | destination of the root_node. */ | |
15a7d175 UD |
1311 | ret = re_node_set_insert (dfa->edests + clone_node, |
1312 | org_dest); | |
1313 | if (BE (ret < 0, 0)) | |
1314 | return REG_ESPACE; | |
1315 | break; | |
1316 | } | |
1317 | constraint |= dfa->nodes[org_node].opr.ctx_type; | |
1318 | } | |
1319 | err = duplicate_node (&clone_dest, dfa, org_dest, constraint); | |
1320 | if (BE (err != REG_NOERROR, 0)) | |
1321 | return err; | |
1322 | ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); | |
1323 | if (BE (ret < 0, 0)) | |
1324 | return REG_ESPACE; | |
1325 | } | |
485d775d | 1326 | else /* dfa->edests[org_node].nelem == 2 */ |
15a7d175 | 1327 | { |
485d775d | 1328 | /* In case of the node can epsilon-transit, and it has two |
a7d5c291 | 1329 | destinations. E.g. '|', '*', '+', '?'. */ |
15a7d175 UD |
1330 | org_dest = dfa->edests[org_node].elems[0]; |
1331 | re_node_set_empty (dfa->edests + clone_node); | |
a7d5c291 UD |
1332 | /* Search for a duplicated node which satisfies the constraint. */ |
1333 | clone_dest = search_duplicated_node (dfa, org_dest, constraint); | |
1334 | if (clone_dest == -1) | |
1335 | { | |
1336 | /* There are no such a duplicated node, create a new one. */ | |
1337 | err = duplicate_node (&clone_dest, dfa, org_dest, constraint); | |
1338 | if (BE (err != REG_NOERROR, 0)) | |
1339 | return err; | |
1340 | ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); | |
1341 | if (BE (ret < 0, 0)) | |
1342 | return REG_ESPACE; | |
1343 | err = duplicate_node_closure (dfa, org_dest, clone_dest, | |
1344 | root_node, constraint); | |
1345 | if (BE (err != REG_NOERROR, 0)) | |
1346 | return err; | |
1347 | } | |
1348 | else | |
1349 | { | |
1350 | /* There are a duplicated node which satisfy the constraint, | |
1351 | use it to avoid infinite loop. */ | |
1352 | ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); | |
1353 | if (BE (ret < 0, 0)) | |
1354 | return REG_ESPACE; | |
1355 | } | |
15a7d175 UD |
1356 | |
1357 | org_dest = dfa->edests[org_node].elems[1]; | |
1358 | err = duplicate_node (&clone_dest, dfa, org_dest, constraint); | |
1359 | if (BE (err != REG_NOERROR, 0)) | |
1360 | return err; | |
1361 | ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); | |
1362 | if (BE (ret < 0, 0)) | |
1363 | return REG_ESPACE; | |
1364 | } | |
485d775d UD |
1365 | org_node = org_dest; |
1366 | clone_node = clone_dest; | |
1367 | } | |
1368 | return REG_NOERROR; | |
1369 | } | |
1370 | ||
a7d5c291 UD |
1371 | /* Search for a node which is duplicated from the node ORG_NODE, and |
1372 | satisfies the constraint CONSTRAINT. */ | |
1373 | ||
1374 | static int | |
1375 | search_duplicated_node (dfa, org_node, constraint) | |
1376 | re_dfa_t *dfa; | |
1377 | int org_node; | |
1378 | unsigned int constraint; | |
1379 | { | |
1380 | int idx; | |
1381 | for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) | |
1382 | { | |
1383 | if (org_node == dfa->org_indices[idx] | |
1384 | && constraint == dfa->nodes[idx].constraint) | |
1385 | return idx; /* Found. */ | |
1386 | } | |
1387 | return -1; /* Not found. */ | |
1388 | } | |
1389 | ||
a9388965 UD |
1390 | /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. |
1391 | The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded, | |
1392 | otherwise return the error code. */ | |
1393 | ||
1394 | static reg_errcode_t | |
1395 | duplicate_node (new_idx, dfa, org_idx, constraint) | |
3b0bdc72 | 1396 | re_dfa_t *dfa; |
a9388965 | 1397 | int *new_idx, org_idx; |
3b0bdc72 UD |
1398 | unsigned int constraint; |
1399 | { | |
1400 | re_token_t dup; | |
1401 | int dup_idx; | |
1402 | ||
485d775d | 1403 | dup = dfa->nodes[org_idx]; |
a9388965 | 1404 | dup_idx = re_dfa_add_node (dfa, dup, 1); |
485d775d | 1405 | if (BE (dup_idx == -1, 0)) |
a9388965 | 1406 | return REG_ESPACE; |
485d775d UD |
1407 | dfa->nodes[dup_idx].constraint = constraint; |
1408 | if (dfa->nodes[org_idx].type == ANCHOR) | |
1409 | dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; | |
a9388965 | 1410 | dfa->nodes[dup_idx].duplicated = 1; |
485d775d UD |
1411 | re_node_set_init_empty (dfa->edests + dup_idx); |
1412 | re_node_set_init_empty (dfa->eclosures + dup_idx); | |
1413 | re_node_set_init_empty (dfa->inveclosures + dup_idx); | |
3b0bdc72 | 1414 | |
a7d5c291 UD |
1415 | /* Store the index of the original node. */ |
1416 | dfa->org_indices[dup_idx] = org_idx; | |
a9388965 UD |
1417 | *new_idx = dup_idx; |
1418 | return REG_NOERROR; | |
3b0bdc72 UD |
1419 | } |
1420 | ||
1421 | static void | |
1422 | calc_inveclosure (dfa) | |
1423 | re_dfa_t *dfa; | |
1424 | { | |
485d775d | 1425 | int src, idx, dest; |
3b0bdc72 UD |
1426 | for (src = 0; src < dfa->nodes_len; ++src) |
1427 | { | |
1428 | for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) | |
15a7d175 UD |
1429 | { |
1430 | dest = dfa->eclosures[src].elems[idx]; | |
1431 | re_node_set_insert (dfa->inveclosures + dest, src); | |
1432 | } | |
3b0bdc72 UD |
1433 | } |
1434 | } | |
1435 | ||
1436 | /* Calculate "eclosure" for all the node in DFA. */ | |
1437 | ||
1438 | static reg_errcode_t | |
1439 | calc_eclosure (dfa) | |
1440 | re_dfa_t *dfa; | |
1441 | { | |
485d775d | 1442 | int node_idx, incomplete; |
3b0bdc72 UD |
1443 | #ifdef DEBUG |
1444 | assert (dfa->nodes_len > 0); | |
1445 | #endif | |
485d775d | 1446 | incomplete = 0; |
3b0bdc72 | 1447 | /* For each nodes, calculate epsilon closure. */ |
485d775d | 1448 | for (node_idx = 0; ; ++node_idx) |
3b0bdc72 | 1449 | { |
a9388965 | 1450 | reg_errcode_t err; |
3b0bdc72 | 1451 | re_node_set eclosure_elem; |
485d775d | 1452 | if (node_idx == dfa->nodes_len) |
15a7d175 UD |
1453 | { |
1454 | if (!incomplete) | |
1455 | break; | |
1456 | incomplete = 0; | |
1457 | node_idx = 0; | |
1458 | } | |
3b0bdc72 UD |
1459 | |
1460 | #ifdef DEBUG | |
3b0bdc72 UD |
1461 | assert (dfa->eclosures[node_idx].nelem != -1); |
1462 | #endif | |
1463 | /* If we have already calculated, skip it. */ | |
1464 | if (dfa->eclosures[node_idx].nelem != 0) | |
15a7d175 | 1465 | continue; |
3b0bdc72 | 1466 | /* Calculate epsilon closure of `node_idx'. */ |
a9388965 | 1467 | err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); |
bc15410e | 1468 | if (BE (err != REG_NOERROR, 0)) |
15a7d175 | 1469 | return err; |
3b0bdc72 UD |
1470 | |
1471 | if (dfa->eclosures[node_idx].nelem == 0) | |
15a7d175 UD |
1472 | { |
1473 | incomplete = 1; | |
1474 | re_node_set_free (&eclosure_elem); | |
1475 | } | |
3b0bdc72 | 1476 | } |
3b0bdc72 UD |
1477 | return REG_NOERROR; |
1478 | } | |
1479 | ||
1480 | /* Calculate epsilon closure of NODE. */ | |
1481 | ||
a9388965 UD |
1482 | static reg_errcode_t |
1483 | calc_eclosure_iter (new_set, dfa, node, root) | |
1484 | re_node_set *new_set; | |
3b0bdc72 UD |
1485 | re_dfa_t *dfa; |
1486 | int node, root; | |
1487 | { | |
a9388965 | 1488 | reg_errcode_t err; |
3b0bdc72 | 1489 | unsigned int constraint; |
485d775d | 1490 | int i, incomplete; |
3b0bdc72 | 1491 | re_node_set eclosure; |
485d775d | 1492 | incomplete = 0; |
a9388965 | 1493 | err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); |
bc15410e | 1494 | if (BE (err != REG_NOERROR, 0)) |
a9388965 | 1495 | return err; |
3b0bdc72 UD |
1496 | |
1497 | /* This indicates that we are calculating this node now. | |
1498 | We reference this value to avoid infinite loop. */ | |
1499 | dfa->eclosures[node].nelem = -1; | |
1500 | ||
1501 | constraint = ((dfa->nodes[node].type == ANCHOR) | |
15a7d175 | 1502 | ? dfa->nodes[node].opr.ctx_type : 0); |
485d775d UD |
1503 | /* If the current node has constraints, duplicate all nodes. |
1504 | Since they must inherit the constraints. */ | |
1505 | if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) | |
1506 | { | |
1507 | int org_node, cur_node; | |
1508 | org_node = cur_node = node; | |
1509 | err = duplicate_node_closure (dfa, node, node, node, constraint); | |
1510 | if (BE (err != REG_NOERROR, 0)) | |
15a7d175 | 1511 | return err; |
485d775d | 1512 | } |
3b0bdc72 UD |
1513 | |
1514 | /* Expand each epsilon destination nodes. */ | |
485d775d | 1515 | if (IS_EPSILON_NODE(dfa->nodes[node].type)) |
3b0bdc72 UD |
1516 | for (i = 0; i < dfa->edests[node].nelem; ++i) |
1517 | { | |
15a7d175 UD |
1518 | re_node_set eclosure_elem; |
1519 | int edest = dfa->edests[node].elems[i]; | |
1520 | /* If calculating the epsilon closure of `edest' is in progress, | |
1521 | return intermediate result. */ | |
1522 | if (dfa->eclosures[edest].nelem == -1) | |
1523 | { | |
1524 | incomplete = 1; | |
1525 | continue; | |
1526 | } | |
1527 | /* If we haven't calculated the epsilon closure of `edest' yet, | |
1528 | calculate now. Otherwise use calculated epsilon closure. */ | |
1529 | if (dfa->eclosures[edest].nelem == 0) | |
1530 | { | |
1531 | err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); | |
1532 | if (BE (err != REG_NOERROR, 0)) | |
1533 | return err; | |
1534 | } | |
1535 | else | |
1536 | eclosure_elem = dfa->eclosures[edest]; | |
1537 | /* Merge the epsilon closure of `edest'. */ | |
1538 | re_node_set_merge (&eclosure, &eclosure_elem); | |
1539 | /* If the epsilon closure of `edest' is incomplete, | |
1540 | the epsilon closure of this node is also incomplete. */ | |
1541 | if (dfa->eclosures[edest].nelem == 0) | |
1542 | { | |
1543 | incomplete = 1; | |
1544 | re_node_set_free (&eclosure_elem); | |
1545 | } | |
3b0bdc72 UD |
1546 | } |
1547 | ||
3b0bdc72 UD |
1548 | /* Epsilon closures include itself. */ |
1549 | re_node_set_insert (&eclosure, node); | |
1550 | if (incomplete && !root) | |
1551 | dfa->eclosures[node].nelem = 0; | |
1552 | else | |
1553 | dfa->eclosures[node] = eclosure; | |
a9388965 UD |
1554 | *new_set = eclosure; |
1555 | return REG_NOERROR; | |
3b0bdc72 UD |
1556 | } |
1557 | \f | |
1558 | /* Functions for token which are used in the parser. */ | |
1559 | ||
1560 | /* Fetch a token from INPUT. | |
1561 | We must not use this function inside bracket expressions. */ | |
1562 | ||
1563 | static re_token_t | |
1564 | fetch_token (input, syntax) | |
1565 | re_string_t *input; | |
1566 | reg_syntax_t syntax; | |
1567 | { | |
1568 | re_token_t token; | |
1569 | int consumed_byte; | |
1570 | consumed_byte = peek_token (&token, input, syntax); | |
1571 | re_string_skip_bytes (input, consumed_byte); | |
1572 | return token; | |
1573 | } | |
1574 | ||
1575 | /* Peek a token from INPUT, and return the length of the token. | |
1576 | We must not use this function inside bracket expressions. */ | |
1577 | ||
1578 | static int | |
1579 | peek_token (token, input, syntax) | |
1580 | re_token_t *token; | |
1581 | re_string_t *input; | |
1582 | reg_syntax_t syntax; | |
1583 | { | |
1584 | unsigned char c; | |
1585 | ||
1586 | if (re_string_eoi (input)) | |
1587 | { | |
1588 | token->type = END_OF_RE; | |
1589 | return 0; | |
1590 | } | |
1591 | ||
1592 | c = re_string_peek_byte (input, 0); | |
1593 | token->opr.c = c; | |
1594 | ||
1595 | #ifdef RE_ENABLE_I18N | |
1596 | token->mb_partial = 0; | |
3c0fb574 | 1597 | if (input->mb_cur_max > 1 && |
3b0bdc72 UD |
1598 | !re_string_first_byte (input, re_string_cur_idx (input))) |
1599 | { | |
1600 | token->type = CHARACTER; | |
1601 | token->mb_partial = 1; | |
1602 | return 1; | |
1603 | } | |
1604 | #endif | |
1605 | if (c == '\\') | |
1606 | { | |
1607 | unsigned char c2; | |
1608 | if (re_string_cur_idx (input) + 1 >= re_string_length (input)) | |
15a7d175 UD |
1609 | { |
1610 | token->type = BACK_SLASH; | |
1611 | return 1; | |
1612 | } | |
3b0bdc72 UD |
1613 | |
1614 | c2 = re_string_peek_byte_case (input, 1); | |
1615 | token->opr.c = c2; | |
1616 | token->type = CHARACTER; | |
1617 | switch (c2) | |
15a7d175 UD |
1618 | { |
1619 | case '|': | |
1620 | if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) | |
1621 | token->type = OP_ALT; | |
1622 | break; | |
1623 | case '1': case '2': case '3': case '4': case '5': | |
1624 | case '6': case '7': case '8': case '9': | |
1625 | if (!(syntax & RE_NO_BK_REFS)) | |
1626 | { | |
1627 | token->type = OP_BACK_REF; | |
1628 | token->opr.idx = c2 - '0'; | |
1629 | } | |
1630 | break; | |
1631 | case '<': | |
1632 | if (!(syntax & RE_NO_GNU_OPS)) | |
1633 | { | |
1634 | token->type = ANCHOR; | |
1635 | token->opr.idx = WORD_FIRST; | |
1636 | } | |
1637 | break; | |
1638 | case '>': | |
1639 | if (!(syntax & RE_NO_GNU_OPS)) | |
1640 | { | |
1641 | token->type = ANCHOR; | |
1642 | token->opr.idx = WORD_LAST; | |
1643 | } | |
1644 | break; | |
1645 | case 'b': | |
1646 | if (!(syntax & RE_NO_GNU_OPS)) | |
1647 | { | |
1648 | token->type = ANCHOR; | |
1649 | token->opr.idx = WORD_DELIM; | |
1650 | } | |
1651 | break; | |
1652 | case 'B': | |
1653 | if (!(syntax & RE_NO_GNU_OPS)) | |
1654 | { | |
1655 | token->type = ANCHOR; | |
1656 | token->opr.idx = INSIDE_WORD; | |
1657 | } | |
1658 | break; | |
1659 | case 'w': | |
1660 | if (!(syntax & RE_NO_GNU_OPS)) | |
1661 | token->type = OP_WORD; | |
1662 | break; | |
1663 | case 'W': | |
1664 | if (!(syntax & RE_NO_GNU_OPS)) | |
1665 | token->type = OP_NOTWORD; | |
1666 | break; | |
e2b6bfa3 UD |
1667 | case 's': |
1668 | if (!(syntax & RE_NO_GNU_OPS)) | |
1669 | token->type = OP_SPACE; | |
1670 | break; | |
1671 | case 'S': | |
1672 | if (!(syntax & RE_NO_GNU_OPS)) | |
1673 | token->type = OP_NOTSPACE; | |
1674 | break; | |
15a7d175 UD |
1675 | case '`': |
1676 | if (!(syntax & RE_NO_GNU_OPS)) | |
1677 | { | |
1678 | token->type = ANCHOR; | |
1679 | token->opr.idx = BUF_FIRST; | |
1680 | } | |
1681 | break; | |
1682 | case '\'': | |
1683 | if (!(syntax & RE_NO_GNU_OPS)) | |
1684 | { | |
1685 | token->type = ANCHOR; | |
1686 | token->opr.idx = BUF_LAST; | |
1687 | } | |
1688 | break; | |
1689 | case '(': | |
1690 | if (!(syntax & RE_NO_BK_PARENS)) | |
1691 | token->type = OP_OPEN_SUBEXP; | |
1692 | break; | |
1693 | case ')': | |
1694 | if (!(syntax & RE_NO_BK_PARENS)) | |
1695 | token->type = OP_CLOSE_SUBEXP; | |
1696 | break; | |
1697 | case '+': | |
1698 | if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) | |
1699 | token->type = OP_DUP_PLUS; | |
1700 | break; | |
1701 | case '?': | |
1702 | if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) | |
1703 | token->type = OP_DUP_QUESTION; | |
1704 | break; | |
1705 | case '{': | |
1706 | if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) | |
1707 | token->type = OP_OPEN_DUP_NUM; | |
1708 | break; | |
1709 | case '}': | |
1710 | if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) | |
1711 | token->type = OP_CLOSE_DUP_NUM; | |
1712 | break; | |
1713 | default: | |
1714 | break; | |
1715 | } | |
3b0bdc72 UD |
1716 | return 2; |
1717 | } | |
1718 | ||
1719 | token->type = CHARACTER; | |
1720 | switch (c) | |
1721 | { | |
1722 | case '\n': | |
1723 | if (syntax & RE_NEWLINE_ALT) | |
15a7d175 | 1724 | token->type = OP_ALT; |
3b0bdc72 UD |
1725 | break; |
1726 | case '|': | |
1727 | if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) | |
15a7d175 | 1728 | token->type = OP_ALT; |
3b0bdc72 UD |
1729 | break; |
1730 | case '*': | |
1731 | token->type = OP_DUP_ASTERISK; | |
1732 | break; | |
1733 | case '+': | |
1734 | if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) | |
15a7d175 | 1735 | token->type = OP_DUP_PLUS; |
3b0bdc72 UD |
1736 | break; |
1737 | case '?': | |
1738 | if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) | |
15a7d175 | 1739 | token->type = OP_DUP_QUESTION; |
3b0bdc72 UD |
1740 | break; |
1741 | case '{': | |
1742 | if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
15a7d175 | 1743 | token->type = OP_OPEN_DUP_NUM; |
3b0bdc72 UD |
1744 | break; |
1745 | case '}': | |
1746 | if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
15a7d175 | 1747 | token->type = OP_CLOSE_DUP_NUM; |
3b0bdc72 UD |
1748 | break; |
1749 | case '(': | |
1750 | if (syntax & RE_NO_BK_PARENS) | |
15a7d175 | 1751 | token->type = OP_OPEN_SUBEXP; |
3b0bdc72 UD |
1752 | break; |
1753 | case ')': | |
1754 | if (syntax & RE_NO_BK_PARENS) | |
15a7d175 | 1755 | token->type = OP_CLOSE_SUBEXP; |
3b0bdc72 UD |
1756 | break; |
1757 | case '[': | |
1758 | token->type = OP_OPEN_BRACKET; | |
1759 | break; | |
1760 | case '.': | |
1761 | token->type = OP_PERIOD; | |
1762 | break; | |
1763 | case '^': | |
134abcb5 | 1764 | if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && |
15a7d175 UD |
1765 | re_string_cur_idx (input) != 0) |
1766 | { | |
1767 | char prev = re_string_peek_byte (input, -1); | |
134abcb5 | 1768 | if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') |
15a7d175 UD |
1769 | break; |
1770 | } | |
3b0bdc72 UD |
1771 | token->type = ANCHOR; |
1772 | token->opr.idx = LINE_FIRST; | |
1773 | break; | |
1774 | case '$': | |
1775 | if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && | |
15a7d175 UD |
1776 | re_string_cur_idx (input) + 1 != re_string_length (input)) |
1777 | { | |
1778 | re_token_t next; | |
1779 | re_string_skip_bytes (input, 1); | |
1780 | peek_token (&next, input, syntax); | |
1781 | re_string_skip_bytes (input, -1); | |
1782 | if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) | |
1783 | break; | |
1784 | } | |
3b0bdc72 UD |
1785 | token->type = ANCHOR; |
1786 | token->opr.idx = LINE_LAST; | |
1787 | break; | |
1788 | default: | |
1789 | break; | |
1790 | } | |
1791 | return 1; | |
1792 | } | |
1793 | ||
1794 | /* Peek a token from INPUT, and return the length of the token. | |
1795 | We must not use this function out of bracket expressions. */ | |
1796 | ||
1797 | static int | |
1798 | peek_token_bracket (token, input, syntax) | |
1799 | re_token_t *token; | |
1800 | re_string_t *input; | |
1801 | reg_syntax_t syntax; | |
1802 | { | |
1803 | unsigned char c; | |
1804 | if (re_string_eoi (input)) | |
1805 | { | |
1806 | token->type = END_OF_RE; | |
1807 | return 0; | |
1808 | } | |
1809 | c = re_string_peek_byte (input, 0); | |
1810 | token->opr.c = c; | |
1811 | ||
1812 | #ifdef RE_ENABLE_I18N | |
3c0fb574 | 1813 | if (input->mb_cur_max > 1 && |
3b0bdc72 UD |
1814 | !re_string_first_byte (input, re_string_cur_idx (input))) |
1815 | { | |
1816 | token->type = CHARACTER; | |
1817 | return 1; | |
1818 | } | |
1819 | #endif /* RE_ENABLE_I18N */ | |
1820 | ||
1821 | if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)) | |
1822 | { | |
1823 | /* In this case, '\' escape a character. */ | |
1824 | unsigned char c2; | |
82bbb29e UD |
1825 | re_string_skip_bytes (input, 1); |
1826 | c2 = re_string_peek_byte (input, 0); | |
3b0bdc72 UD |
1827 | token->opr.c = c2; |
1828 | token->type = CHARACTER; | |
1829 | return 1; | |
1830 | } | |
1831 | if (c == '[') /* '[' is a special char in a bracket exps. */ | |
1832 | { | |
1833 | unsigned char c2; | |
1834 | int token_len; | |
1835 | c2 = re_string_peek_byte (input, 1); | |
1836 | token->opr.c = c2; | |
1837 | token_len = 2; | |
1838 | switch (c2) | |
15a7d175 UD |
1839 | { |
1840 | case '.': | |
1841 | token->type = OP_OPEN_COLL_ELEM; | |
1842 | break; | |
1843 | case '=': | |
1844 | token->type = OP_OPEN_EQUIV_CLASS; | |
1845 | break; | |
1846 | case ':': | |
1847 | if (syntax & RE_CHAR_CLASSES) | |
1848 | { | |
1849 | token->type = OP_OPEN_CHAR_CLASS; | |
1850 | break; | |
1851 | } | |
1852 | /* else fall through. */ | |
1853 | default: | |
1854 | token->type = CHARACTER; | |
1855 | token->opr.c = c; | |
1856 | token_len = 1; | |
1857 | break; | |
1858 | } | |
3b0bdc72 UD |
1859 | return token_len; |
1860 | } | |
1861 | switch (c) | |
1862 | { | |
1863 | case '-': | |
1864 | token->type = OP_CHARSET_RANGE; | |
1865 | break; | |
1866 | case ']': | |
1867 | token->type = OP_CLOSE_BRACKET; | |
1868 | break; | |
1869 | case '^': | |
1870 | token->type = OP_NON_MATCH_LIST; | |
1871 | break; | |
1872 | default: | |
1873 | token->type = CHARACTER; | |
1874 | } | |
1875 | return 1; | |
1876 | } | |
1877 | \f | |
1878 | /* Functions for parser. */ | |
1879 | ||
1880 | /* Entry point of the parser. | |
1881 | Parse the regular expression REGEXP and return the structure tree. | |
1882 | If an error is occured, ERR is set by error code, and return NULL. | |
1883 | This function build the following tree, from regular expression <reg_exp>: | |
15a7d175 UD |
1884 | CAT |
1885 | / \ | |
1886 | / \ | |
3b0bdc72 UD |
1887 | <reg_exp> EOR |
1888 | ||
1889 | CAT means concatenation. | |
1890 | EOR means end of regular expression. */ | |
1891 | ||
1892 | static bin_tree_t * | |
1893 | parse (regexp, preg, syntax, err) | |
1894 | re_string_t *regexp; | |
1895 | regex_t *preg; | |
1896 | reg_syntax_t syntax; | |
1897 | reg_errcode_t *err; | |
1898 | { | |
1899 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | |
1900 | bin_tree_t *tree, *eor, *root; | |
1901 | re_token_t current_token; | |
1902 | int new_idx; | |
134abcb5 | 1903 | current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE); |
3b0bdc72 | 1904 | tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); |
bc15410e | 1905 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
3b0bdc72 UD |
1906 | return NULL; |
1907 | new_idx = re_dfa_add_node (dfa, current_token, 0); | |
1908 | eor = create_tree (NULL, NULL, 0, new_idx); | |
1909 | if (tree != NULL) | |
1910 | root = create_tree (tree, eor, CONCAT, 0); | |
1911 | else | |
1912 | root = eor; | |
bc15410e | 1913 | if (BE (new_idx == -1 || eor == NULL || root == NULL, 0)) |
485d775d UD |
1914 | { |
1915 | *err = REG_ESPACE; | |
1916 | return NULL; | |
1917 | } | |
3b0bdc72 UD |
1918 | return root; |
1919 | } | |
1920 | ||
1921 | /* This function build the following tree, from regular expression | |
1922 | <branch1>|<branch2>: | |
15a7d175 UD |
1923 | ALT |
1924 | / \ | |
1925 | / \ | |
3b0bdc72 UD |
1926 | <branch1> <branch2> |
1927 | ||
1928 | ALT means alternative, which represents the operator `|'. */ | |
1929 | ||
1930 | static bin_tree_t * | |
1931 | parse_reg_exp (regexp, preg, token, syntax, nest, err) | |
1932 | re_string_t *regexp; | |
1933 | regex_t *preg; | |
1934 | re_token_t *token; | |
1935 | reg_syntax_t syntax; | |
1936 | int nest; | |
1937 | reg_errcode_t *err; | |
1938 | { | |
1939 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | |
1940 | bin_tree_t *tree, *branch = NULL; | |
1941 | int new_idx; | |
1942 | tree = parse_branch (regexp, preg, token, syntax, nest, err); | |
bc15410e | 1943 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
3b0bdc72 UD |
1944 | return NULL; |
1945 | ||
1946 | while (token->type == OP_ALT) | |
1947 | { | |
1948 | re_token_t alt_token = *token; | |
1949 | new_idx = re_dfa_add_node (dfa, alt_token, 0); | |
134abcb5 | 1950 | *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE); |
3b0bdc72 | 1951 | if (token->type != OP_ALT && token->type != END_OF_RE |
15a7d175 UD |
1952 | && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) |
1953 | { | |
1954 | branch = parse_branch (regexp, preg, token, syntax, nest, err); | |
1955 | if (BE (*err != REG_NOERROR && branch == NULL, 0)) | |
1956 | { | |
1957 | free_bin_tree (tree); | |
1958 | return NULL; | |
1959 | } | |
1960 | } | |
9b88fc16 UD |
1961 | else |
1962 | branch = NULL; | |
3b0bdc72 | 1963 | tree = create_tree (tree, branch, 0, new_idx); |
bc15410e | 1964 | if (BE (new_idx == -1 || tree == NULL, 0)) |
15a7d175 UD |
1965 | { |
1966 | *err = REG_ESPACE; | |
1967 | return NULL; | |
1968 | } | |
0742e48e | 1969 | dfa->has_plural_match = 1; |
3b0bdc72 UD |
1970 | } |
1971 | return tree; | |
1972 | } | |
1973 | ||
1974 | /* This function build the following tree, from regular expression | |
1975 | <exp1><exp2>: | |
15a7d175 UD |
1976 | CAT |
1977 | / \ | |
3b0bdc72 UD |
1978 | / \ |
1979 | <exp1> <exp2> | |
1980 | ||
1981 | CAT means concatenation. */ | |
1982 | ||
1983 | static bin_tree_t * | |
1984 | parse_branch (regexp, preg, token, syntax, nest, err) | |
1985 | re_string_t *regexp; | |
1986 | regex_t *preg; | |
1987 | re_token_t *token; | |
1988 | reg_syntax_t syntax; | |
1989 | int nest; | |
1990 | reg_errcode_t *err; | |
1991 | { | |
1992 | bin_tree_t *tree, *exp; | |
1993 | tree = parse_expression (regexp, preg, token, syntax, nest, err); | |
bc15410e | 1994 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
3b0bdc72 UD |
1995 | return NULL; |
1996 | ||
1997 | while (token->type != OP_ALT && token->type != END_OF_RE | |
15a7d175 | 1998 | && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) |
3b0bdc72 UD |
1999 | { |
2000 | exp = parse_expression (regexp, preg, token, syntax, nest, err); | |
bc15410e | 2001 | if (BE (*err != REG_NOERROR && exp == NULL, 0)) |
15a7d175 UD |
2002 | { |
2003 | free_bin_tree (tree); | |
2004 | return NULL; | |
2005 | } | |
3b0bdc72 | 2006 | if (tree != NULL && exp != NULL) |
15a7d175 UD |
2007 | { |
2008 | tree = create_tree (tree, exp, CONCAT, 0); | |
2009 | if (tree == NULL) | |
2010 | { | |
2011 | *err = REG_ESPACE; | |
2012 | return NULL; | |
2013 | } | |
2014 | } | |
3b0bdc72 | 2015 | else if (tree == NULL) |
15a7d175 | 2016 | tree = exp; |
3b0bdc72 UD |
2017 | /* Otherwise exp == NULL, we don't need to create new tree. */ |
2018 | } | |
2019 | return tree; | |
2020 | } | |
2021 | ||
2022 | /* This function build the following tree, from regular expression a*: | |
15a7d175 UD |
2023 | * |
2024 | | | |
2025 | a | |
3b0bdc72 UD |
2026 | */ |
2027 | ||
2028 | static bin_tree_t * | |
2029 | parse_expression (regexp, preg, token, syntax, nest, err) | |
2030 | re_string_t *regexp; | |
2031 | regex_t *preg; | |
2032 | re_token_t *token; | |
2033 | reg_syntax_t syntax; | |
2034 | int nest; | |
2035 | reg_errcode_t *err; | |
2036 | { | |
2037 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | |
2038 | bin_tree_t *tree; | |
2039 | int new_idx; | |
2040 | switch (token->type) | |
2041 | { | |
2042 | case CHARACTER: | |
2043 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2044 | tree = create_tree (NULL, NULL, 0, new_idx); | |
bc15410e | 2045 | if (BE (new_idx == -1 || tree == NULL, 0)) |
15a7d175 UD |
2046 | { |
2047 | *err = REG_ESPACE; | |
2048 | return NULL; | |
2049 | } | |
3b0bdc72 | 2050 | #ifdef RE_ENABLE_I18N |
3c0fb574 | 2051 | if (dfa->mb_cur_max > 1) |
3b0bdc72 UD |
2052 | { |
2053 | while (!re_string_eoi (regexp) | |
2054 | && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) | |
2055 | { | |
15a7d175 UD |
2056 | bin_tree_t *mbc_remain; |
2057 | *token = fetch_token (regexp, syntax); | |
2058 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2059 | mbc_remain = create_tree (NULL, NULL, 0, new_idx); | |
2060 | tree = create_tree (tree, mbc_remain, CONCAT, 0); | |
2061 | if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0)) | |
54e1cabc UD |
2062 | { |
2063 | *err = REG_ESPACE; | |
2064 | return NULL; | |
2065 | } | |
15a7d175 | 2066 | } |
3b0bdc72 UD |
2067 | } |
2068 | #endif | |
2069 | break; | |
2070 | case OP_OPEN_SUBEXP: | |
2071 | tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); | |
bc15410e | 2072 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
15a7d175 | 2073 | return NULL; |
3b0bdc72 UD |
2074 | break; |
2075 | case OP_OPEN_BRACKET: | |
2076 | tree = parse_bracket_exp (regexp, dfa, token, syntax, err); | |
bc15410e | 2077 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
15a7d175 | 2078 | return NULL; |
3b0bdc72 UD |
2079 | break; |
2080 | case OP_BACK_REF: | |
bc15410e | 2081 | if (BE (preg->re_nsub < token->opr.idx |
15a7d175 UD |
2082 | || dfa->subexps[token->opr.idx - 1].end == -1, 0)) |
2083 | { | |
2084 | *err = REG_ESUBREG; | |
2085 | return NULL; | |
2086 | } | |
6291ee3c | 2087 | dfa->used_bkref_map |= 1 << (token->opr.idx - 1); |
3b0bdc72 UD |
2088 | new_idx = re_dfa_add_node (dfa, *token, 0); |
2089 | tree = create_tree (NULL, NULL, 0, new_idx); | |
bc15410e | 2090 | if (BE (new_idx == -1 || tree == NULL, 0)) |
15a7d175 UD |
2091 | { |
2092 | *err = REG_ESPACE; | |
2093 | return NULL; | |
2094 | } | |
3b0bdc72 UD |
2095 | ++dfa->nbackref; |
2096 | dfa->has_mb_node = 1; | |
2097 | break; | |
06e8303a UD |
2098 | case OP_OPEN_DUP_NUM: |
2099 | if (syntax & RE_CONTEXT_INVALID_DUP) | |
2100 | { | |
2101 | *err = REG_BADRPT; | |
2102 | return NULL; | |
2103 | } | |
2104 | /* FALLTHROUGH */ | |
3b0bdc72 UD |
2105 | case OP_DUP_ASTERISK: |
2106 | case OP_DUP_PLUS: | |
2107 | case OP_DUP_QUESTION: | |
3b0bdc72 | 2108 | if (syntax & RE_CONTEXT_INVALID_OPS) |
15a7d175 UD |
2109 | { |
2110 | *err = REG_BADRPT; | |
2111 | return NULL; | |
2112 | } | |
3b0bdc72 | 2113 | else if (syntax & RE_CONTEXT_INDEP_OPS) |
15a7d175 UD |
2114 | { |
2115 | *token = fetch_token (regexp, syntax); | |
2116 | return parse_expression (regexp, preg, token, syntax, nest, err); | |
2117 | } | |
3b0bdc72 UD |
2118 | /* else fall through */ |
2119 | case OP_CLOSE_SUBEXP: | |
2120 | if ((token->type == OP_CLOSE_SUBEXP) && | |
15a7d175 UD |
2121 | !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) |
2122 | { | |
2123 | *err = REG_ERPAREN; | |
2124 | return NULL; | |
2125 | } | |
3b0bdc72 UD |
2126 | /* else fall through */ |
2127 | case OP_CLOSE_DUP_NUM: | |
2128 | /* We treat it as a normal character. */ | |
2129 | ||
2130 | /* Then we can these characters as normal characters. */ | |
2131 | token->type = CHARACTER; | |
2132 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2133 | tree = create_tree (NULL, NULL, 0, new_idx); | |
bc15410e | 2134 | if (BE (new_idx == -1 || tree == NULL, 0)) |
15a7d175 UD |
2135 | { |
2136 | *err = REG_ESPACE; | |
2137 | return NULL; | |
2138 | } | |
3b0bdc72 UD |
2139 | break; |
2140 | case ANCHOR: | |
2141 | if (dfa->word_char == NULL) | |
15a7d175 UD |
2142 | { |
2143 | *err = init_word_char (dfa); | |
2144 | if (BE (*err != REG_NOERROR, 0)) | |
2145 | return NULL; | |
2146 | } | |
3b0bdc72 | 2147 | if (token->opr.ctx_type == WORD_DELIM) |
15a7d175 UD |
2148 | { |
2149 | bin_tree_t *tree_first, *tree_last; | |
2150 | int idx_first, idx_last; | |
2151 | token->opr.ctx_type = WORD_FIRST; | |
2152 | idx_first = re_dfa_add_node (dfa, *token, 0); | |
2153 | tree_first = create_tree (NULL, NULL, 0, idx_first); | |
2154 | token->opr.ctx_type = WORD_LAST; | |
2155 | idx_last = re_dfa_add_node (dfa, *token, 0); | |
2156 | tree_last = create_tree (NULL, NULL, 0, idx_last); | |
2157 | token->type = OP_ALT; | |
2158 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2159 | tree = create_tree (tree_first, tree_last, 0, new_idx); | |
2160 | if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1 | |
2161 | || tree_first == NULL || tree_last == NULL | |
2162 | || tree == NULL, 0)) | |
2163 | { | |
2164 | *err = REG_ESPACE; | |
2165 | return NULL; | |
2166 | } | |
2167 | } | |
3b0bdc72 | 2168 | else |
15a7d175 UD |
2169 | { |
2170 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2171 | tree = create_tree (NULL, NULL, 0, new_idx); | |
2172 | if (BE (new_idx == -1 || tree == NULL, 0)) | |
54e1cabc UD |
2173 | { |
2174 | *err = REG_ESPACE; | |
2175 | return NULL; | |
2176 | } | |
15a7d175 | 2177 | } |
3b0bdc72 | 2178 | /* We must return here, since ANCHORs can't be followed |
15a7d175 UD |
2179 | by repetition operators. |
2180 | eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", | |
2181 | it must not be "<ANCHOR(^)><REPEAT(*)>". */ | |
3b0bdc72 UD |
2182 | *token = fetch_token (regexp, syntax); |
2183 | return tree; | |
2184 | case OP_PERIOD: | |
2185 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2186 | tree = create_tree (NULL, NULL, 0, new_idx); | |
bc15410e | 2187 | if (BE (new_idx == -1 || tree == NULL, 0)) |
15a7d175 UD |
2188 | { |
2189 | *err = REG_ESPACE; | |
2190 | return NULL; | |
2191 | } | |
3c0fb574 | 2192 | if (dfa->mb_cur_max > 1) |
15a7d175 | 2193 | dfa->has_mb_node = 1; |
3b0bdc72 UD |
2194 | break; |
2195 | case OP_WORD: | |
e2b6bfa3 | 2196 | tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err); |
bc15410e | 2197 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
15a7d175 | 2198 | return NULL; |
3b0bdc72 UD |
2199 | break; |
2200 | case OP_NOTWORD: | |
e2b6bfa3 UD |
2201 | tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err); |
2202 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) | |
2203 | return NULL; | |
2204 | break; | |
2205 | case OP_SPACE: | |
2206 | tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err); | |
2207 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) | |
2208 | return NULL; | |
2209 | break; | |
2210 | case OP_NOTSPACE: | |
2211 | tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err); | |
bc15410e | 2212 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
15a7d175 | 2213 | return NULL; |
3b0bdc72 UD |
2214 | break; |
2215 | case OP_ALT: | |
2216 | case END_OF_RE: | |
2217 | return NULL; | |
2218 | case BACK_SLASH: | |
2219 | *err = REG_EESCAPE; | |
2220 | return NULL; | |
2221 | default: | |
2222 | /* Must not happen? */ | |
2223 | #ifdef DEBUG | |
2224 | assert (0); | |
2225 | #endif | |
2226 | return NULL; | |
2227 | } | |
2228 | *token = fetch_token (regexp, syntax); | |
2229 | ||
2230 | while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS | |
15a7d175 | 2231 | || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) |
3b0bdc72 UD |
2232 | { |
2233 | tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); | |
bc15410e | 2234 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
15a7d175 | 2235 | return NULL; |
c34bfc8d UD |
2236 | /* In BRE consecutive duplications are not allowed. */ |
2237 | if ((syntax & RE_CONTEXT_INVALID_DUP) | |
2238 | && (token->type == OP_DUP_ASTERISK | |
2239 | || token->type == OP_OPEN_DUP_NUM)) | |
2240 | { | |
2241 | *err = REG_BADRPT; | |
2242 | return NULL; | |
2243 | } | |
0742e48e | 2244 | dfa->has_plural_match = 1; |
3b0bdc72 UD |
2245 | } |
2246 | ||
2247 | return tree; | |
2248 | } | |
2249 | ||
2250 | /* This function build the following tree, from regular expression | |
2251 | (<reg_exp>): | |
15a7d175 UD |
2252 | SUBEXP |
2253 | | | |
2254 | <reg_exp> | |
3b0bdc72 UD |
2255 | */ |
2256 | ||
2257 | static bin_tree_t * | |
2258 | parse_sub_exp (regexp, preg, token, syntax, nest, err) | |
2259 | re_string_t *regexp; | |
2260 | regex_t *preg; | |
2261 | re_token_t *token; | |
2262 | reg_syntax_t syntax; | |
2263 | int nest; | |
2264 | reg_errcode_t *err; | |
2265 | { | |
2266 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | |
81c64d40 | 2267 | bin_tree_t *tree, *left_par, *right_par; |
3b0bdc72 | 2268 | size_t cur_nsub; |
81c64d40 | 2269 | int new_idx; |
3b0bdc72 UD |
2270 | cur_nsub = preg->re_nsub++; |
2271 | if (dfa->subexps_alloc < preg->re_nsub) | |
2272 | { | |
2273 | re_subexp_t *new_array; | |
2274 | dfa->subexps_alloc *= 2; | |
2275 | new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc); | |
bc15410e | 2276 | if (BE (new_array == NULL, 0)) |
3b0bdc72 UD |
2277 | { |
2278 | dfa->subexps_alloc /= 2; | |
2279 | *err = REG_ESPACE; | |
2280 | return NULL; | |
2281 | } | |
2282 | dfa->subexps = new_array; | |
2283 | } | |
2284 | dfa->subexps[cur_nsub].start = dfa->nodes_len; | |
2285 | dfa->subexps[cur_nsub].end = -1; | |
81c64d40 UD |
2286 | |
2287 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2288 | left_par = create_tree (NULL, NULL, 0, new_idx); | |
2289 | if (BE (new_idx == -1 || left_par == NULL, 0)) | |
485d775d UD |
2290 | { |
2291 | *err = REG_ESPACE; | |
2292 | return NULL; | |
2293 | } | |
81c64d40 | 2294 | dfa->nodes[new_idx].opr.idx = cur_nsub; |
134abcb5 | 2295 | *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE); |
3b0bdc72 UD |
2296 | |
2297 | /* The subexpression may be a null string. */ | |
2298 | if (token->type == OP_CLOSE_SUBEXP) | |
81c64d40 | 2299 | tree = NULL; |
3b0bdc72 UD |
2300 | else |
2301 | { | |
2302 | tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); | |
bc15410e | 2303 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
15a7d175 | 2304 | return NULL; |
3b0bdc72 | 2305 | } |
81c64d40 UD |
2306 | if (BE (token->type != OP_CLOSE_SUBEXP, 0)) |
2307 | { | |
2308 | free_bin_tree (tree); | |
06e8303a | 2309 | *err = REG_EPAREN; |
81c64d40 UD |
2310 | return NULL; |
2311 | } | |
2312 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2313 | dfa->subexps[cur_nsub].end = dfa->nodes_len; | |
2314 | right_par = create_tree (NULL, NULL, 0, new_idx); | |
2315 | tree = ((tree == NULL) ? right_par | |
15a7d175 | 2316 | : create_tree (tree, right_par, CONCAT, 0)); |
81c64d40 UD |
2317 | tree = create_tree (left_par, tree, CONCAT, 0); |
2318 | if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0)) | |
485d775d UD |
2319 | { |
2320 | *err = REG_ESPACE; | |
2321 | return NULL; | |
2322 | } | |
81c64d40 UD |
2323 | dfa->nodes[new_idx].opr.idx = cur_nsub; |
2324 | ||
3b0bdc72 UD |
2325 | return tree; |
2326 | } | |
2327 | ||
2328 | /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ | |
2329 | ||
2330 | static bin_tree_t * | |
2331 | parse_dup_op (dup_elem, regexp, dfa, token, syntax, err) | |
2332 | bin_tree_t *dup_elem; | |
2333 | re_string_t *regexp; | |
2334 | re_dfa_t *dfa; | |
2335 | re_token_t *token; | |
2336 | reg_syntax_t syntax; | |
2337 | reg_errcode_t *err; | |
2338 | { | |
2339 | re_token_t dup_token; | |
2340 | bin_tree_t *tree = dup_elem, *work_tree; | |
2341 | int new_idx, start_idx = re_string_cur_idx (regexp); | |
2342 | re_token_t start_token = *token; | |
2343 | if (token->type == OP_OPEN_DUP_NUM) | |
2344 | { | |
602c2f9d UD |
2345 | int i; |
2346 | int end = 0; | |
2347 | int start = fetch_number (regexp, token, syntax); | |
3b0bdc72 UD |
2348 | bin_tree_t *elem; |
2349 | if (start == -1) | |
15a7d175 UD |
2350 | { |
2351 | if (token->type == CHARACTER && token->opr.c == ',') | |
2352 | start = 0; /* We treat "{,m}" as "{0,m}". */ | |
2353 | else | |
2354 | { | |
2355 | *err = REG_BADBR; /* <re>{} is invalid. */ | |
2356 | return NULL; | |
2357 | } | |
2358 | } | |
602c2f9d | 2359 | if (BE (start != -2, 1)) |
15a7d175 UD |
2360 | { |
2361 | /* We treat "{n}" as "{n,n}". */ | |
2362 | end = ((token->type == OP_CLOSE_DUP_NUM) ? start | |
2363 | : ((token->type == CHARACTER && token->opr.c == ',') | |
2364 | ? fetch_number (regexp, token, syntax) : -2)); | |
2365 | } | |
602c2f9d | 2366 | if (BE (start == -2 || end == -2, 0)) |
15a7d175 UD |
2367 | { |
2368 | /* Invalid sequence. */ | |
2369 | if (token->type == OP_CLOSE_DUP_NUM) | |
2370 | goto parse_dup_op_invalid_interval; | |
2371 | else | |
2372 | goto parse_dup_op_ebrace; | |
2373 | } | |
602c2f9d | 2374 | if (BE (start == 0 && end == 0, 0)) |
15a7d175 UD |
2375 | { |
2376 | /* We treat "<re>{0}" and "<re>{0,0}" as null string. */ | |
2377 | *token = fetch_token (regexp, syntax); | |
2378 | free_bin_tree (dup_elem); | |
2379 | return NULL; | |
2380 | } | |
602c2f9d | 2381 | |
3b0bdc72 UD |
2382 | /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ |
2383 | elem = tree; | |
2384 | for (i = 0; i < start; ++i) | |
15a7d175 UD |
2385 | if (i != 0) |
2386 | { | |
2387 | work_tree = duplicate_tree (elem, dfa); | |
2388 | tree = create_tree (tree, work_tree, CONCAT, 0); | |
2389 | if (BE (work_tree == NULL || tree == NULL, 0)) | |
2390 | goto parse_dup_op_espace; | |
2391 | } | |
3b0bdc72 UD |
2392 | |
2393 | if (end == -1) | |
15a7d175 UD |
2394 | { |
2395 | /* We treat "<re>{0,}" as "<re>*". */ | |
2396 | dup_token.type = OP_DUP_ASTERISK; | |
2397 | if (start > 0) | |
2398 | { | |
2399 | elem = duplicate_tree (elem, dfa); | |
2400 | new_idx = re_dfa_add_node (dfa, dup_token, 0); | |
2401 | work_tree = create_tree (elem, NULL, 0, new_idx); | |
2402 | tree = create_tree (tree, work_tree, CONCAT, 0); | |
2403 | if (BE (elem == NULL || new_idx == -1 || work_tree == NULL | |
2404 | || tree == NULL, 0)) | |
2405 | goto parse_dup_op_espace; | |
2406 | } | |
2407 | else | |
2408 | { | |
2409 | new_idx = re_dfa_add_node (dfa, dup_token, 0); | |
2410 | tree = create_tree (elem, NULL, 0, new_idx); | |
2411 | if (BE (new_idx == -1 || tree == NULL, 0)) | |
2412 | goto parse_dup_op_espace; | |
2413 | } | |
2414 | } | |
deeb4aae UD |
2415 | else if (BE (start > end, 0)) |
2416 | { | |
2417 | /* First number greater than first. */ | |
2418 | *err = REG_BADBR; | |
2419 | return NULL; | |
2420 | } | |
3b0bdc72 | 2421 | else if (end - start > 0) |
15a7d175 UD |
2422 | { |
2423 | /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */ | |
2424 | dup_token.type = OP_DUP_QUESTION; | |
2425 | if (start > 0) | |
2426 | { | |
2427 | elem = duplicate_tree (elem, dfa); | |
2428 | new_idx = re_dfa_add_node (dfa, dup_token, 0); | |
2429 | elem = create_tree (elem, NULL, 0, new_idx); | |
2430 | tree = create_tree (tree, elem, CONCAT, 0); | |
2431 | if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0)) | |
2432 | goto parse_dup_op_espace; | |
2433 | } | |
2434 | else | |
2435 | { | |
2436 | new_idx = re_dfa_add_node (dfa, dup_token, 0); | |
2437 | tree = elem = create_tree (elem, NULL, 0, new_idx); | |
2438 | if (BE (new_idx == -1 || tree == NULL, 0)) | |
2439 | goto parse_dup_op_espace; | |
2440 | } | |
2441 | for (i = 1; i < end - start; ++i) | |
2442 | { | |
2443 | work_tree = duplicate_tree (elem, dfa); | |
2444 | tree = create_tree (tree, work_tree, CONCAT, 0); | |
2445 | if (BE (work_tree == NULL || tree == NULL, 0)) | |
2446 | { | |
2447 | *err = REG_ESPACE; | |
2448 | return NULL; | |
2449 | } | |
2450 | } | |
2451 | } | |
3b0bdc72 UD |
2452 | } |
2453 | else | |
2454 | { | |
2455 | new_idx = re_dfa_add_node (dfa, *token, 0); | |
2456 | tree = create_tree (tree, NULL, 0, new_idx); | |
bc15410e | 2457 | if (BE (new_idx == -1 || tree == NULL, 0)) |
15a7d175 UD |
2458 | { |
2459 | *err = REG_ESPACE; | |
2460 | return NULL; | |
2461 | } | |
3b0bdc72 UD |
2462 | } |
2463 | *token = fetch_token (regexp, syntax); | |
2464 | return tree; | |
2465 | ||
2466 | parse_dup_op_espace: | |
2467 | free_bin_tree (tree); | |
2468 | *err = REG_ESPACE; | |
2469 | return NULL; | |
2470 | ||
602c2f9d | 2471 | parse_dup_op_ebrace: |
bc15410e | 2472 | if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) |
3b0bdc72 UD |
2473 | { |
2474 | *err = REG_EBRACE; | |
2475 | return NULL; | |
2476 | } | |
602c2f9d UD |
2477 | goto parse_dup_op_rollback; |
2478 | parse_dup_op_invalid_interval: | |
2479 | if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) | |
2480 | { | |
2481 | *err = REG_BADBR; | |
2482 | return NULL; | |
2483 | } | |
2484 | parse_dup_op_rollback: | |
3b0bdc72 UD |
2485 | re_string_set_index (regexp, start_idx); |
2486 | *token = start_token; | |
2487 | token->type = CHARACTER; | |
2488 | return dup_elem; | |
2489 | } | |
2490 | ||
2491 | /* Size of the names for collating symbol/equivalence_class/character_class. | |
2492 | I'm not sure, but maybe enough. */ | |
2493 | #define BRACKET_NAME_BUF_SIZE 32 | |
2494 | ||
434d3784 UD |
2495 | #ifndef _LIBC |
2496 | /* Local function for parse_bracket_exp only used in case of NOT _LIBC. | |
2497 | Build the range expression which starts from START_ELEM, and ends | |
2498 | at END_ELEM. The result are written to MBCSET and SBCSET. | |
2499 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and | |
2500 | mbcset->range_ends, is a pointer argument sinse we may | |
2501 | update it. */ | |
2502 | ||
2503 | static reg_errcode_t | |
c0a0f9a3 UD |
2504 | # ifdef RE_ENABLE_I18N |
2505 | build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) | |
434d3784 | 2506 | re_charset_t *mbcset; |
434d3784 | 2507 | int *range_alloc; |
c0a0f9a3 UD |
2508 | # else /* not RE_ENABLE_I18N */ |
2509 | build_range_exp (sbcset, start_elem, end_elem) | |
2510 | # endif /* not RE_ENABLE_I18N */ | |
2511 | re_bitset_ptr_t sbcset; | |
434d3784 UD |
2512 | bracket_elem_t *start_elem, *end_elem; |
2513 | { | |
2514 | unsigned int start_ch, end_ch; | |
2515 | /* Equivalence Classes and Character Classes can't be a range start/end. */ | |
2516 | if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS | |
15a7d175 UD |
2517 | || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, |
2518 | 0)) | |
434d3784 UD |
2519 | return REG_ERANGE; |
2520 | ||
2521 | /* We can handle no multi character collating elements without libc | |
2522 | support. */ | |
62439eac | 2523 | if (BE ((start_elem->type == COLL_SYM |
15a7d175 UD |
2524 | && strlen ((char *) start_elem->opr.name) > 1) |
2525 | || (end_elem->type == COLL_SYM | |
2526 | && strlen ((char *) end_elem->opr.name) > 1), 0)) | |
434d3784 UD |
2527 | return REG_ECOLLATE; |
2528 | ||
2529 | # ifdef RE_ENABLE_I18N | |
2530 | { | |
2531 | wchar_t wc, start_wc, end_wc; | |
2532 | wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; | |
2533 | ||
2534 | start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch | |
15a7d175 UD |
2535 | : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] |
2536 | : 0)); | |
434d3784 | 2537 | end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch |
15a7d175 UD |
2538 | : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] |
2539 | : 0)); | |
434d3784 | 2540 | start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) |
15a7d175 | 2541 | ? __btowc (start_ch) : start_elem->opr.wch); |
434d3784 | 2542 | end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) |
15a7d175 | 2543 | ? __btowc (end_ch) : end_elem->opr.wch); |
434d3784 UD |
2544 | cmp_buf[0] = start_wc; |
2545 | cmp_buf[4] = end_wc; | |
2546 | if (wcscoll (cmp_buf, cmp_buf + 4) > 0) | |
2547 | return REG_ERANGE; | |
2548 | ||
2549 | /* Check the space of the arrays. */ | |
2550 | if (*range_alloc == mbcset->nranges) | |
2551 | { | |
15a7d175 UD |
2552 | /* There are not enough space, need realloc. */ |
2553 | wchar_t *new_array_start, *new_array_end; | |
2554 | int new_nranges; | |
2555 | ||
2556 | /* +1 in case of mbcset->nranges is 0. */ | |
2557 | new_nranges = 2 * mbcset->nranges + 1; | |
2558 | /* Use realloc since mbcset->range_starts and mbcset->range_ends | |
2559 | are NULL if *range_alloc == 0. */ | |
2560 | new_array_start = re_realloc (mbcset->range_starts, wchar_t, | |
2561 | new_nranges); | |
2562 | new_array_end = re_realloc (mbcset->range_ends, wchar_t, | |
2563 | new_nranges); | |
2564 | ||
2565 | if (BE (new_array_start == NULL || new_array_end == NULL, 0)) | |
2566 | return REG_ESPACE; | |
2567 | ||
2568 | mbcset->range_starts = new_array_start; | |
2569 | mbcset->range_ends = new_array_end; | |
2570 | *range_alloc = new_nranges; | |
434d3784 UD |
2571 | } |
2572 | ||
2573 | mbcset->range_starts[mbcset->nranges] = start_wc; | |
2574 | mbcset->range_ends[mbcset->nranges++] = end_wc; | |
2575 | ||
2576 | /* Build the table for single byte characters. */ | |
2577 | for (wc = 0; wc <= SBC_MAX; ++wc) | |
2578 | { | |
15a7d175 UD |
2579 | cmp_buf[2] = wc; |
2580 | if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 | |
2581 | && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) | |
2582 | bitset_set (sbcset, wc); | |
434d3784 UD |
2583 | } |
2584 | } | |
2585 | # else /* not RE_ENABLE_I18N */ | |
2586 | { | |
2587 | unsigned int ch; | |
2588 | start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch | |
15a7d175 UD |
2589 | : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] |
2590 | : 0)); | |
434d3784 | 2591 | end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch |
15a7d175 UD |
2592 | : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] |
2593 | : 0)); | |
434d3784 UD |
2594 | if (start_ch > end_ch) |
2595 | return REG_ERANGE; | |
2596 | /* Build the table for single byte characters. */ | |
2597 | for (ch = 0; ch <= SBC_MAX; ++ch) | |
2598 | if (start_ch <= ch && ch <= end_ch) | |
15a7d175 | 2599 | bitset_set (sbcset, ch); |
434d3784 UD |
2600 | } |
2601 | # endif /* not RE_ENABLE_I18N */ | |
2602 | return REG_NOERROR; | |
2603 | } | |
2604 | #endif /* not _LIBC */ | |
2605 | ||
2606 | #ifndef _LIBC | |
2607 | /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. | |
2608 | Build the collating element which is represented by NAME. | |
2609 | The result are written to MBCSET and SBCSET. | |
2610 | COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a | |
2611 | pointer argument since we may update it. */ | |
2612 | ||
2613 | static reg_errcode_t | |
c0a0f9a3 UD |
2614 | # ifdef RE_ENABLE_I18N |
2615 | build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) | |
434d3784 | 2616 | re_charset_t *mbcset; |
434d3784 | 2617 | int *coll_sym_alloc; |
c0a0f9a3 UD |
2618 | # else /* not RE_ENABLE_I18N */ |
2619 | build_collating_symbol (sbcset, name) | |
2620 | # endif /* not RE_ENABLE_I18N */ | |
2621 | re_bitset_ptr_t sbcset; | |
62439eac | 2622 | const unsigned char *name; |
434d3784 | 2623 | { |
62439eac UD |
2624 | size_t name_len = strlen ((const char *) name); |
2625 | if (BE (name_len != 1, 0)) | |
434d3784 UD |
2626 | return REG_ECOLLATE; |
2627 | else | |
2628 | { | |
62439eac | 2629 | bitset_set (sbcset, name[0]); |
434d3784 UD |
2630 | return REG_NOERROR; |
2631 | } | |
2632 | } | |
2633 | #endif /* not _LIBC */ | |
2634 | ||
3b0bdc72 UD |
2635 | /* This function parse bracket expression like "[abc]", "[a-c]", |
2636 | "[[.a-a.]]" etc. */ | |
2637 | ||
2638 | static bin_tree_t * | |
2639 | parse_bracket_exp (regexp, dfa, token, syntax, err) | |
2640 | re_string_t *regexp; | |
2641 | re_dfa_t *dfa; | |
2642 | re_token_t *token; | |
2643 | reg_syntax_t syntax; | |
2644 | reg_errcode_t *err; | |
2645 | { | |
2646 | #ifdef _LIBC | |
62439eac UD |
2647 | const unsigned char *collseqmb; |
2648 | const char *collseqwc; | |
3b0bdc72 UD |
2649 | uint32_t nrules; |
2650 | int32_t table_size; | |
2651 | const int32_t *symb_table; | |
2652 | const unsigned char *extra; | |
2653 | ||
434d3784 | 2654 | /* Local function for parse_bracket_exp used in _LIBC environement. |
3b0bdc72 UD |
2655 | Seek the collating symbol entry correspondings to NAME. |
2656 | Return the index of the symbol in the SYMB_TABLE. */ | |
2657 | ||
2658 | static inline int32_t | |
25337753 | 2659 | __attribute ((always_inline)) |
3b0bdc72 | 2660 | seek_collating_symbol_entry (name, name_len) |
15a7d175 UD |
2661 | const unsigned char *name; |
2662 | size_t name_len; | |
3b0bdc72 | 2663 | { |
62439eac | 2664 | int32_t hash = elem_hash ((const char *) name, name_len); |
3b0bdc72 UD |
2665 | int32_t elem = hash % table_size; |
2666 | int32_t second = hash % (table_size - 2); | |
2667 | while (symb_table[2 * elem] != 0) | |
15a7d175 UD |
2668 | { |
2669 | /* First compare the hashing value. */ | |
2670 | if (symb_table[2 * elem] == hash | |
2671 | /* Compare the length of the name. */ | |
2672 | && name_len == extra[symb_table[2 * elem + 1]] | |
2673 | /* Compare the name. */ | |
2674 | && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], | |
2675 | name_len) == 0) | |
2676 | { | |
2677 | /* Yep, this is the entry. */ | |
2678 | break; | |
2679 | } | |
2680 | ||
2681 | /* Next entry. */ | |
2682 | elem += second; | |
2683 | } | |
3b0bdc72 UD |
2684 | return elem; |
2685 | } | |
2686 | ||
434d3784 | 2687 | /* Local function for parse_bracket_exp used in _LIBC environement. |
3b0bdc72 UD |
2688 | Look up the collation sequence value of BR_ELEM. |
2689 | Return the value if succeeded, UINT_MAX otherwise. */ | |
2690 | ||
2691 | static inline unsigned int | |
25337753 | 2692 | __attribute ((always_inline)) |
3b0bdc72 | 2693 | lookup_collation_sequence_value (br_elem) |
15a7d175 | 2694 | bracket_elem_t *br_elem; |
3b0bdc72 UD |
2695 | { |
2696 | if (br_elem->type == SB_CHAR) | |
15a7d175 UD |
2697 | { |
2698 | /* | |
2699 | if (MB_CUR_MAX == 1) | |
2700 | */ | |
2701 | if (nrules == 0) | |
2702 | return collseqmb[br_elem->opr.ch]; | |
2703 | else | |
2704 | { | |
2705 | wint_t wc = __btowc (br_elem->opr.ch); | |
25337753 | 2706 | return __collseq_table_lookup (collseqwc, wc); |
15a7d175 UD |
2707 | } |
2708 | } | |
3b0bdc72 | 2709 | else if (br_elem->type == MB_CHAR) |
15a7d175 | 2710 | { |
25337753 | 2711 | return __collseq_table_lookup (collseqwc, br_elem->opr.wch); |
15a7d175 | 2712 | } |
3b0bdc72 | 2713 | else if (br_elem->type == COLL_SYM) |
15a7d175 UD |
2714 | { |
2715 | size_t sym_name_len = strlen ((char *) br_elem->opr.name); | |
2716 | if (nrules != 0) | |
2717 | { | |
2718 | int32_t elem, idx; | |
2719 | elem = seek_collating_symbol_entry (br_elem->opr.name, | |
2720 | sym_name_len); | |
2721 | if (symb_table[2 * elem] != 0) | |
2722 | { | |
2723 | /* We found the entry. */ | |
2724 | idx = symb_table[2 * elem + 1]; | |
2725 | /* Skip the name of collating element name. */ | |
2726 | idx += 1 + extra[idx]; | |
2727 | /* Skip the byte sequence of the collating element. */ | |
2728 | idx += 1 + extra[idx]; | |
2729 | /* Adjust for the alignment. */ | |
2730 | idx = (idx + 3) & ~3; | |
2731 | /* Skip the multibyte collation sequence value. */ | |
2732 | idx += sizeof (unsigned int); | |
2733 | /* Skip the wide char sequence of the collating element. */ | |
2734 | idx += sizeof (unsigned int) * | |
2735 | (1 + *(unsigned int *) (extra + idx)); | |
2736 | /* Return the collation sequence value. */ | |
2737 | return *(unsigned int *) (extra + idx); | |
2738 | } | |
2739 | else if (symb_table[2 * elem] == 0 && sym_name_len == 1) | |
2740 | { | |
2741 | /* No valid character. Match it as a single byte | |
2742 | character. */ | |
2743 | return collseqmb[br_elem->opr.name[0]]; | |
2744 | } | |
2745 | } | |
2746 | else if (sym_name_len == 1) | |
2747 | return collseqmb[br_elem->opr.name[0]]; | |
2748 | } | |
3b0bdc72 UD |
2749 | return UINT_MAX; |
2750 | } | |
2751 | ||
434d3784 | 2752 | /* Local function for parse_bracket_exp used in _LIBC environement. |
3b0bdc72 UD |
2753 | Build the range expression which starts from START_ELEM, and ends |
2754 | at END_ELEM. The result are written to MBCSET and SBCSET. | |
2755 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and | |
2756 | mbcset->range_ends, is a pointer argument sinse we may | |
2757 | update it. */ | |
2758 | ||
2759 | static inline reg_errcode_t | |
25337753 | 2760 | __attribute ((always_inline)) |
c0a0f9a3 UD |
2761 | # ifdef RE_ENABLE_I18N |
2762 | build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) | |
15a7d175 UD |
2763 | re_charset_t *mbcset; |
2764 | int *range_alloc; | |
c0a0f9a3 UD |
2765 | # else /* not RE_ENABLE_I18N */ |
2766 | build_range_exp (sbcset, start_elem, end_elem) | |
2767 | # endif /* not RE_ENABLE_I18N */ | |
15a7d175 UD |
2768 | re_bitset_ptr_t sbcset; |
2769 | bracket_elem_t *start_elem, *end_elem; | |
3b0bdc72 UD |
2770 | { |
2771 | unsigned int ch; | |
2772 | uint32_t start_collseq; | |
2773 | uint32_t end_collseq; | |
2774 | ||
c0a0f9a3 | 2775 | # ifdef RE_ENABLE_I18N |
3b0bdc72 UD |
2776 | /* Check the space of the arrays. */ |
2777 | if (*range_alloc == mbcset->nranges) | |
15a7d175 UD |
2778 | { |
2779 | /* There are not enough space, need realloc. */ | |
2780 | uint32_t *new_array_start; | |
2781 | uint32_t *new_array_end; | |
3b0bdc72 UD |
2782 | int new_nranges; |
2783 | ||
15a7d175 UD |
2784 | /* +1 in case of mbcset->nranges is 0. */ |
2785 | new_nranges = 2 * mbcset->nranges + 1; | |
a9388965 | 2786 | /* Use realloc since mbcset->range_starts and mbcset->range_ends |
15a7d175 UD |
2787 | are NULL if *range_alloc == 0. */ |
2788 | new_array_start = re_realloc (mbcset->range_starts, uint32_t, | |
2789 | new_nranges); | |
2790 | new_array_end = re_realloc (mbcset->range_ends, uint32_t, | |
2791 | new_nranges); | |
a9388965 | 2792 | |
15a7d175 UD |
2793 | if (BE (new_array_start == NULL || new_array_end == NULL, 0)) |
2794 | return REG_ESPACE; | |
3b0bdc72 | 2795 | |
15a7d175 UD |
2796 | mbcset->range_starts = new_array_start; |
2797 | mbcset->range_ends = new_array_end; | |
3b0bdc72 | 2798 | *range_alloc = new_nranges; |
15a7d175 | 2799 | } |
c0a0f9a3 | 2800 | # endif /* RE_ENABLE_I18N */ |
3b0bdc72 | 2801 | |
434d3784 | 2802 | /* Equivalence Classes and Character Classes can't be a range |
15a7d175 | 2803 | start/end. */ |
bc15410e | 2804 | if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS |
15a7d175 UD |
2805 | || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, |
2806 | 0)) | |
2807 | return REG_ERANGE; | |
3b0bdc72 UD |
2808 | |
2809 | start_collseq = lookup_collation_sequence_value (start_elem); | |
2810 | end_collseq = lookup_collation_sequence_value (end_elem); | |
2811 | /* Check start/end collation sequence values. */ | |
bc15410e | 2812 | if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) |
15a7d175 | 2813 | return REG_ECOLLATE; |
bc15410e | 2814 | if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) |
15a7d175 | 2815 | return REG_ERANGE; |
3b0bdc72 | 2816 | |
c0a0f9a3 | 2817 | # ifdef RE_ENABLE_I18N |
3b0bdc72 UD |
2818 | /* Got valid collation sequence values, add them as a new entry. */ |
2819 | mbcset->range_starts[mbcset->nranges] = start_collseq; | |
2820 | mbcset->range_ends[mbcset->nranges++] = end_collseq; | |
c0a0f9a3 | 2821 | # endif /* RE_ENABLE_I18N */ |
3b0bdc72 UD |
2822 | |
2823 | /* Build the table for single byte characters. */ | |
2824 | for (ch = 0; ch <= SBC_MAX; ch++) | |
15a7d175 UD |
2825 | { |
2826 | uint32_t ch_collseq; | |
2827 | /* | |
2828 | if (MB_CUR_MAX == 1) | |
2829 | */ | |
2830 | if (nrules == 0) | |
2831 | ch_collseq = collseqmb[ch]; | |
2832 | else | |
25337753 | 2833 | ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); |
15a7d175 UD |
2834 | if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) |
2835 | bitset_set (sbcset, ch); | |
2836 | } | |
3b0bdc72 UD |
2837 | return REG_NOERROR; |
2838 | } | |
3b0bdc72 | 2839 | |
434d3784 | 2840 | /* Local function for parse_bracket_exp used in _LIBC environement. |
3b0bdc72 UD |
2841 | Build the collating element which is represented by NAME. |
2842 | The result are written to MBCSET and SBCSET. | |
2843 | COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a | |
2844 | pointer argument sinse we may update it. */ | |
2845 | ||
2846 | static inline reg_errcode_t | |
25337753 | 2847 | __attribute ((always_inline)) |
c0a0f9a3 UD |
2848 | # ifdef RE_ENABLE_I18N |
2849 | build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) | |
15a7d175 UD |
2850 | re_charset_t *mbcset; |
2851 | int *coll_sym_alloc; | |
c0a0f9a3 UD |
2852 | # else /* not RE_ENABLE_I18N */ |
2853 | build_collating_symbol (sbcset, name) | |
2854 | # endif /* not RE_ENABLE_I18N */ | |
15a7d175 UD |
2855 | re_bitset_ptr_t sbcset; |
2856 | const unsigned char *name; | |
3b0bdc72 | 2857 | { |
3b0bdc72 | 2858 | int32_t elem, idx; |
62439eac | 2859 | size_t name_len = strlen ((const char *) name); |
3b0bdc72 | 2860 | if (nrules != 0) |
15a7d175 UD |
2861 | { |
2862 | elem = seek_collating_symbol_entry (name, name_len); | |
2863 | if (symb_table[2 * elem] != 0) | |
2864 | { | |
2865 | /* We found the entry. */ | |
2866 | idx = symb_table[2 * elem + 1]; | |
2867 | /* Skip the name of collating element name. */ | |
2868 | idx += 1 + extra[idx]; | |
2869 | } | |
2870 | else if (symb_table[2 * elem] == 0 && name_len == 1) | |
2871 | { | |
2872 | /* No valid character, treat it as a normal | |
2873 | character. */ | |
2874 | bitset_set (sbcset, name[0]); | |
2875 | return REG_NOERROR; | |
2876 | } | |
2877 | else | |
2878 | return REG_ECOLLATE; | |
3b0bdc72 | 2879 | |
c0a0f9a3 | 2880 | # ifdef RE_ENABLE_I18N |
15a7d175 UD |
2881 | /* Got valid collation sequence, add it as a new entry. */ |
2882 | /* Check the space of the arrays. */ | |
2883 | if (*coll_sym_alloc == mbcset->ncoll_syms) | |
2884 | { | |
2885 | /* Not enough, realloc it. */ | |
2886 | /* +1 in case of mbcset->ncoll_syms is 0. */ | |
2887 | *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; | |
2888 | /* Use realloc since mbcset->coll_syms is NULL | |
2889 | if *alloc == 0. */ | |
2890 | mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t, | |
2891 | *coll_sym_alloc); | |
2892 | if (BE (mbcset->coll_syms == NULL, 0)) | |
2893 | return REG_ESPACE; | |
2894 | } | |
2895 | mbcset->coll_syms[mbcset->ncoll_syms++] = idx; | |
c0a0f9a3 | 2896 | # endif /* RE_ENABLE_I18N */ |
15a7d175 UD |
2897 | return REG_NOERROR; |
2898 | } | |
3b0bdc72 | 2899 | else |
15a7d175 UD |
2900 | { |
2901 | if (BE (name_len != 1, 0)) | |
2902 | return REG_ECOLLATE; | |
2903 | else | |
2904 | { | |
2905 | bitset_set (sbcset, name[0]); | |
2906 | return REG_NOERROR; | |
2907 | } | |
2908 | } | |
3b0bdc72 | 2909 | } |
434d3784 UD |
2910 | #endif |
2911 | ||
3b0bdc72 UD |
2912 | re_token_t br_token; |
2913 | re_bitset_ptr_t sbcset; | |
c0a0f9a3 | 2914 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 2915 | re_charset_t *mbcset; |
3b0bdc72 UD |
2916 | int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; |
2917 | int equiv_class_alloc = 0, char_class_alloc = 0; | |
c0a0f9a3 UD |
2918 | #else /* not RE_ENABLE_I18N */ |
2919 | int non_match = 0; | |
2920 | #endif /* not RE_ENABLE_I18N */ | |
2921 | bin_tree_t *work_tree; | |
2922 | int token_len, new_idx; | |
3b0bdc72 | 2923 | #ifdef _LIBC |
62439eac UD |
2924 | collseqmb = (const unsigned char *) |
2925 | _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); | |
3b0bdc72 UD |
2926 | nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); |
2927 | if (nrules) | |
2928 | { | |
2929 | /* | |
2930 | if (MB_CUR_MAX > 1) | |
2931 | */ | |
15a7d175 | 2932 | collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); |
3b0bdc72 UD |
2933 | table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); |
2934 | symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, | |
15a7d175 | 2935 | _NL_COLLATE_SYMB_TABLEMB); |
3b0bdc72 | 2936 | extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, |
15a7d175 | 2937 | _NL_COLLATE_SYMB_EXTRAMB); |
3b0bdc72 UD |
2938 | } |
2939 | #endif | |
2940 | sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); | |
c0a0f9a3 | 2941 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 2942 | mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); |
c0a0f9a3 UD |
2943 | #endif /* RE_ENABLE_I18N */ |
2944 | #ifdef RE_ENABLE_I18N | |
bc15410e | 2945 | if (BE (sbcset == NULL || mbcset == NULL, 0)) |
c0a0f9a3 UD |
2946 | #else |
2947 | if (BE (sbcset == NULL, 0)) | |
2948 | #endif /* RE_ENABLE_I18N */ | |
3b0bdc72 UD |
2949 | { |
2950 | *err = REG_ESPACE; | |
2951 | return NULL; | |
2952 | } | |
2953 | ||
2954 | token_len = peek_token_bracket (token, regexp, syntax); | |
bc15410e | 2955 | if (BE (token->type == END_OF_RE, 0)) |
3b0bdc72 | 2956 | { |
3b0bdc72 | 2957 | *err = REG_BADPAT; |
434d3784 | 2958 | goto parse_bracket_exp_free_return; |
3b0bdc72 UD |
2959 | } |
2960 | if (token->type == OP_NON_MATCH_LIST) | |
2961 | { | |
c0a0f9a3 | 2962 | #ifdef RE_ENABLE_I18N |
3b0bdc72 UD |
2963 | int i; |
2964 | mbcset->non_match = 1; | |
c0a0f9a3 UD |
2965 | #else /* not RE_ENABLE_I18N */ |
2966 | non_match = 1; | |
2967 | #endif /* not RE_ENABLE_I18N */ | |
3b0bdc72 | 2968 | if (syntax & RE_HAT_LISTS_NOT_NEWLINE) |
15a7d175 | 2969 | bitset_set (sbcset, '\0'); |
3b0bdc72 UD |
2970 | re_string_skip_bytes (regexp, token_len); /* Skip a token. */ |
2971 | token_len = peek_token_bracket (token, regexp, syntax); | |
bc15410e | 2972 | if (BE (token->type == END_OF_RE, 0)) |
15a7d175 UD |
2973 | { |
2974 | *err = REG_BADPAT; | |
2975 | goto parse_bracket_exp_free_return; | |
2976 | } | |
c0a0f9a3 | 2977 | #ifdef RE_ENABLE_I18N |
3c0fb574 | 2978 | if (dfa->mb_cur_max > 1) |
15a7d175 UD |
2979 | for (i = 0; i < SBC_MAX; ++i) |
2980 | if (__btowc (i) == WEOF) | |
2981 | bitset_set (sbcset, i); | |
c0a0f9a3 | 2982 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 UD |
2983 | } |
2984 | ||
2985 | /* We treat the first ']' as a normal character. */ | |
2986 | if (token->type == OP_CLOSE_BRACKET) | |
2987 | token->type = CHARACTER; | |
2988 | ||
2989 | while (1) | |
2990 | { | |
2991 | bracket_elem_t start_elem, end_elem; | |
62439eac UD |
2992 | unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; |
2993 | unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; | |
3b0bdc72 UD |
2994 | reg_errcode_t ret; |
2995 | int token_len2 = 0, is_range_exp = 0; | |
2996 | re_token_t token2; | |
2997 | ||
2998 | start_elem.opr.name = start_name_buf; | |
2999 | ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, | |
15a7d175 | 3000 | syntax); |
bc15410e | 3001 | if (BE (ret != REG_NOERROR, 0)) |
15a7d175 UD |
3002 | { |
3003 | *err = ret; | |
3004 | goto parse_bracket_exp_free_return; | |
3005 | } | |
3b0bdc72 UD |
3006 | |
3007 | token_len = peek_token_bracket (token, regexp, syntax); | |
bc15410e | 3008 | if (BE (token->type == END_OF_RE, 0)) |
15a7d175 | 3009 | { |
f6953f16 | 3010 | *err = REG_EBRACK; |
15a7d175 UD |
3011 | goto parse_bracket_exp_free_return; |
3012 | } | |
3b0bdc72 | 3013 | if (token->type == OP_CHARSET_RANGE) |
15a7d175 UD |
3014 | { |
3015 | re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ | |
3016 | token_len2 = peek_token_bracket (&token2, regexp, syntax); | |
3017 | if (BE (token->type == END_OF_RE, 0)) | |
3018 | { | |
3019 | *err = REG_BADPAT; | |
3020 | goto parse_bracket_exp_free_return; | |
3021 | } | |
3022 | if (token2.type == OP_CLOSE_BRACKET) | |
3023 | { | |
3024 | /* We treat the last '-' as a normal character. */ | |
3025 | re_string_skip_bytes (regexp, -token_len); | |
3026 | token->type = CHARACTER; | |
3027 | } | |
3028 | else | |
3029 | is_range_exp = 1; | |
3030 | } | |
3b0bdc72 UD |
3031 | |
3032 | if (is_range_exp == 1) | |
15a7d175 UD |
3033 | { |
3034 | end_elem.opr.name = end_name_buf; | |
3035 | ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, | |
3036 | dfa, syntax); | |
3037 | if (BE (ret != REG_NOERROR, 0)) | |
3038 | { | |
3039 | *err = ret; | |
3040 | goto parse_bracket_exp_free_return; | |
3041 | } | |
3042 | ||
3043 | token_len = peek_token_bracket (token, regexp, syntax); | |
3044 | if (BE (token->type == END_OF_RE, 0)) | |
3045 | { | |
3046 | *err = REG_BADPAT; | |
3047 | goto parse_bracket_exp_free_return; | |
3048 | } | |
3049 | *err = build_range_exp (sbcset, | |
c0a0f9a3 | 3050 | #ifdef RE_ENABLE_I18N |
15a7d175 | 3051 | mbcset, &range_alloc, |
c0a0f9a3 | 3052 | #endif /* RE_ENABLE_I18N */ |
15a7d175 UD |
3053 | &start_elem, &end_elem); |
3054 | if (BE (*err != REG_NOERROR, 0)) | |
3055 | goto parse_bracket_exp_free_return; | |
3056 | } | |
3b0bdc72 | 3057 | else |
15a7d175 UD |
3058 | { |
3059 | switch (start_elem.type) | |
3060 | { | |
3061 | case SB_CHAR: | |
3062 | bitset_set (sbcset, start_elem.opr.ch); | |
3063 | break; | |
c0a0f9a3 | 3064 | #ifdef RE_ENABLE_I18N |
15a7d175 UD |
3065 | case MB_CHAR: |
3066 | /* Check whether the array has enough space. */ | |
3067 | if (mbchar_alloc == mbcset->nmbchars) | |
3068 | { | |
3069 | /* Not enough, realloc it. */ | |
3070 | /* +1 in case of mbcset->nmbchars is 0. */ | |
3071 | mbchar_alloc = 2 * mbcset->nmbchars + 1; | |
3072 | /* Use realloc since array is NULL if *alloc == 0. */ | |
3073 | mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t, | |
3074 | mbchar_alloc); | |
3075 | if (BE (mbcset->mbchars == NULL, 0)) | |
3076 | goto parse_bracket_exp_espace; | |
3077 | } | |
3078 | mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; | |
3079 | break; | |
c0a0f9a3 | 3080 | #endif /* RE_ENABLE_I18N */ |
15a7d175 UD |
3081 | case EQUIV_CLASS: |
3082 | *err = build_equiv_class (sbcset, | |
c0a0f9a3 | 3083 | #ifdef RE_ENABLE_I18N |
15a7d175 | 3084 | mbcset, &equiv_class_alloc, |
c0a0f9a3 | 3085 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 | 3086 | start_elem.opr.name); |
15a7d175 UD |
3087 | if (BE (*err != REG_NOERROR, 0)) |
3088 | goto parse_bracket_exp_free_return; | |
3089 | break; | |
3090 | case COLL_SYM: | |
3091 | *err = build_collating_symbol (sbcset, | |
c0a0f9a3 | 3092 | #ifdef RE_ENABLE_I18N |
15a7d175 | 3093 | mbcset, &coll_sym_alloc, |
c0a0f9a3 | 3094 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 | 3095 | start_elem.opr.name); |
15a7d175 UD |
3096 | if (BE (*err != REG_NOERROR, 0)) |
3097 | goto parse_bracket_exp_free_return; | |
3098 | break; | |
3099 | case CHAR_CLASS: | |
66b110e8 | 3100 | *err = build_charclass (regexp->trans, sbcset, |
c0a0f9a3 | 3101 | #ifdef RE_ENABLE_I18N |
7b7b9e70 | 3102 | mbcset, &char_class_alloc, |
c0a0f9a3 | 3103 | #endif /* RE_ENABLE_I18N */ |
7b7b9e70 UD |
3104 | start_elem.opr.name, syntax); |
3105 | if (BE (*err != REG_NOERROR, 0)) | |
3106 | goto parse_bracket_exp_free_return; | |
15a7d175 UD |
3107 | break; |
3108 | default: | |
3109 | assert (0); | |
3110 | break; | |
3111 | } | |
3112 | } | |
3b0bdc72 | 3113 | if (token->type == OP_CLOSE_BRACKET) |
15a7d175 | 3114 | break; |
3b0bdc72 UD |
3115 | } |
3116 | ||
3117 | re_string_skip_bytes (regexp, token_len); /* Skip a token. */ | |
3118 | ||
3119 | /* If it is non-matching list. */ | |
c0a0f9a3 | 3120 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 3121 | if (mbcset->non_match) |
c0a0f9a3 UD |
3122 | #else /* not RE_ENABLE_I18N */ |
3123 | if (non_match) | |
3124 | #endif /* not RE_ENABLE_I18N */ | |
3b0bdc72 UD |
3125 | bitset_not (sbcset); |
3126 | ||
3127 | /* Build a tree for simple bracket. */ | |
3128 | br_token.type = SIMPLE_BRACKET; | |
3129 | br_token.opr.sbcset = sbcset; | |
3130 | new_idx = re_dfa_add_node (dfa, br_token, 0); | |
3131 | work_tree = create_tree (NULL, NULL, 0, new_idx); | |
bc15410e | 3132 | if (BE (new_idx == -1 || work_tree == NULL, 0)) |
3b0bdc72 UD |
3133 | goto parse_bracket_exp_espace; |
3134 | ||
c0a0f9a3 | 3135 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 3136 | if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes |
3c0fb574 UD |
3137 | || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes |
3138 | || mbcset->non_match))) | |
3b0bdc72 UD |
3139 | { |
3140 | re_token_t alt_token; | |
3141 | bin_tree_t *mbc_tree; | |
3142 | /* Build a tree for complex bracket. */ | |
3143 | br_token.type = COMPLEX_BRACKET; | |
3144 | br_token.opr.mbcset = mbcset; | |
3145 | dfa->has_mb_node = 1; | |
3146 | new_idx = re_dfa_add_node (dfa, br_token, 0); | |
3147 | mbc_tree = create_tree (NULL, NULL, 0, new_idx); | |
bc15410e | 3148 | if (BE (new_idx == -1 || mbc_tree == NULL, 0)) |
15a7d175 | 3149 | goto parse_bracket_exp_espace; |
3b0bdc72 | 3150 | /* Then join them by ALT node. */ |
0742e48e | 3151 | dfa->has_plural_match = 1; |
3b0bdc72 UD |
3152 | alt_token.type = OP_ALT; |
3153 | new_idx = re_dfa_add_node (dfa, alt_token, 0); | |
3154 | work_tree = create_tree (work_tree, mbc_tree, 0, new_idx); | |
bc15410e | 3155 | if (BE (new_idx != -1 && mbc_tree != NULL, 1)) |
15a7d175 | 3156 | return work_tree; |
3b0bdc72 UD |
3157 | } |
3158 | else | |
3159 | { | |
3160 | free_charset (mbcset); | |
3161 | return work_tree; | |
3162 | } | |
c0a0f9a3 UD |
3163 | #else /* not RE_ENABLE_I18N */ |
3164 | return work_tree; | |
3165 | #endif /* not RE_ENABLE_I18N */ | |
3b0bdc72 UD |
3166 | |
3167 | parse_bracket_exp_espace: | |
3b0bdc72 | 3168 | *err = REG_ESPACE; |
434d3784 UD |
3169 | parse_bracket_exp_free_return: |
3170 | re_free (sbcset); | |
c0a0f9a3 | 3171 | #ifdef RE_ENABLE_I18N |
434d3784 | 3172 | free_charset (mbcset); |
c0a0f9a3 | 3173 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 UD |
3174 | return NULL; |
3175 | } | |
3176 | ||
434d3784 UD |
3177 | /* Parse an element in the bracket expression. */ |
3178 | ||
3b0bdc72 UD |
3179 | static reg_errcode_t |
3180 | parse_bracket_element (elem, regexp, token, token_len, dfa, syntax) | |
3181 | bracket_elem_t *elem; | |
3182 | re_string_t *regexp; | |
3183 | re_token_t *token; | |
3184 | int token_len; | |
3185 | re_dfa_t *dfa; | |
3186 | reg_syntax_t syntax; | |
3187 | { | |
3188 | #ifdef RE_ENABLE_I18N | |
3189 | int cur_char_size; | |
3190 | cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); | |
3191 | if (cur_char_size > 1) | |
3192 | { | |
3193 | elem->type = MB_CHAR; | |
3194 | elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); | |
3195 | re_string_skip_bytes (regexp, cur_char_size); | |
3196 | return REG_NOERROR; | |
3197 | } | |
3198 | #endif /* RE_ENABLE_I18N */ | |
3199 | re_string_skip_bytes (regexp, token_len); /* Skip a token. */ | |
3200 | if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS | |
3201 | || token->type == OP_OPEN_EQUIV_CLASS) | |
3202 | return parse_bracket_symbol (elem, regexp, token); | |
3203 | elem->type = SB_CHAR; | |
3204 | elem->opr.ch = token->opr.c; | |
3205 | return REG_NOERROR; | |
3206 | } | |
3207 | ||
434d3784 UD |
3208 | /* Parse a bracket symbol in the bracket expression. Bracket symbols are |
3209 | such as [:<character_class>:], [.<collating_element>.], and | |
3210 | [=<equivalent_class>=]. */ | |
3211 | ||
3b0bdc72 UD |
3212 | static reg_errcode_t |
3213 | parse_bracket_symbol (elem, regexp, token) | |
3214 | bracket_elem_t *elem; | |
3215 | re_string_t *regexp; | |
3216 | re_token_t *token; | |
3217 | { | |
3218 | unsigned char ch, delim = token->opr.c; | |
3219 | int i = 0; | |
602c2f9d | 3220 | for (;; ++i) |
3b0bdc72 | 3221 | { |
602c2f9d | 3222 | if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE) |
15a7d175 | 3223 | return REG_EBRACK; |
3b0bdc72 | 3224 | if (token->type == OP_OPEN_CHAR_CLASS) |
15a7d175 | 3225 | ch = re_string_fetch_byte_case (regexp); |
3b0bdc72 | 3226 | else |
15a7d175 | 3227 | ch = re_string_fetch_byte (regexp); |
3b0bdc72 | 3228 | if (ch == delim && re_string_peek_byte (regexp, 0) == ']') |
15a7d175 | 3229 | break; |
3b0bdc72 UD |
3230 | elem->opr.name[i] = ch; |
3231 | } | |
3232 | re_string_skip_bytes (regexp, 1); | |
3233 | elem->opr.name[i] = '\0'; | |
3234 | switch (token->type) | |
3235 | { | |
3236 | case OP_OPEN_COLL_ELEM: | |
3237 | elem->type = COLL_SYM; | |
3238 | break; | |
3239 | case OP_OPEN_EQUIV_CLASS: | |
3240 | elem->type = EQUIV_CLASS; | |
3241 | break; | |
3242 | case OP_OPEN_CHAR_CLASS: | |
3243 | elem->type = CHAR_CLASS; | |
3244 | break; | |
3245 | default: | |
3246 | break; | |
3247 | } | |
3248 | return REG_NOERROR; | |
3249 | } | |
3250 | ||
3251 | /* Helper function for parse_bracket_exp. | |
3252 | Build the equivalence class which is represented by NAME. | |
3253 | The result are written to MBCSET and SBCSET. | |
3254 | EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, | |
3255 | is a pointer argument sinse we may update it. */ | |
3256 | ||
3257 | static reg_errcode_t | |
c0a0f9a3 UD |
3258 | #ifdef RE_ENABLE_I18N |
3259 | build_equiv_class (sbcset, mbcset, equiv_class_alloc, name) | |
3b0bdc72 | 3260 | re_charset_t *mbcset; |
3b0bdc72 | 3261 | int *equiv_class_alloc; |
c0a0f9a3 UD |
3262 | #else /* not RE_ENABLE_I18N */ |
3263 | build_equiv_class (sbcset, name) | |
3264 | #endif /* not RE_ENABLE_I18N */ | |
3265 | re_bitset_ptr_t sbcset; | |
62439eac | 3266 | const unsigned char *name; |
3b0bdc72 | 3267 | { |
c0a0f9a3 | 3268 | #if defined _LIBC && defined RE_ENABLE_I18N |
3b0bdc72 UD |
3269 | uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); |
3270 | if (nrules != 0) | |
3271 | { | |
3272 | const int32_t *table, *indirect; | |
3273 | const unsigned char *weights, *extra, *cp; | |
62439eac | 3274 | unsigned char char_buf[2]; |
3b0bdc72 UD |
3275 | int32_t idx1, idx2; |
3276 | unsigned int ch; | |
3277 | size_t len; | |
3278 | /* This #include defines a local function! */ | |
3279 | # include <locale/weight.h> | |
3280 | /* Calculate the index for equivalence class. */ | |
62439eac | 3281 | cp = name; |
3b0bdc72 UD |
3282 | table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); |
3283 | weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, | |
3284 | _NL_COLLATE_WEIGHTMB); | |
3285 | extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, | |
15a7d175 | 3286 | _NL_COLLATE_EXTRAMB); |
3b0bdc72 | 3287 | indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, |
15a7d175 | 3288 | _NL_COLLATE_INDIRECTMB); |
3b0bdc72 | 3289 | idx1 = findidx (&cp); |
62439eac | 3290 | if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) |
15a7d175 UD |
3291 | /* This isn't a valid character. */ |
3292 | return REG_ECOLLATE; | |
3b0bdc72 UD |
3293 | |
3294 | /* Build single byte matcing table for this equivalence class. */ | |
62439eac | 3295 | char_buf[1] = (unsigned char) '\0'; |
3b0bdc72 UD |
3296 | len = weights[idx1]; |
3297 | for (ch = 0; ch < SBC_MAX; ++ch) | |
15a7d175 UD |
3298 | { |
3299 | char_buf[0] = ch; | |
3300 | cp = char_buf; | |
3301 | idx2 = findidx (&cp); | |
3b0bdc72 | 3302 | /* |
15a7d175 | 3303 | idx2 = table[ch]; |
3b0bdc72 | 3304 | */ |
15a7d175 UD |
3305 | if (idx2 == 0) |
3306 | /* This isn't a valid character. */ | |
3307 | continue; | |
3308 | if (len == weights[idx2]) | |
3309 | { | |
3310 | int cnt = 0; | |
3311 | while (cnt <= len && | |
3312 | weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) | |
3313 | ++cnt; | |
3314 | ||
3315 | if (cnt > len) | |
3316 | bitset_set (sbcset, ch); | |
3317 | } | |
3318 | } | |
a9388965 UD |
3319 | /* Check whether the array has enough space. */ |
3320 | if (*equiv_class_alloc == mbcset->nequiv_classes) | |
15a7d175 UD |
3321 | { |
3322 | /* Not enough, realloc it. */ | |
3323 | /* +1 in case of mbcset->nequiv_classes is 0. */ | |
3324 | *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; | |
3325 | /* Use realloc since the array is NULL if *alloc == 0. */ | |
3326 | mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t, | |
3327 | *equiv_class_alloc); | |
3328 | if (BE (mbcset->equiv_classes == NULL, 0)) | |
3329 | return REG_ESPACE; | |
3330 | } | |
3b0bdc72 UD |
3331 | mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; |
3332 | } | |
3333 | else | |
c0a0f9a3 | 3334 | #endif /* _LIBC && RE_ENABLE_I18N */ |
3b0bdc72 | 3335 | { |
62439eac | 3336 | if (BE (strlen ((const char *) name) != 1, 0)) |
15a7d175 | 3337 | return REG_ECOLLATE; |
62439eac | 3338 | bitset_set (sbcset, *name); |
3b0bdc72 UD |
3339 | } |
3340 | return REG_NOERROR; | |
3341 | } | |
3342 | ||
3343 | /* Helper function for parse_bracket_exp. | |
3344 | Build the character class which is represented by NAME. | |
3345 | The result are written to MBCSET and SBCSET. | |
3346 | CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, | |
3347 | is a pointer argument sinse we may update it. */ | |
3348 | ||
3349 | static reg_errcode_t | |
c0a0f9a3 | 3350 | #ifdef RE_ENABLE_I18N |
66b110e8 | 3351 | build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax) |
3b0bdc72 | 3352 | re_charset_t *mbcset; |
3b0bdc72 | 3353 | int *char_class_alloc; |
c0a0f9a3 | 3354 | #else /* not RE_ENABLE_I18N */ |
66b110e8 | 3355 | build_charclass (trans, sbcset, class_name, syntax) |
c0a0f9a3 | 3356 | #endif /* not RE_ENABLE_I18N */ |
66b110e8 | 3357 | RE_TRANSLATE_TYPE trans; |
c0a0f9a3 | 3358 | re_bitset_ptr_t sbcset; |
62439eac | 3359 | const unsigned char *class_name; |
602c2f9d | 3360 | reg_syntax_t syntax; |
3b0bdc72 UD |
3361 | { |
3362 | int i; | |
62439eac | 3363 | const char *name = (const char *) class_name; |
c0a0f9a3 UD |
3364 | |
3365 | /* In case of REG_ICASE "upper" and "lower" match the both of | |
3366 | upper and lower cases. */ | |
3367 | if ((syntax & RE_ICASE) | |
3368 | && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) | |
3369 | name = "alpha"; | |
3370 | ||
3371 | #ifdef RE_ENABLE_I18N | |
3b0bdc72 | 3372 | /* Check the space of the arrays. */ |
a9388965 UD |
3373 | if (*char_class_alloc == mbcset->nchar_classes) |
3374 | { | |
3375 | /* Not enough, realloc it. */ | |
3376 | /* +1 in case of mbcset->nchar_classes is 0. */ | |
3377 | *char_class_alloc = 2 * mbcset->nchar_classes + 1; | |
3378 | /* Use realloc since array is NULL if *alloc == 0. */ | |
3379 | mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t, | |
15a7d175 | 3380 | *char_class_alloc); |
bc15410e | 3381 | if (BE (mbcset->char_classes == NULL, 0)) |
15a7d175 | 3382 | return REG_ESPACE; |
a9388965 | 3383 | } |
3b0bdc72 | 3384 | mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); |
c0a0f9a3 | 3385 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 | 3386 | |
66b110e8 UD |
3387 | #define BUILD_CHARCLASS_LOOP(ctype_func) \ |
3388 | for (i = 0; i < SBC_MAX; ++i) \ | |
3389 | { \ | |
3390 | if (ctype_func (i)) \ | |
3391 | { \ | |
3392 | int ch = trans ? trans[i] : i; \ | |
3393 | bitset_set (sbcset, ch); \ | |
3394 | } \ | |
3b0bdc72 UD |
3395 | } |
3396 | ||
3397 | if (strcmp (name, "alnum") == 0) | |
3398 | BUILD_CHARCLASS_LOOP (isalnum) | |
3399 | else if (strcmp (name, "cntrl") == 0) | |
3400 | BUILD_CHARCLASS_LOOP (iscntrl) | |
3401 | else if (strcmp (name, "lower") == 0) | |
3402 | BUILD_CHARCLASS_LOOP (islower) | |
3403 | else if (strcmp (name, "space") == 0) | |
3404 | BUILD_CHARCLASS_LOOP (isspace) | |
3405 | else if (strcmp (name, "alpha") == 0) | |
3406 | BUILD_CHARCLASS_LOOP (isalpha) | |
3407 | else if (strcmp (name, "digit") == 0) | |
3408 | BUILD_CHARCLASS_LOOP (isdigit) | |
3409 | else if (strcmp (name, "print") == 0) | |
3410 | BUILD_CHARCLASS_LOOP (isprint) | |
3411 | else if (strcmp (name, "upper") == 0) | |
3412 | BUILD_CHARCLASS_LOOP (isupper) | |
3413 | else if (strcmp (name, "blank") == 0) | |
3414 | BUILD_CHARCLASS_LOOP (isblank) | |
3415 | else if (strcmp (name, "graph") == 0) | |
3416 | BUILD_CHARCLASS_LOOP (isgraph) | |
3417 | else if (strcmp (name, "punct") == 0) | |
3418 | BUILD_CHARCLASS_LOOP (ispunct) | |
3419 | else if (strcmp (name, "xdigit") == 0) | |
3420 | BUILD_CHARCLASS_LOOP (isxdigit) | |
3421 | else | |
3422 | return REG_ECTYPE; | |
3423 | ||
3424 | return REG_NOERROR; | |
3425 | } | |
3426 | ||
3427 | static bin_tree_t * | |
e2b6bfa3 | 3428 | build_charclass_op (dfa, trans, class_name, extra, not, err) |
3b0bdc72 | 3429 | re_dfa_t *dfa; |
66b110e8 | 3430 | RE_TRANSLATE_TYPE trans; |
e2b6bfa3 UD |
3431 | const unsigned char *class_name; |
3432 | const unsigned char *extra; | |
3b0bdc72 UD |
3433 | int not; |
3434 | reg_errcode_t *err; | |
3435 | { | |
3436 | re_bitset_ptr_t sbcset; | |
c0a0f9a3 | 3437 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 3438 | re_charset_t *mbcset; |
c0a0f9a3 UD |
3439 | int alloc = 0; |
3440 | #else /* not RE_ENABLE_I18N */ | |
3441 | int non_match = 0; | |
3442 | #endif /* not RE_ENABLE_I18N */ | |
3b0bdc72 UD |
3443 | reg_errcode_t ret; |
3444 | re_token_t br_token; | |
3445 | bin_tree_t *tree; | |
c0a0f9a3 | 3446 | int new_idx; |
3b0bdc72 UD |
3447 | |
3448 | sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); | |
c0a0f9a3 | 3449 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 3450 | mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); |
c0a0f9a3 UD |
3451 | #endif /* RE_ENABLE_I18N */ |
3452 | ||
3453 | #ifdef RE_ENABLE_I18N | |
bc15410e | 3454 | if (BE (sbcset == NULL || mbcset == NULL, 0)) |
c0a0f9a3 UD |
3455 | #else /* not RE_ENABLE_I18N */ |
3456 | if (BE (sbcset == NULL, 0)) | |
3457 | #endif /* not RE_ENABLE_I18N */ | |
3b0bdc72 UD |
3458 | { |
3459 | *err = REG_ESPACE; | |
3460 | return NULL; | |
3461 | } | |
3462 | ||
3463 | if (not) | |
3464 | { | |
c0a0f9a3 | 3465 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 3466 | int i; |
3b0bdc72 UD |
3467 | /* |
3468 | if (syntax & RE_HAT_LISTS_NOT_NEWLINE) | |
15a7d175 | 3469 | bitset_set(cset->sbcset, '\0'); |
3b0bdc72 | 3470 | */ |
c0a0f9a3 | 3471 | mbcset->non_match = 1; |
3c0fb574 | 3472 | if (dfa->mb_cur_max > 1) |
15a7d175 UD |
3473 | for (i = 0; i < SBC_MAX; ++i) |
3474 | if (__btowc (i) == WEOF) | |
3475 | bitset_set (sbcset, i); | |
c0a0f9a3 UD |
3476 | #else /* not RE_ENABLE_I18N */ |
3477 | non_match = 1; | |
3478 | #endif /* not RE_ENABLE_I18N */ | |
3b0bdc72 UD |
3479 | } |
3480 | ||
602c2f9d | 3481 | /* We don't care the syntax in this case. */ |
66b110e8 | 3482 | ret = build_charclass (trans, sbcset, |
c0a0f9a3 | 3483 | #ifdef RE_ENABLE_I18N |
15a7d175 | 3484 | mbcset, &alloc, |
c0a0f9a3 | 3485 | #endif /* RE_ENABLE_I18N */ |
e2b6bfa3 | 3486 | class_name, 0); |
c0a0f9a3 | 3487 | |
bc15410e | 3488 | if (BE (ret != REG_NOERROR, 0)) |
3b0bdc72 UD |
3489 | { |
3490 | re_free (sbcset); | |
c0a0f9a3 | 3491 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 3492 | free_charset (mbcset); |
c0a0f9a3 | 3493 | #endif /* RE_ENABLE_I18N */ |
7b7b9e70 | 3494 | *err = ret; |
3b0bdc72 UD |
3495 | return NULL; |
3496 | } | |
434d3784 | 3497 | /* \w match '_' also. */ |
e2b6bfa3 UD |
3498 | for (; *extra; extra++) |
3499 | bitset_set (sbcset, *extra); | |
3b0bdc72 UD |
3500 | |
3501 | /* If it is non-matching list. */ | |
c0a0f9a3 | 3502 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 3503 | if (mbcset->non_match) |
c0a0f9a3 UD |
3504 | #else /* not RE_ENABLE_I18N */ |
3505 | if (non_match) | |
3506 | #endif /* not RE_ENABLE_I18N */ | |
3b0bdc72 UD |
3507 | bitset_not (sbcset); |
3508 | ||
3509 | /* Build a tree for simple bracket. */ | |
3510 | br_token.type = SIMPLE_BRACKET; | |
3511 | br_token.opr.sbcset = sbcset; | |
3512 | new_idx = re_dfa_add_node (dfa, br_token, 0); | |
3513 | tree = create_tree (NULL, NULL, 0, new_idx); | |
bc15410e | 3514 | if (BE (new_idx == -1 || tree == NULL, 0)) |
3b0bdc72 UD |
3515 | goto build_word_op_espace; |
3516 | ||
c0a0f9a3 | 3517 | #ifdef RE_ENABLE_I18N |
3c0fb574 | 3518 | if (dfa->mb_cur_max > 1) |
3b0bdc72 UD |
3519 | { |
3520 | re_token_t alt_token; | |
3521 | bin_tree_t *mbc_tree; | |
3522 | /* Build a tree for complex bracket. */ | |
3523 | br_token.type = COMPLEX_BRACKET; | |
3524 | br_token.opr.mbcset = mbcset; | |
3525 | dfa->has_mb_node = 1; | |
3526 | new_idx = re_dfa_add_node (dfa, br_token, 0); | |
3527 | mbc_tree = create_tree (NULL, NULL, 0, new_idx); | |
bc15410e | 3528 | if (BE (new_idx == -1 || mbc_tree == NULL, 0)) |
15a7d175 | 3529 | goto build_word_op_espace; |
3b0bdc72 UD |
3530 | /* Then join them by ALT node. */ |
3531 | alt_token.type = OP_ALT; | |
3532 | new_idx = re_dfa_add_node (dfa, alt_token, 0); | |
3533 | tree = create_tree (tree, mbc_tree, 0, new_idx); | |
bc15410e | 3534 | if (BE (new_idx != -1 && mbc_tree != NULL, 1)) |
15a7d175 | 3535 | return tree; |
3b0bdc72 UD |
3536 | } |
3537 | else | |
3538 | { | |
3539 | free_charset (mbcset); | |
3540 | return tree; | |
3541 | } | |
c0a0f9a3 UD |
3542 | #else /* not RE_ENABLE_I18N */ |
3543 | return tree; | |
3544 | #endif /* not RE_ENABLE_I18N */ | |
3545 | ||
3b0bdc72 UD |
3546 | build_word_op_espace: |
3547 | re_free (sbcset); | |
c0a0f9a3 | 3548 | #ifdef RE_ENABLE_I18N |
3b0bdc72 | 3549 | free_charset (mbcset); |
c0a0f9a3 | 3550 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 UD |
3551 | *err = REG_ESPACE; |
3552 | return NULL; | |
3553 | } | |
3554 | ||
3555 | /* This is intended for the expressions like "a{1,3}". | |
3556 | Fetch a number from `input', and return the number. | |
3557 | Return -1, if the number field is empty like "{,1}". | |
3558 | Return -2, If an error is occured. */ | |
3559 | ||
3560 | static int | |
3561 | fetch_number (input, token, syntax) | |
3562 | re_string_t *input; | |
3563 | re_token_t *token; | |
3564 | reg_syntax_t syntax; | |
3565 | { | |
3566 | int num = -1; | |
3567 | unsigned char c; | |
3568 | while (1) | |
3569 | { | |
3570 | *token = fetch_token (input, syntax); | |
3571 | c = token->opr.c; | |
602c2f9d | 3572 | if (BE (token->type == END_OF_RE, 0)) |
15a7d175 | 3573 | return -2; |
3b0bdc72 | 3574 | if (token->type == OP_CLOSE_DUP_NUM || c == ',') |
15a7d175 | 3575 | break; |
602c2f9d | 3576 | num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) |
15a7d175 | 3577 | ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); |
602c2f9d | 3578 | num = (num > RE_DUP_MAX) ? -2 : num; |
3b0bdc72 | 3579 | } |
3b0bdc72 UD |
3580 | return num; |
3581 | } | |
3582 | \f | |
c0a0f9a3 | 3583 | #ifdef RE_ENABLE_I18N |
3b0bdc72 UD |
3584 | static void |
3585 | free_charset (re_charset_t *cset) | |
3586 | { | |
3587 | re_free (cset->mbchars); | |
c0a0f9a3 | 3588 | # ifdef _LIBC |
3b0bdc72 UD |
3589 | re_free (cset->coll_syms); |
3590 | re_free (cset->equiv_classes); | |
3591 | re_free (cset->range_starts); | |
3592 | re_free (cset->range_ends); | |
c0a0f9a3 | 3593 | # endif |
3b0bdc72 UD |
3594 | re_free (cset->char_classes); |
3595 | re_free (cset); | |
3596 | } | |
c0a0f9a3 | 3597 | #endif /* RE_ENABLE_I18N */ |
3b0bdc72 UD |
3598 | \f |
3599 | /* Functions for binary tree operation. */ | |
3600 | ||
3601 | /* Create a node of tree. | |
3602 | Note: This function automatically free left and right if malloc fails. */ | |
3603 | ||
3604 | static bin_tree_t * | |
3605 | create_tree (left, right, type, index) | |
3606 | bin_tree_t *left; | |
3607 | bin_tree_t *right; | |
3608 | re_token_type_t type; | |
3609 | int index; | |
3610 | { | |
3611 | bin_tree_t *tree; | |
3612 | tree = re_malloc (bin_tree_t, 1); | |
bc15410e | 3613 | if (BE (tree == NULL, 0)) |
3b0bdc72 UD |
3614 | { |
3615 | free_bin_tree (left); | |
3616 | free_bin_tree (right); | |
3617 | return NULL; | |
3618 | } | |
3619 | tree->parent = NULL; | |
3620 | tree->left = left; | |
3621 | tree->right = right; | |
3622 | tree->type = type; | |
3623 | tree->node_idx = index; | |
3624 | tree->first = -1; | |
3625 | tree->next = -1; | |
3626 | re_node_set_init_empty (&tree->eclosure); | |
3627 | ||
3628 | if (left != NULL) | |
3629 | left->parent = tree; | |
3630 | if (right != NULL) | |
3631 | right->parent = tree; | |
3632 | return tree; | |
3633 | } | |
3634 | ||
3635 | /* Free the sub tree pointed by TREE. */ | |
3636 | ||
3637 | static void | |
3638 | free_bin_tree (tree) | |
3639 | bin_tree_t *tree; | |
3640 | { | |
3641 | if (tree == NULL) | |
3642 | return; | |
3643 | /*re_node_set_free (&tree->eclosure);*/ | |
3644 | free_bin_tree (tree->left); | |
3645 | free_bin_tree (tree->right); | |
3646 | re_free (tree); | |
3647 | } | |
3648 | ||
3649 | /* Duplicate the node SRC, and return new node. */ | |
3650 | ||
3651 | static bin_tree_t * | |
3652 | duplicate_tree (src, dfa) | |
3653 | const bin_tree_t *src; | |
3654 | re_dfa_t *dfa; | |
3655 | { | |
3656 | bin_tree_t *left = NULL, *right = NULL, *new_tree; | |
3657 | int new_node_idx; | |
3658 | /* Since node indies must be according to Post-order of the tree, | |
3659 | we must duplicate the left at first. */ | |
3660 | if (src->left != NULL) | |
3661 | { | |
3662 | left = duplicate_tree (src->left, dfa); | |
3663 | if (left == NULL) | |
15a7d175 | 3664 | return NULL; |
3b0bdc72 UD |
3665 | } |
3666 | ||
3667 | /* Secondaly, duplicate the right. */ | |
3668 | if (src->right != NULL) | |
3669 | { | |
3670 | right = duplicate_tree (src->right, dfa); | |
3671 | if (right == NULL) | |
15a7d175 UD |
3672 | { |
3673 | free_bin_tree (left); | |
3674 | return NULL; | |
3675 | } | |
3b0bdc72 UD |
3676 | } |
3677 | ||
3678 | /* At last, duplicate itself. */ | |
3679 | if (src->type == NON_TYPE) | |
3680 | { | |
3681 | new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0); | |
3682 | dfa->nodes[new_node_idx].duplicated = 1; | |
bc15410e | 3683 | if (BE (new_node_idx == -1, 0)) |
15a7d175 UD |
3684 | { |
3685 | free_bin_tree (left); | |
3686 | free_bin_tree (right); | |
3687 | return NULL; | |
3688 | } | |
3b0bdc72 UD |
3689 | } |
3690 | else | |
3691 | new_node_idx = src->type; | |
3692 | ||
3693 | new_tree = create_tree (left, right, src->type, new_node_idx); | |
bc15410e | 3694 | if (BE (new_tree == NULL, 0)) |
3b0bdc72 UD |
3695 | { |
3696 | free_bin_tree (left); | |
3697 | free_bin_tree (right); | |
3698 | } | |
3699 | return new_tree; | |
3700 | } |