]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regex_internal.c
[BZ #1950, BZ #2153]
[thirdparty/glibc.git] / posix / regex_internal.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
32 \f
33 /* Functions for string operation. */
34
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
37
38 static reg_errcode_t
39 internal_function
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
42 {
43 reg_errcode_t ret;
44 int init_buf_len;
45
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
55
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
62 }
63
64 /* This function allocate the buffers, and initialize them. */
65
66 static reg_errcode_t
67 internal_function
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
70 {
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
74
75 if (len > 0)
76 {
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
80 }
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82
83 if (icase)
84 {
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
87 {
88 while (1)
89 {
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
100 }
101 }
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
105 }
106 else
107 {
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
113 {
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
117 {
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
120 }
121 }
122 }
123
124 return REG_NOERROR;
125 }
126
127 /* Helper functions for re_string_allocate, and re_string_construct. */
128
129 static reg_errcode_t
130 internal_function
131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
132 {
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
135 {
136 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
137 if (BE (new_wcs == NULL, 0))
138 return REG_ESPACE;
139 pstr->wcs = new_wcs;
140 if (pstr->offsets != NULL)
141 {
142 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
143 if (BE (new_offsets == NULL, 0))
144 return REG_ESPACE;
145 pstr->offsets = new_offsets;
146 }
147 }
148 #endif /* RE_ENABLE_I18N */
149 if (pstr->mbs_allocated)
150 {
151 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152 new_buf_len);
153 if (BE (new_mbs == NULL, 0))
154 return REG_ESPACE;
155 pstr->mbs = new_mbs;
156 }
157 pstr->bufs_len = new_buf_len;
158 return REG_NOERROR;
159 }
160
161
162 static void
163 internal_function
164 re_string_construct_common (const char *str, int len, re_string_t *pstr,
165 RE_TRANSLATE_TYPE trans, int icase,
166 const re_dfa_t *dfa)
167 {
168 pstr->raw_mbs = (const unsigned char *) str;
169 pstr->len = len;
170 pstr->raw_len = len;
171 pstr->trans = trans;
172 pstr->icase = icase ? 1 : 0;
173 pstr->mbs_allocated = (trans != NULL || icase);
174 pstr->mb_cur_max = dfa->mb_cur_max;
175 pstr->is_utf8 = dfa->is_utf8;
176 pstr->map_notascii = dfa->map_notascii;
177 pstr->stop = pstr->len;
178 pstr->raw_stop = pstr->stop;
179 }
180
181 #ifdef RE_ENABLE_I18N
182
183 /* Build wide character buffer PSTR->WCS.
184 If the byte sequence of the string are:
185 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
186 Then wide character buffer will be:
187 <wc1> , WEOF , <wc2> , WEOF , <wc3>
188 We use WEOF for padding, they indicate that the position isn't
189 a first byte of a multibyte character.
190
191 Note that this function assumes PSTR->VALID_LEN elements are already
192 built and starts from PSTR->VALID_LEN. */
193
194 static void
195 internal_function
196 build_wcs_buffer (re_string_t *pstr)
197 {
198 #ifdef _LIBC
199 unsigned char buf[MB_LEN_MAX];
200 assert (MB_LEN_MAX >= pstr->mb_cur_max);
201 #else
202 unsigned char buf[64];
203 #endif
204 mbstate_t prev_st;
205 int byte_idx, end_idx, remain_len;
206 size_t mbclen;
207
208 /* Build the buffers from pstr->valid_len to either pstr->len or
209 pstr->bufs_len. */
210 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
211 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
212 {
213 wchar_t wc;
214 const char *p;
215
216 remain_len = end_idx - byte_idx;
217 prev_st = pstr->cur_state;
218 /* Apply the translation if we need. */
219 if (BE (pstr->trans != NULL, 0))
220 {
221 int i, ch;
222
223 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
224 {
225 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
226 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
227 }
228 p = (const char *) buf;
229 }
230 else
231 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
232 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
233 if (BE (mbclen == (size_t) -2, 0))
234 {
235 /* The buffer doesn't have enough space, finish to build. */
236 pstr->cur_state = prev_st;
237 break;
238 }
239 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
240 {
241 /* We treat these cases as a singlebyte character. */
242 mbclen = 1;
243 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
244 if (BE (pstr->trans != NULL, 0))
245 wc = pstr->trans[wc];
246 pstr->cur_state = prev_st;
247 }
248
249 /* Write wide character and padding. */
250 pstr->wcs[byte_idx++] = wc;
251 /* Write paddings. */
252 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
253 pstr->wcs[byte_idx++] = WEOF;
254 }
255 pstr->valid_len = byte_idx;
256 pstr->valid_raw_len = byte_idx;
257 }
258
259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
260 but for REG_ICASE. */
261
262 static reg_errcode_t
263 internal_function
264 build_wcs_upper_buffer (re_string_t *pstr)
265 {
266 mbstate_t prev_st;
267 int src_idx, byte_idx, end_idx, remain_len;
268 size_t mbclen;
269 #ifdef _LIBC
270 char buf[MB_LEN_MAX];
271 assert (MB_LEN_MAX >= pstr->mb_cur_max);
272 #else
273 char buf[64];
274 #endif
275
276 byte_idx = pstr->valid_len;
277 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
278
279 /* The following optimization assumes that ASCII characters can be
280 mapped to wide characters with a simple cast. */
281 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
282 {
283 while (byte_idx < end_idx)
284 {
285 wchar_t wc;
286
287 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
288 && mbsinit (&pstr->cur_state))
289 {
290 /* In case of a singlebyte character. */
291 pstr->mbs[byte_idx]
292 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
293 /* The next step uses the assumption that wchar_t is encoded
294 ASCII-safe: all ASCII values can be converted like this. */
295 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
296 ++byte_idx;
297 continue;
298 }
299
300 remain_len = end_idx - byte_idx;
301 prev_st = pstr->cur_state;
302 mbclen = mbrtowc (&wc,
303 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
304 + byte_idx), remain_len, &pstr->cur_state);
305 if (BE (mbclen + 2 > 2, 1))
306 {
307 wchar_t wcu = wc;
308 if (iswlower (wc))
309 {
310 size_t mbcdlen;
311
312 wcu = towupper (wc);
313 mbcdlen = wcrtomb (buf, wcu, &prev_st);
314 if (BE (mbclen == mbcdlen, 1))
315 memcpy (pstr->mbs + byte_idx, buf, mbclen);
316 else
317 {
318 src_idx = byte_idx;
319 goto offsets_needed;
320 }
321 }
322 else
323 memcpy (pstr->mbs + byte_idx,
324 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
325 pstr->wcs[byte_idx++] = wcu;
326 /* Write paddings. */
327 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
328 pstr->wcs[byte_idx++] = WEOF;
329 }
330 else if (mbclen == (size_t) -1 || mbclen == 0)
331 {
332 /* It is an invalid character or '\0'. Just use the byte. */
333 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
334 pstr->mbs[byte_idx] = ch;
335 /* And also cast it to wide char. */
336 pstr->wcs[byte_idx++] = (wchar_t) ch;
337 if (BE (mbclen == (size_t) -1, 0))
338 pstr->cur_state = prev_st;
339 }
340 else
341 {
342 /* The buffer doesn't have enough space, finish to build. */
343 pstr->cur_state = prev_st;
344 break;
345 }
346 }
347 pstr->valid_len = byte_idx;
348 pstr->valid_raw_len = byte_idx;
349 return REG_NOERROR;
350 }
351 else
352 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
353 {
354 wchar_t wc;
355 const char *p;
356 offsets_needed:
357 remain_len = end_idx - byte_idx;
358 prev_st = pstr->cur_state;
359 if (BE (pstr->trans != NULL, 0))
360 {
361 int i, ch;
362
363 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
364 {
365 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
366 buf[i] = pstr->trans[ch];
367 }
368 p = (const char *) buf;
369 }
370 else
371 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
372 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
373 if (BE (mbclen + 2 > 2, 1))
374 {
375 wchar_t wcu = wc;
376 if (iswlower (wc))
377 {
378 size_t mbcdlen;
379
380 wcu = towupper (wc);
381 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
382 if (BE (mbclen == mbcdlen, 1))
383 memcpy (pstr->mbs + byte_idx, buf, mbclen);
384 else if (mbcdlen != (size_t) -1)
385 {
386 size_t i;
387
388 if (byte_idx + mbcdlen > pstr->bufs_len)
389 {
390 pstr->cur_state = prev_st;
391 break;
392 }
393
394 if (pstr->offsets == NULL)
395 {
396 pstr->offsets = re_malloc (int, pstr->bufs_len);
397
398 if (pstr->offsets == NULL)
399 return REG_ESPACE;
400 }
401 if (!pstr->offsets_needed)
402 {
403 for (i = 0; i < (size_t) byte_idx; ++i)
404 pstr->offsets[i] = i;
405 pstr->offsets_needed = 1;
406 }
407
408 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
409 pstr->wcs[byte_idx] = wcu;
410 pstr->offsets[byte_idx] = src_idx;
411 for (i = 1; i < mbcdlen; ++i)
412 {
413 pstr->offsets[byte_idx + i]
414 = src_idx + (i < mbclen ? i : mbclen - 1);
415 pstr->wcs[byte_idx + i] = WEOF;
416 }
417 pstr->len += mbcdlen - mbclen;
418 if (pstr->raw_stop > src_idx)
419 pstr->stop += mbcdlen - mbclen;
420 end_idx = (pstr->bufs_len > pstr->len)
421 ? pstr->len : pstr->bufs_len;
422 byte_idx += mbcdlen;
423 src_idx += mbclen;
424 continue;
425 }
426 else
427 memcpy (pstr->mbs + byte_idx, p, mbclen);
428 }
429 else
430 memcpy (pstr->mbs + byte_idx, p, mbclen);
431
432 if (BE (pstr->offsets_needed != 0, 0))
433 {
434 size_t i;
435 for (i = 0; i < mbclen; ++i)
436 pstr->offsets[byte_idx + i] = src_idx + i;
437 }
438 src_idx += mbclen;
439
440 pstr->wcs[byte_idx++] = wcu;
441 /* Write paddings. */
442 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
443 pstr->wcs[byte_idx++] = WEOF;
444 }
445 else if (mbclen == (size_t) -1 || mbclen == 0)
446 {
447 /* It is an invalid character or '\0'. Just use the byte. */
448 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
449
450 if (BE (pstr->trans != NULL, 0))
451 ch = pstr->trans [ch];
452 pstr->mbs[byte_idx] = ch;
453
454 if (BE (pstr->offsets_needed != 0, 0))
455 pstr->offsets[byte_idx] = src_idx;
456 ++src_idx;
457
458 /* And also cast it to wide char. */
459 pstr->wcs[byte_idx++] = (wchar_t) ch;
460 if (BE (mbclen == (size_t) -1, 0))
461 pstr->cur_state = prev_st;
462 }
463 else
464 {
465 /* The buffer doesn't have enough space, finish to build. */
466 pstr->cur_state = prev_st;
467 break;
468 }
469 }
470 pstr->valid_len = byte_idx;
471 pstr->valid_raw_len = src_idx;
472 return REG_NOERROR;
473 }
474
475 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
476 Return the index. */
477
478 static int
479 internal_function
480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
481 {
482 mbstate_t prev_st;
483 int rawbuf_idx;
484 size_t mbclen;
485 wchar_t wc = 0;
486
487 /* Skip the characters which are not necessary to check. */
488 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
489 rawbuf_idx < new_raw_idx;)
490 {
491 int remain_len;
492 remain_len = pstr->len - rawbuf_idx;
493 prev_st = pstr->cur_state;
494 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
495 remain_len, &pstr->cur_state);
496 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
497 {
498 /* We treat these cases as a singlebyte character. */
499 mbclen = 1;
500 pstr->cur_state = prev_st;
501 }
502 /* Then proceed the next character. */
503 rawbuf_idx += mbclen;
504 }
505 *last_wc = (wint_t) wc;
506 return rawbuf_idx;
507 }
508 #endif /* RE_ENABLE_I18N */
509
510 /* Build the buffer PSTR->MBS, and apply the translation if we need.
511 This function is used in case of REG_ICASE. */
512
513 static void
514 internal_function
515 build_upper_buffer (re_string_t *pstr)
516 {
517 int char_idx, end_idx;
518 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
519
520 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
521 {
522 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
523 if (BE (pstr->trans != NULL, 0))
524 ch = pstr->trans[ch];
525 if (islower (ch))
526 pstr->mbs[char_idx] = toupper (ch);
527 else
528 pstr->mbs[char_idx] = ch;
529 }
530 pstr->valid_len = char_idx;
531 pstr->valid_raw_len = char_idx;
532 }
533
534 /* Apply TRANS to the buffer in PSTR. */
535
536 static void
537 internal_function
538 re_string_translate_buffer (re_string_t *pstr)
539 {
540 int buf_idx, end_idx;
541 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542
543 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
544 {
545 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
546 pstr->mbs[buf_idx] = pstr->trans[ch];
547 }
548
549 pstr->valid_len = buf_idx;
550 pstr->valid_raw_len = buf_idx;
551 }
552
553 /* This function re-construct the buffers.
554 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
555 convert to upper case in case of REG_ICASE, apply translation. */
556
557 static reg_errcode_t
558 internal_function
559 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
560 {
561 int offset = idx - pstr->raw_mbs_idx;
562 if (BE (offset < 0, 0))
563 {
564 /* Reset buffer. */
565 #ifdef RE_ENABLE_I18N
566 if (pstr->mb_cur_max > 1)
567 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
568 #endif /* RE_ENABLE_I18N */
569 pstr->len = pstr->raw_len;
570 pstr->stop = pstr->raw_stop;
571 pstr->valid_len = 0;
572 pstr->raw_mbs_idx = 0;
573 pstr->valid_raw_len = 0;
574 pstr->offsets_needed = 0;
575 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
576 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
577 if (!pstr->mbs_allocated)
578 pstr->mbs = (unsigned char *) pstr->raw_mbs;
579 offset = idx;
580 }
581
582 if (BE (offset != 0, 1))
583 {
584 /* Are the characters which are already checked remain? */
585 if (BE (offset < pstr->valid_raw_len, 1)
586 #ifdef RE_ENABLE_I18N
587 /* Handling this would enlarge the code too much.
588 Accept a slowdown in that case. */
589 && pstr->offsets_needed == 0
590 #endif
591 )
592 {
593 /* Yes, move them to the front of the buffer. */
594 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
595 #ifdef RE_ENABLE_I18N
596 if (pstr->mb_cur_max > 1)
597 memmove (pstr->wcs, pstr->wcs + offset,
598 (pstr->valid_len - offset) * sizeof (wint_t));
599 #endif /* RE_ENABLE_I18N */
600 if (BE (pstr->mbs_allocated, 0))
601 memmove (pstr->mbs, pstr->mbs + offset,
602 pstr->valid_len - offset);
603 pstr->valid_len -= offset;
604 pstr->valid_raw_len -= offset;
605 #if DEBUG
606 assert (pstr->valid_len > 0);
607 #endif
608 }
609 else
610 {
611 /* No, skip all characters until IDX. */
612 #ifdef RE_ENABLE_I18N
613 if (BE (pstr->offsets_needed, 0))
614 {
615 pstr->len = pstr->raw_len - idx + offset;
616 pstr->stop = pstr->raw_stop - idx + offset;
617 pstr->offsets_needed = 0;
618 }
619 #endif
620 pstr->valid_len = 0;
621 pstr->valid_raw_len = 0;
622 #ifdef RE_ENABLE_I18N
623 if (pstr->mb_cur_max > 1)
624 {
625 int wcs_idx;
626 wint_t wc = WEOF;
627
628 if (pstr->is_utf8)
629 {
630 const unsigned char *raw, *p, *q, *end;
631
632 /* Special case UTF-8. Multi-byte chars start with any
633 byte other than 0x80 - 0xbf. */
634 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
635 end = raw + (offset - pstr->mb_cur_max);
636 p = raw + offset - 1;
637 #ifdef _LIBC
638 /* We know the wchar_t encoding is UCS4, so for the simple
639 case, ASCII characters, skip the conversion step. */
640 if (isascii (*p) && BE (pstr->trans == NULL, 1))
641 {
642 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
643 pstr->valid_len = 0;
644 wc = (wchar_t) *p;
645 }
646 else
647 #endif
648 for (; p >= end; --p)
649 if ((*p & 0xc0) != 0x80)
650 {
651 mbstate_t cur_state;
652 wchar_t wc2;
653 int mlen = raw + pstr->len - p;
654 unsigned char buf[6];
655 size_t mbclen;
656
657 q = p;
658 if (BE (pstr->trans != NULL, 0))
659 {
660 int i = mlen < 6 ? mlen : 6;
661 while (--i >= 0)
662 buf[i] = pstr->trans[p[i]];
663 q = buf;
664 }
665 /* XXX Don't use mbrtowc, we know which conversion
666 to use (UTF-8 -> UCS4). */
667 memset (&cur_state, 0, sizeof (cur_state));
668 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
669 &cur_state);
670 if (raw + offset - p <= mbclen
671 && mbclen < (size_t) -2)
672 {
673 memset (&pstr->cur_state, '\0',
674 sizeof (mbstate_t));
675 pstr->valid_len = mbclen - (raw + offset - p);
676 wc = wc2;
677 }
678 break;
679 }
680 }
681
682 if (wc == WEOF)
683 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
684 if (BE (pstr->valid_len, 0))
685 {
686 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
687 pstr->wcs[wcs_idx] = WEOF;
688 if (pstr->mbs_allocated)
689 memset (pstr->mbs, 255, pstr->valid_len);
690 }
691 pstr->valid_raw_len = pstr->valid_len;
692 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
693 && IS_WIDE_WORD_CHAR (wc))
694 ? CONTEXT_WORD
695 : ((IS_WIDE_NEWLINE (wc)
696 && pstr->newline_anchor)
697 ? CONTEXT_NEWLINE : 0));
698 }
699 else
700 #endif /* RE_ENABLE_I18N */
701 {
702 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
703 if (pstr->trans)
704 c = pstr->trans[c];
705 pstr->tip_context = (bitset_contain (pstr->word_char, c)
706 ? CONTEXT_WORD
707 : ((IS_NEWLINE (c) && pstr->newline_anchor)
708 ? CONTEXT_NEWLINE : 0));
709 }
710 }
711 if (!BE (pstr->mbs_allocated, 0))
712 pstr->mbs += offset;
713 }
714 pstr->raw_mbs_idx = idx;
715 pstr->len -= offset;
716 pstr->stop -= offset;
717
718 /* Then build the buffers. */
719 #ifdef RE_ENABLE_I18N
720 if (pstr->mb_cur_max > 1)
721 {
722 if (pstr->icase)
723 {
724 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
725 if (BE (ret != REG_NOERROR, 0))
726 return ret;
727 }
728 else
729 build_wcs_buffer (pstr);
730 }
731 else
732 #endif /* RE_ENABLE_I18N */
733 if (BE (pstr->mbs_allocated, 0))
734 {
735 if (pstr->icase)
736 build_upper_buffer (pstr);
737 else if (pstr->trans != NULL)
738 re_string_translate_buffer (pstr);
739 }
740 else
741 pstr->valid_len = pstr->len;
742
743 pstr->cur_idx = 0;
744 return REG_NOERROR;
745 }
746
747 static unsigned char
748 internal_function __attribute ((pure))
749 re_string_peek_byte_case (const re_string_t *pstr, int idx)
750 {
751 int ch, off;
752
753 /* Handle the common (easiest) cases first. */
754 if (BE (!pstr->mbs_allocated, 1))
755 return re_string_peek_byte (pstr, idx);
756
757 #ifdef RE_ENABLE_I18N
758 if (pstr->mb_cur_max > 1
759 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
760 return re_string_peek_byte (pstr, idx);
761 #endif
762
763 off = pstr->cur_idx + idx;
764 #ifdef RE_ENABLE_I18N
765 if (pstr->offsets_needed)
766 off = pstr->offsets[off];
767 #endif
768
769 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
770
771 #ifdef RE_ENABLE_I18N
772 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
773 this function returns CAPITAL LETTER I instead of first byte of
774 DOTLESS SMALL LETTER I. The latter would confuse the parser,
775 since peek_byte_case doesn't advance cur_idx in any way. */
776 if (pstr->offsets_needed && !isascii (ch))
777 return re_string_peek_byte (pstr, idx);
778 #endif
779
780 return ch;
781 }
782
783 static unsigned char
784 internal_function __attribute ((pure))
785 re_string_fetch_byte_case (re_string_t *pstr)
786 {
787 if (BE (!pstr->mbs_allocated, 1))
788 return re_string_fetch_byte (pstr);
789
790 #ifdef RE_ENABLE_I18N
791 if (pstr->offsets_needed)
792 {
793 int off, ch;
794
795 /* For tr_TR.UTF-8 [[:islower:]] there is
796 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
797 in that case the whole multi-byte character and return
798 the original letter. On the other side, with
799 [[: DOTLESS SMALL LETTER I return [[:I, as doing
800 anything else would complicate things too much. */
801
802 if (!re_string_first_byte (pstr, pstr->cur_idx))
803 return re_string_fetch_byte (pstr);
804
805 off = pstr->offsets[pstr->cur_idx];
806 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
807
808 if (! isascii (ch))
809 return re_string_fetch_byte (pstr);
810
811 re_string_skip_bytes (pstr,
812 re_string_char_size_at (pstr, pstr->cur_idx));
813 return ch;
814 }
815 #endif
816
817 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
818 }
819
820 static void
821 internal_function
822 re_string_destruct (re_string_t *pstr)
823 {
824 #ifdef RE_ENABLE_I18N
825 re_free (pstr->wcs);
826 re_free (pstr->offsets);
827 #endif /* RE_ENABLE_I18N */
828 if (pstr->mbs_allocated)
829 re_free (pstr->mbs);
830 }
831
832 /* Return the context at IDX in INPUT. */
833
834 static unsigned int
835 internal_function
836 re_string_context_at (const re_string_t *input, int idx, int eflags)
837 {
838 int c;
839 if (BE (idx < 0, 0))
840 /* In this case, we use the value stored in input->tip_context,
841 since we can't know the character in input->mbs[-1] here. */
842 return input->tip_context;
843 if (BE (idx == input->len, 0))
844 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
845 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
846 #ifdef RE_ENABLE_I18N
847 if (input->mb_cur_max > 1)
848 {
849 wint_t wc;
850 int wc_idx = idx;
851 while(input->wcs[wc_idx] == WEOF)
852 {
853 #ifdef DEBUG
854 /* It must not happen. */
855 assert (wc_idx >= 0);
856 #endif
857 --wc_idx;
858 if (wc_idx < 0)
859 return input->tip_context;
860 }
861 wc = input->wcs[wc_idx];
862 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
863 return CONTEXT_WORD;
864 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
865 ? CONTEXT_NEWLINE : 0);
866 }
867 else
868 #endif
869 {
870 c = re_string_byte_at (input, idx);
871 if (bitset_contain (input->word_char, c))
872 return CONTEXT_WORD;
873 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
874 }
875 }
876 \f
877 /* Functions for set operation. */
878
879 static reg_errcode_t
880 internal_function
881 re_node_set_alloc (re_node_set *set, int size)
882 {
883 set->alloc = size;
884 set->nelem = 0;
885 set->elems = re_malloc (int, size);
886 if (BE (set->elems == NULL, 0))
887 return REG_ESPACE;
888 return REG_NOERROR;
889 }
890
891 static reg_errcode_t
892 internal_function
893 re_node_set_init_1 (re_node_set *set, int elem)
894 {
895 set->alloc = 1;
896 set->nelem = 1;
897 set->elems = re_malloc (int, 1);
898 if (BE (set->elems == NULL, 0))
899 {
900 set->alloc = set->nelem = 0;
901 return REG_ESPACE;
902 }
903 set->elems[0] = elem;
904 return REG_NOERROR;
905 }
906
907 static reg_errcode_t
908 internal_function
909 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
910 {
911 set->alloc = 2;
912 set->elems = re_malloc (int, 2);
913 if (BE (set->elems == NULL, 0))
914 return REG_ESPACE;
915 if (elem1 == elem2)
916 {
917 set->nelem = 1;
918 set->elems[0] = elem1;
919 }
920 else
921 {
922 set->nelem = 2;
923 if (elem1 < elem2)
924 {
925 set->elems[0] = elem1;
926 set->elems[1] = elem2;
927 }
928 else
929 {
930 set->elems[0] = elem2;
931 set->elems[1] = elem1;
932 }
933 }
934 return REG_NOERROR;
935 }
936
937 static reg_errcode_t
938 internal_function
939 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
940 {
941 dest->nelem = src->nelem;
942 if (src->nelem > 0)
943 {
944 dest->alloc = dest->nelem;
945 dest->elems = re_malloc (int, dest->alloc);
946 if (BE (dest->elems == NULL, 0))
947 {
948 dest->alloc = dest->nelem = 0;
949 return REG_ESPACE;
950 }
951 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
952 }
953 else
954 re_node_set_init_empty (dest);
955 return REG_NOERROR;
956 }
957
958 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
959 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
960 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
961
962 static reg_errcode_t
963 internal_function
964 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
965 const re_node_set *src2)
966 {
967 int i1, i2, is, id, delta, sbase;
968 if (src1->nelem == 0 || src2->nelem == 0)
969 return REG_NOERROR;
970
971 /* We need dest->nelem + 2 * elems_in_intersection; this is a
972 conservative estimate. */
973 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
974 {
975 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
976 int *new_elems = re_realloc (dest->elems, int, new_alloc);
977 if (BE (new_elems == NULL, 0))
978 return REG_ESPACE;
979 dest->elems = new_elems;
980 dest->alloc = new_alloc;
981 }
982
983 /* Find the items in the intersection of SRC1 and SRC2, and copy
984 into the top of DEST those that are not already in DEST itself. */
985 sbase = dest->nelem + src1->nelem + src2->nelem;
986 i1 = src1->nelem - 1;
987 i2 = src2->nelem - 1;
988 id = dest->nelem - 1;
989 for (;;)
990 {
991 if (src1->elems[i1] == src2->elems[i2])
992 {
993 /* Try to find the item in DEST. Maybe we could binary search? */
994 while (id >= 0 && dest->elems[id] > src1->elems[i1])
995 --id;
996
997 if (id < 0 || dest->elems[id] != src1->elems[i1])
998 dest->elems[--sbase] = src1->elems[i1];
999
1000 if (--i1 < 0 || --i2 < 0)
1001 break;
1002 }
1003
1004 /* Lower the highest of the two items. */
1005 else if (src1->elems[i1] < src2->elems[i2])
1006 {
1007 if (--i2 < 0)
1008 break;
1009 }
1010 else
1011 {
1012 if (--i1 < 0)
1013 break;
1014 }
1015 }
1016
1017 id = dest->nelem - 1;
1018 is = dest->nelem + src1->nelem + src2->nelem - 1;
1019 delta = is - sbase + 1;
1020
1021 /* Now copy. When DELTA becomes zero, the remaining
1022 DEST elements are already in place; this is more or
1023 less the same loop that is in re_node_set_merge. */
1024 dest->nelem += delta;
1025 if (delta > 0 && id >= 0)
1026 for (;;)
1027 {
1028 if (dest->elems[is] > dest->elems[id])
1029 {
1030 /* Copy from the top. */
1031 dest->elems[id + delta--] = dest->elems[is--];
1032 if (delta == 0)
1033 break;
1034 }
1035 else
1036 {
1037 /* Slide from the bottom. */
1038 dest->elems[id + delta] = dest->elems[id];
1039 if (--id < 0)
1040 break;
1041 }
1042 }
1043
1044 /* Copy remaining SRC elements. */
1045 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1046
1047 return REG_NOERROR;
1048 }
1049
1050 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1051 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1052
1053 static reg_errcode_t
1054 internal_function
1055 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1056 const re_node_set *src2)
1057 {
1058 int i1, i2, id;
1059 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1060 {
1061 dest->alloc = src1->nelem + src2->nelem;
1062 dest->elems = re_malloc (int, dest->alloc);
1063 if (BE (dest->elems == NULL, 0))
1064 return REG_ESPACE;
1065 }
1066 else
1067 {
1068 if (src1 != NULL && src1->nelem > 0)
1069 return re_node_set_init_copy (dest, src1);
1070 else if (src2 != NULL && src2->nelem > 0)
1071 return re_node_set_init_copy (dest, src2);
1072 else
1073 re_node_set_init_empty (dest);
1074 return REG_NOERROR;
1075 }
1076 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1077 {
1078 if (src1->elems[i1] > src2->elems[i2])
1079 {
1080 dest->elems[id++] = src2->elems[i2++];
1081 continue;
1082 }
1083 if (src1->elems[i1] == src2->elems[i2])
1084 ++i2;
1085 dest->elems[id++] = src1->elems[i1++];
1086 }
1087 if (i1 < src1->nelem)
1088 {
1089 memcpy (dest->elems + id, src1->elems + i1,
1090 (src1->nelem - i1) * sizeof (int));
1091 id += src1->nelem - i1;
1092 }
1093 else if (i2 < src2->nelem)
1094 {
1095 memcpy (dest->elems + id, src2->elems + i2,
1096 (src2->nelem - i2) * sizeof (int));
1097 id += src2->nelem - i2;
1098 }
1099 dest->nelem = id;
1100 return REG_NOERROR;
1101 }
1102
1103 /* Calculate the union set of the sets DEST and SRC. And store it to
1104 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1105
1106 static reg_errcode_t
1107 internal_function
1108 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1109 {
1110 int is, id, sbase, delta;
1111 if (src == NULL || src->nelem == 0)
1112 return REG_NOERROR;
1113 if (dest->alloc < 2 * src->nelem + dest->nelem)
1114 {
1115 int new_alloc = 2 * (src->nelem + dest->alloc);
1116 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1117 if (BE (new_buffer == NULL, 0))
1118 return REG_ESPACE;
1119 dest->elems = new_buffer;
1120 dest->alloc = new_alloc;
1121 }
1122
1123 if (BE (dest->nelem == 0, 0))
1124 {
1125 dest->nelem = src->nelem;
1126 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1127 return REG_NOERROR;
1128 }
1129
1130 /* Copy into the top of DEST the items of SRC that are not
1131 found in DEST. Maybe we could binary search in DEST? */
1132 for (sbase = dest->nelem + 2 * src->nelem,
1133 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1134 {
1135 if (dest->elems[id] == src->elems[is])
1136 is--, id--;
1137 else if (dest->elems[id] < src->elems[is])
1138 dest->elems[--sbase] = src->elems[is--];
1139 else /* if (dest->elems[id] > src->elems[is]) */
1140 --id;
1141 }
1142
1143 if (is >= 0)
1144 {
1145 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1146 sbase -= is + 1;
1147 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1148 }
1149
1150 id = dest->nelem - 1;
1151 is = dest->nelem + 2 * src->nelem - 1;
1152 delta = is - sbase + 1;
1153 if (delta == 0)
1154 return REG_NOERROR;
1155
1156 /* Now copy. When DELTA becomes zero, the remaining
1157 DEST elements are already in place. */
1158 dest->nelem += delta;
1159 for (;;)
1160 {
1161 if (dest->elems[is] > dest->elems[id])
1162 {
1163 /* Copy from the top. */
1164 dest->elems[id + delta--] = dest->elems[is--];
1165 if (delta == 0)
1166 break;
1167 }
1168 else
1169 {
1170 /* Slide from the bottom. */
1171 dest->elems[id + delta] = dest->elems[id];
1172 if (--id < 0)
1173 {
1174 /* Copy remaining SRC elements. */
1175 memcpy (dest->elems, dest->elems + sbase,
1176 delta * sizeof (int));
1177 break;
1178 }
1179 }
1180 }
1181
1182 return REG_NOERROR;
1183 }
1184
1185 /* Insert the new element ELEM to the re_node_set* SET.
1186 SET should not already have ELEM.
1187 return -1 if an error is occured, return 1 otherwise. */
1188
1189 static int
1190 internal_function
1191 re_node_set_insert (re_node_set *set, int elem)
1192 {
1193 int idx;
1194 /* In case the set is empty. */
1195 if (set->alloc == 0)
1196 {
1197 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1198 return 1;
1199 else
1200 return -1;
1201 }
1202
1203 if (BE (set->nelem, 0) == 0)
1204 {
1205 /* We already guaranteed above that set->alloc != 0. */
1206 set->elems[0] = elem;
1207 ++set->nelem;
1208 return 1;
1209 }
1210
1211 /* Realloc if we need. */
1212 if (set->alloc == set->nelem)
1213 {
1214 int *new_elems;
1215 set->alloc = set->alloc * 2;
1216 new_elems = re_realloc (set->elems, int, set->alloc);
1217 if (BE (new_elems == NULL, 0))
1218 return -1;
1219 set->elems = new_elems;
1220 }
1221
1222 /* Move the elements which follows the new element. Test the
1223 first element separately to skip a check in the inner loop. */
1224 if (elem < set->elems[0])
1225 {
1226 idx = 0;
1227 for (idx = set->nelem; idx > 0; idx--)
1228 set->elems[idx] = set->elems[idx - 1];
1229 }
1230 else
1231 {
1232 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1233 set->elems[idx] = set->elems[idx - 1];
1234 }
1235
1236 /* Insert the new element. */
1237 set->elems[idx] = elem;
1238 ++set->nelem;
1239 return 1;
1240 }
1241
1242 /* Insert the new element ELEM to the re_node_set* SET.
1243 SET should not already have any element greater than or equal to ELEM.
1244 Return -1 if an error is occured, return 1 otherwise. */
1245
1246 static int
1247 internal_function
1248 re_node_set_insert_last (re_node_set *set, int elem)
1249 {
1250 /* Realloc if we need. */
1251 if (set->alloc == set->nelem)
1252 {
1253 int *new_elems;
1254 set->alloc = (set->alloc + 1) * 2;
1255 new_elems = re_realloc (set->elems, int, set->alloc);
1256 if (BE (new_elems == NULL, 0))
1257 return -1;
1258 set->elems = new_elems;
1259 }
1260
1261 /* Insert the new element. */
1262 set->elems[set->nelem++] = elem;
1263 return 1;
1264 }
1265
1266 /* Compare two node sets SET1 and SET2.
1267 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1268
1269 static int
1270 internal_function __attribute ((pure))
1271 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1272 {
1273 int i;
1274 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1275 return 0;
1276 for (i = set1->nelem ; --i >= 0 ; )
1277 if (set1->elems[i] != set2->elems[i])
1278 return 0;
1279 return 1;
1280 }
1281
1282 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1283
1284 static int
1285 internal_function __attribute ((pure))
1286 re_node_set_contains (const re_node_set *set, int elem)
1287 {
1288 unsigned int idx, right, mid;
1289 if (set->nelem <= 0)
1290 return 0;
1291
1292 /* Binary search the element. */
1293 idx = 0;
1294 right = set->nelem - 1;
1295 while (idx < right)
1296 {
1297 mid = (idx + right) / 2;
1298 if (set->elems[mid] < elem)
1299 idx = mid + 1;
1300 else
1301 right = mid;
1302 }
1303 return set->elems[idx] == elem ? idx + 1 : 0;
1304 }
1305
1306 static void
1307 internal_function
1308 re_node_set_remove_at (re_node_set *set, int idx)
1309 {
1310 if (idx < 0 || idx >= set->nelem)
1311 return;
1312 --set->nelem;
1313 for (; idx < set->nelem; idx++)
1314 set->elems[idx] = set->elems[idx + 1];
1315 }
1316 \f
1317
1318 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1319 Or return -1, if an error will be occured. */
1320
1321 static int
1322 internal_function
1323 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1324 {
1325 int type = token.type;
1326 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1327 {
1328 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1329 int *new_nexts, *new_indices;
1330 re_node_set *new_edests, *new_eclosures;
1331 re_token_t *new_nodes;
1332
1333 /* Avoid overflows. */
1334 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1335 return -1;
1336
1337 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1338 if (BE (new_nodes == NULL, 0))
1339 return -1;
1340 dfa->nodes = new_nodes;
1341 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1342 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1343 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1344 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1345 if (BE (new_nexts == NULL || new_indices == NULL
1346 || new_edests == NULL || new_eclosures == NULL, 0))
1347 return -1;
1348 dfa->nexts = new_nexts;
1349 dfa->org_indices = new_indices;
1350 dfa->edests = new_edests;
1351 dfa->eclosures = new_eclosures;
1352 dfa->nodes_alloc = new_nodes_alloc;
1353 }
1354 dfa->nodes[dfa->nodes_len] = token;
1355 dfa->nodes[dfa->nodes_len].constraint = 0;
1356 #ifdef RE_ENABLE_I18N
1357 dfa->nodes[dfa->nodes_len].accept_mb =
1358 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1359 #endif
1360 dfa->nexts[dfa->nodes_len] = -1;
1361 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1362 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1363 return dfa->nodes_len++;
1364 }
1365
1366 static inline unsigned int
1367 internal_function
1368 calc_state_hash (const re_node_set *nodes, unsigned int context)
1369 {
1370 unsigned int hash = nodes->nelem + context;
1371 int i;
1372 for (i = 0 ; i < nodes->nelem ; i++)
1373 hash += nodes->elems[i];
1374 return hash;
1375 }
1376
1377 /* Search for the state whose node_set is equivalent to NODES.
1378 Return the pointer to the state, if we found it in the DFA.
1379 Otherwise create the new one and return it. In case of an error
1380 return NULL and set the error code in ERR.
1381 Note: - We assume NULL as the invalid state, then it is possible that
1382 return value is NULL and ERR is REG_NOERROR.
1383 - We never return non-NULL value in case of any errors, it is for
1384 optimization. */
1385
1386 static re_dfastate_t *
1387 internal_function
1388 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1389 const re_node_set *nodes)
1390 {
1391 unsigned int hash;
1392 re_dfastate_t *new_state;
1393 struct re_state_table_entry *spot;
1394 int i;
1395 if (BE (nodes->nelem == 0, 0))
1396 {
1397 *err = REG_NOERROR;
1398 return NULL;
1399 }
1400 hash = calc_state_hash (nodes, 0);
1401 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1402
1403 for (i = 0 ; i < spot->num ; i++)
1404 {
1405 re_dfastate_t *state = spot->array[i];
1406 if (hash != state->hash)
1407 continue;
1408 if (re_node_set_compare (&state->nodes, nodes))
1409 return state;
1410 }
1411
1412 /* There are no appropriate state in the dfa, create the new one. */
1413 new_state = create_ci_newstate (dfa, nodes, hash);
1414 if (BE (new_state == NULL, 0))
1415 *err = REG_ESPACE;
1416
1417 return new_state;
1418 }
1419
1420 /* Search for the state whose node_set is equivalent to NODES and
1421 whose context is equivalent to CONTEXT.
1422 Return the pointer to the state, if we found it in the DFA.
1423 Otherwise create the new one and return it. In case of an error
1424 return NULL and set the error code in ERR.
1425 Note: - We assume NULL as the invalid state, then it is possible that
1426 return value is NULL and ERR is REG_NOERROR.
1427 - We never return non-NULL value in case of any errors, it is for
1428 optimization. */
1429
1430 static re_dfastate_t *
1431 internal_function
1432 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1433 const re_node_set *nodes, unsigned int context)
1434 {
1435 unsigned int hash;
1436 re_dfastate_t *new_state;
1437 struct re_state_table_entry *spot;
1438 int i;
1439 if (nodes->nelem == 0)
1440 {
1441 *err = REG_NOERROR;
1442 return NULL;
1443 }
1444 hash = calc_state_hash (nodes, context);
1445 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1446
1447 for (i = 0 ; i < spot->num ; i++)
1448 {
1449 re_dfastate_t *state = spot->array[i];
1450 if (state->hash == hash
1451 && state->context == context
1452 && re_node_set_compare (state->entrance_nodes, nodes))
1453 return state;
1454 }
1455 /* There are no appropriate state in `dfa', create the new one. */
1456 new_state = create_cd_newstate (dfa, nodes, context, hash);
1457 if (BE (new_state == NULL, 0))
1458 *err = REG_ESPACE;
1459
1460 return new_state;
1461 }
1462
1463 /* Finish initialization of the new state NEWSTATE, and using its hash value
1464 HASH put in the appropriate bucket of DFA's state table. Return value
1465 indicates the error code if failed. */
1466
1467 static reg_errcode_t
1468 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1469 unsigned int hash)
1470 {
1471 struct re_state_table_entry *spot;
1472 reg_errcode_t err;
1473 int i;
1474
1475 newstate->hash = hash;
1476 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1477 if (BE (err != REG_NOERROR, 0))
1478 return REG_ESPACE;
1479 for (i = 0; i < newstate->nodes.nelem; i++)
1480 {
1481 int elem = newstate->nodes.elems[i];
1482 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1483 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1484 }
1485
1486 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1487 if (BE (spot->alloc <= spot->num, 0))
1488 {
1489 int new_alloc = 2 * spot->num + 2;
1490 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1491 new_alloc);
1492 if (BE (new_array == NULL, 0))
1493 return REG_ESPACE;
1494 spot->array = new_array;
1495 spot->alloc = new_alloc;
1496 }
1497 spot->array[spot->num++] = newstate;
1498 return REG_NOERROR;
1499 }
1500
1501 static void
1502 free_state (re_dfastate_t *state)
1503 {
1504 re_node_set_free (&state->non_eps_nodes);
1505 re_node_set_free (&state->inveclosure);
1506 if (state->entrance_nodes != &state->nodes)
1507 {
1508 re_node_set_free (state->entrance_nodes);
1509 re_free (state->entrance_nodes);
1510 }
1511 re_node_set_free (&state->nodes);
1512 re_free (state->word_trtable);
1513 re_free (state->trtable);
1514 re_free (state);
1515 }
1516
1517 /* Create the new state which is independ of contexts.
1518 Return the new state if succeeded, otherwise return NULL. */
1519
1520 static re_dfastate_t *
1521 internal_function
1522 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1523 unsigned int hash)
1524 {
1525 int i;
1526 reg_errcode_t err;
1527 re_dfastate_t *newstate;
1528
1529 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1530 if (BE (newstate == NULL, 0))
1531 return NULL;
1532 err = re_node_set_init_copy (&newstate->nodes, nodes);
1533 if (BE (err != REG_NOERROR, 0))
1534 {
1535 re_free (newstate);
1536 return NULL;
1537 }
1538
1539 newstate->entrance_nodes = &newstate->nodes;
1540 for (i = 0 ; i < nodes->nelem ; i++)
1541 {
1542 re_token_t *node = dfa->nodes + nodes->elems[i];
1543 re_token_type_t type = node->type;
1544 if (type == CHARACTER && !node->constraint)
1545 continue;
1546 #ifdef RE_ENABLE_I18N
1547 newstate->accept_mb |= node->accept_mb;
1548 #endif /* RE_ENABLE_I18N */
1549
1550 /* If the state has the halt node, the state is a halt state. */
1551 if (type == END_OF_RE)
1552 newstate->halt = 1;
1553 else if (type == OP_BACK_REF)
1554 newstate->has_backref = 1;
1555 else if (type == ANCHOR || node->constraint)
1556 newstate->has_constraint = 1;
1557 }
1558 err = register_state (dfa, newstate, hash);
1559 if (BE (err != REG_NOERROR, 0))
1560 {
1561 free_state (newstate);
1562 newstate = NULL;
1563 }
1564 return newstate;
1565 }
1566
1567 /* Create the new state which is depend on the context CONTEXT.
1568 Return the new state if succeeded, otherwise return NULL. */
1569
1570 static re_dfastate_t *
1571 internal_function
1572 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1573 unsigned int context, unsigned int hash)
1574 {
1575 int i, nctx_nodes = 0;
1576 reg_errcode_t err;
1577 re_dfastate_t *newstate;
1578
1579 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1580 if (BE (newstate == NULL, 0))
1581 return NULL;
1582 err = re_node_set_init_copy (&newstate->nodes, nodes);
1583 if (BE (err != REG_NOERROR, 0))
1584 {
1585 re_free (newstate);
1586 return NULL;
1587 }
1588
1589 newstate->context = context;
1590 newstate->entrance_nodes = &newstate->nodes;
1591
1592 for (i = 0 ; i < nodes->nelem ; i++)
1593 {
1594 unsigned int constraint = 0;
1595 re_token_t *node = dfa->nodes + nodes->elems[i];
1596 re_token_type_t type = node->type;
1597 if (node->constraint)
1598 constraint = node->constraint;
1599
1600 if (type == CHARACTER && !constraint)
1601 continue;
1602 #ifdef RE_ENABLE_I18N
1603 newstate->accept_mb |= node->accept_mb;
1604 #endif /* RE_ENABLE_I18N */
1605
1606 /* If the state has the halt node, the state is a halt state. */
1607 if (type == END_OF_RE)
1608 newstate->halt = 1;
1609 else if (type == OP_BACK_REF)
1610 newstate->has_backref = 1;
1611 else if (type == ANCHOR)
1612 constraint = node->opr.ctx_type;
1613
1614 if (constraint)
1615 {
1616 if (newstate->entrance_nodes == &newstate->nodes)
1617 {
1618 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1619 if (BE (newstate->entrance_nodes == NULL, 0))
1620 {
1621 free_state (newstate);
1622 return NULL;
1623 }
1624 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1625 nctx_nodes = 0;
1626 newstate->has_constraint = 1;
1627 }
1628
1629 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1630 {
1631 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1632 ++nctx_nodes;
1633 }
1634 }
1635 }
1636 err = register_state (dfa, newstate, hash);
1637 if (BE (err != REG_NOERROR, 0))
1638 {
1639 free_state (newstate);
1640 newstate = NULL;
1641 }
1642 return newstate;
1643 }