]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regex_internal.c
* posix/regex_internal.c (re_string_skip_chars): If no character has
[thirdparty/glibc.git] / posix / regex_internal.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
32 \f
33 /* Functions for string operation. */
34
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
37
38 static reg_errcode_t
39 internal_function
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
42 {
43 reg_errcode_t ret;
44 int init_buf_len;
45
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
55
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
62 }
63
64 /* This function allocate the buffers, and initialize them. */
65
66 static reg_errcode_t
67 internal_function
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
70 {
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
74
75 if (len > 0)
76 {
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
80 }
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82
83 if (icase)
84 {
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
87 {
88 while (1)
89 {
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
100 }
101 }
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
105 }
106 else
107 {
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
113 {
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
117 {
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
120 }
121 }
122 }
123
124 return REG_NOERROR;
125 }
126
127 /* Helper functions for re_string_allocate, and re_string_construct. */
128
129 static reg_errcode_t
130 internal_function
131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
132 {
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
135 {
136 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
137 if (BE (new_wcs == NULL, 0))
138 return REG_ESPACE;
139 pstr->wcs = new_wcs;
140 if (pstr->offsets != NULL)
141 {
142 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
143 if (BE (new_offsets == NULL, 0))
144 return REG_ESPACE;
145 pstr->offsets = new_offsets;
146 }
147 }
148 #endif /* RE_ENABLE_I18N */
149 if (pstr->mbs_allocated)
150 {
151 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152 new_buf_len);
153 if (BE (new_mbs == NULL, 0))
154 return REG_ESPACE;
155 pstr->mbs = new_mbs;
156 }
157 pstr->bufs_len = new_buf_len;
158 return REG_NOERROR;
159 }
160
161
162 static void
163 internal_function
164 re_string_construct_common (const char *str, int len, re_string_t *pstr,
165 RE_TRANSLATE_TYPE trans, int icase,
166 const re_dfa_t *dfa)
167 {
168 pstr->raw_mbs = (const unsigned char *) str;
169 pstr->len = len;
170 pstr->raw_len = len;
171 pstr->trans = trans;
172 pstr->icase = icase ? 1 : 0;
173 pstr->mbs_allocated = (trans != NULL || icase);
174 pstr->mb_cur_max = dfa->mb_cur_max;
175 pstr->is_utf8 = dfa->is_utf8;
176 pstr->map_notascii = dfa->map_notascii;
177 pstr->stop = pstr->len;
178 pstr->raw_stop = pstr->stop;
179 }
180
181 #ifdef RE_ENABLE_I18N
182
183 /* Build wide character buffer PSTR->WCS.
184 If the byte sequence of the string are:
185 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
186 Then wide character buffer will be:
187 <wc1> , WEOF , <wc2> , WEOF , <wc3>
188 We use WEOF for padding, they indicate that the position isn't
189 a first byte of a multibyte character.
190
191 Note that this function assumes PSTR->VALID_LEN elements are already
192 built and starts from PSTR->VALID_LEN. */
193
194 static void
195 internal_function
196 build_wcs_buffer (re_string_t *pstr)
197 {
198 #ifdef _LIBC
199 unsigned char buf[MB_LEN_MAX];
200 assert (MB_LEN_MAX >= pstr->mb_cur_max);
201 #else
202 unsigned char buf[64];
203 #endif
204 mbstate_t prev_st;
205 int byte_idx, end_idx, remain_len;
206 size_t mbclen;
207
208 /* Build the buffers from pstr->valid_len to either pstr->len or
209 pstr->bufs_len. */
210 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
211 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
212 {
213 wchar_t wc;
214 const char *p;
215
216 remain_len = end_idx - byte_idx;
217 prev_st = pstr->cur_state;
218 /* Apply the translation if we need. */
219 if (BE (pstr->trans != NULL, 0))
220 {
221 int i, ch;
222
223 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
224 {
225 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
226 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
227 }
228 p = (const char *) buf;
229 }
230 else
231 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
232 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
233 if (BE (mbclen == (size_t) -2, 0))
234 {
235 /* The buffer doesn't have enough space, finish to build. */
236 pstr->cur_state = prev_st;
237 break;
238 }
239 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
240 {
241 /* We treat these cases as a singlebyte character. */
242 mbclen = 1;
243 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
244 if (BE (pstr->trans != NULL, 0))
245 wc = pstr->trans[wc];
246 pstr->cur_state = prev_st;
247 }
248
249 /* Write wide character and padding. */
250 pstr->wcs[byte_idx++] = wc;
251 /* Write paddings. */
252 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
253 pstr->wcs[byte_idx++] = WEOF;
254 }
255 pstr->valid_len = byte_idx;
256 pstr->valid_raw_len = byte_idx;
257 }
258
259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
260 but for REG_ICASE. */
261
262 static reg_errcode_t
263 internal_function
264 build_wcs_upper_buffer (re_string_t *pstr)
265 {
266 mbstate_t prev_st;
267 int src_idx, byte_idx, end_idx, remain_len;
268 size_t mbclen;
269 #ifdef _LIBC
270 char buf[MB_LEN_MAX];
271 assert (MB_LEN_MAX >= pstr->mb_cur_max);
272 #else
273 char buf[64];
274 #endif
275
276 byte_idx = pstr->valid_len;
277 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
278
279 /* The following optimization assumes that ASCII characters can be
280 mapped to wide characters with a simple cast. */
281 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
282 {
283 while (byte_idx < end_idx)
284 {
285 wchar_t wc;
286
287 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
288 && mbsinit (&pstr->cur_state))
289 {
290 /* In case of a singlebyte character. */
291 pstr->mbs[byte_idx]
292 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
293 /* The next step uses the assumption that wchar_t is encoded
294 ASCII-safe: all ASCII values can be converted like this. */
295 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
296 ++byte_idx;
297 continue;
298 }
299
300 remain_len = end_idx - byte_idx;
301 prev_st = pstr->cur_state;
302 mbclen = mbrtowc (&wc,
303 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
304 + byte_idx), remain_len, &pstr->cur_state);
305 if (BE (mbclen + 2 > 2, 1))
306 {
307 wchar_t wcu = wc;
308 if (iswlower (wc))
309 {
310 size_t mbcdlen;
311
312 wcu = towupper (wc);
313 mbcdlen = wcrtomb (buf, wcu, &prev_st);
314 if (BE (mbclen == mbcdlen, 1))
315 memcpy (pstr->mbs + byte_idx, buf, mbclen);
316 else
317 {
318 src_idx = byte_idx;
319 goto offsets_needed;
320 }
321 }
322 else
323 memcpy (pstr->mbs + byte_idx,
324 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
325 pstr->wcs[byte_idx++] = wcu;
326 /* Write paddings. */
327 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
328 pstr->wcs[byte_idx++] = WEOF;
329 }
330 else if (mbclen == (size_t) -1 || mbclen == 0)
331 {
332 /* It is an invalid character or '\0'. Just use the byte. */
333 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
334 pstr->mbs[byte_idx] = ch;
335 /* And also cast it to wide char. */
336 pstr->wcs[byte_idx++] = (wchar_t) ch;
337 if (BE (mbclen == (size_t) -1, 0))
338 pstr->cur_state = prev_st;
339 }
340 else
341 {
342 /* The buffer doesn't have enough space, finish to build. */
343 pstr->cur_state = prev_st;
344 break;
345 }
346 }
347 pstr->valid_len = byte_idx;
348 pstr->valid_raw_len = byte_idx;
349 return REG_NOERROR;
350 }
351 else
352 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
353 {
354 wchar_t wc;
355 const char *p;
356 offsets_needed:
357 remain_len = end_idx - byte_idx;
358 prev_st = pstr->cur_state;
359 if (BE (pstr->trans != NULL, 0))
360 {
361 int i, ch;
362
363 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
364 {
365 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
366 buf[i] = pstr->trans[ch];
367 }
368 p = (const char *) buf;
369 }
370 else
371 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
372 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
373 if (BE (mbclen + 2 > 2, 1))
374 {
375 wchar_t wcu = wc;
376 if (iswlower (wc))
377 {
378 size_t mbcdlen;
379
380 wcu = towupper (wc);
381 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
382 if (BE (mbclen == mbcdlen, 1))
383 memcpy (pstr->mbs + byte_idx, buf, mbclen);
384 else if (mbcdlen != (size_t) -1)
385 {
386 size_t i;
387
388 if (byte_idx + mbcdlen > pstr->bufs_len)
389 {
390 pstr->cur_state = prev_st;
391 break;
392 }
393
394 if (pstr->offsets == NULL)
395 {
396 pstr->offsets = re_malloc (int, pstr->bufs_len);
397
398 if (pstr->offsets == NULL)
399 return REG_ESPACE;
400 }
401 if (!pstr->offsets_needed)
402 {
403 for (i = 0; i < (size_t) byte_idx; ++i)
404 pstr->offsets[i] = i;
405 pstr->offsets_needed = 1;
406 }
407
408 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
409 pstr->wcs[byte_idx] = wcu;
410 pstr->offsets[byte_idx] = src_idx;
411 for (i = 1; i < mbcdlen; ++i)
412 {
413 pstr->offsets[byte_idx + i]
414 = src_idx + (i < mbclen ? i : mbclen - 1);
415 pstr->wcs[byte_idx + i] = WEOF;
416 }
417 pstr->len += mbcdlen - mbclen;
418 if (pstr->raw_stop > src_idx)
419 pstr->stop += mbcdlen - mbclen;
420 end_idx = (pstr->bufs_len > pstr->len)
421 ? pstr->len : pstr->bufs_len;
422 byte_idx += mbcdlen;
423 src_idx += mbclen;
424 continue;
425 }
426 else
427 memcpy (pstr->mbs + byte_idx, p, mbclen);
428 }
429 else
430 memcpy (pstr->mbs + byte_idx, p, mbclen);
431
432 if (BE (pstr->offsets_needed != 0, 0))
433 {
434 size_t i;
435 for (i = 0; i < mbclen; ++i)
436 pstr->offsets[byte_idx + i] = src_idx + i;
437 }
438 src_idx += mbclen;
439
440 pstr->wcs[byte_idx++] = wcu;
441 /* Write paddings. */
442 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
443 pstr->wcs[byte_idx++] = WEOF;
444 }
445 else if (mbclen == (size_t) -1 || mbclen == 0)
446 {
447 /* It is an invalid character or '\0'. Just use the byte. */
448 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
449
450 if (BE (pstr->trans != NULL, 0))
451 ch = pstr->trans [ch];
452 pstr->mbs[byte_idx] = ch;
453
454 if (BE (pstr->offsets_needed != 0, 0))
455 pstr->offsets[byte_idx] = src_idx;
456 ++src_idx;
457
458 /* And also cast it to wide char. */
459 pstr->wcs[byte_idx++] = (wchar_t) ch;
460 if (BE (mbclen == (size_t) -1, 0))
461 pstr->cur_state = prev_st;
462 }
463 else
464 {
465 /* The buffer doesn't have enough space, finish to build. */
466 pstr->cur_state = prev_st;
467 break;
468 }
469 }
470 pstr->valid_len = byte_idx;
471 pstr->valid_raw_len = src_idx;
472 return REG_NOERROR;
473 }
474
475 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
476 Return the index. */
477
478 static int
479 internal_function
480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
481 {
482 mbstate_t prev_st;
483 int rawbuf_idx;
484 size_t mbclen;
485 wchar_t wc = WEOF;
486
487 /* Skip the characters which are not necessary to check. */
488 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
489 rawbuf_idx < new_raw_idx;)
490 {
491 int remain_len;
492 remain_len = pstr->len - rawbuf_idx;
493 prev_st = pstr->cur_state;
494 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
495 remain_len, &pstr->cur_state);
496 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
497 {
498 /* We treat these cases as a single byte character. */
499 if (mbclen == 0 || remain_len == 0)
500 wc = L'\0';
501 else
502 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
503 mbclen = 1;
504 pstr->cur_state = prev_st;
505 }
506 /* Then proceed the next character. */
507 rawbuf_idx += mbclen;
508 }
509 *last_wc = (wint_t) wc;
510 return rawbuf_idx;
511 }
512 #endif /* RE_ENABLE_I18N */
513
514 /* Build the buffer PSTR->MBS, and apply the translation if we need.
515 This function is used in case of REG_ICASE. */
516
517 static void
518 internal_function
519 build_upper_buffer (re_string_t *pstr)
520 {
521 int char_idx, end_idx;
522 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
523
524 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
525 {
526 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
527 if (BE (pstr->trans != NULL, 0))
528 ch = pstr->trans[ch];
529 if (islower (ch))
530 pstr->mbs[char_idx] = toupper (ch);
531 else
532 pstr->mbs[char_idx] = ch;
533 }
534 pstr->valid_len = char_idx;
535 pstr->valid_raw_len = char_idx;
536 }
537
538 /* Apply TRANS to the buffer in PSTR. */
539
540 static void
541 internal_function
542 re_string_translate_buffer (re_string_t *pstr)
543 {
544 int buf_idx, end_idx;
545 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
546
547 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
548 {
549 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
550 pstr->mbs[buf_idx] = pstr->trans[ch];
551 }
552
553 pstr->valid_len = buf_idx;
554 pstr->valid_raw_len = buf_idx;
555 }
556
557 /* This function re-construct the buffers.
558 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
559 convert to upper case in case of REG_ICASE, apply translation. */
560
561 static reg_errcode_t
562 internal_function
563 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
564 {
565 int offset = idx - pstr->raw_mbs_idx;
566 if (BE (offset < 0, 0))
567 {
568 /* Reset buffer. */
569 #ifdef RE_ENABLE_I18N
570 if (pstr->mb_cur_max > 1)
571 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
572 #endif /* RE_ENABLE_I18N */
573 pstr->len = pstr->raw_len;
574 pstr->stop = pstr->raw_stop;
575 pstr->valid_len = 0;
576 pstr->raw_mbs_idx = 0;
577 pstr->valid_raw_len = 0;
578 pstr->offsets_needed = 0;
579 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
580 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
581 if (!pstr->mbs_allocated)
582 pstr->mbs = (unsigned char *) pstr->raw_mbs;
583 offset = idx;
584 }
585
586 if (BE (offset != 0, 1))
587 {
588 /* Are the characters which are already checked remain? */
589 if (BE (offset < pstr->valid_raw_len, 1)
590 #ifdef RE_ENABLE_I18N
591 /* Handling this would enlarge the code too much.
592 Accept a slowdown in that case. */
593 && pstr->offsets_needed == 0
594 #endif
595 )
596 {
597 /* Yes, move them to the front of the buffer. */
598 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
599 #ifdef RE_ENABLE_I18N
600 if (pstr->mb_cur_max > 1)
601 memmove (pstr->wcs, pstr->wcs + offset,
602 (pstr->valid_len - offset) * sizeof (wint_t));
603 #endif /* RE_ENABLE_I18N */
604 if (BE (pstr->mbs_allocated, 0))
605 memmove (pstr->mbs, pstr->mbs + offset,
606 pstr->valid_len - offset);
607 pstr->valid_len -= offset;
608 pstr->valid_raw_len -= offset;
609 #if DEBUG
610 assert (pstr->valid_len > 0);
611 #endif
612 }
613 else
614 {
615 /* No, skip all characters until IDX. */
616 #ifdef RE_ENABLE_I18N
617 if (BE (pstr->offsets_needed, 0))
618 {
619 pstr->len = pstr->raw_len - idx + offset;
620 pstr->stop = pstr->raw_stop - idx + offset;
621 pstr->offsets_needed = 0;
622 }
623 #endif
624 pstr->valid_len = 0;
625 #ifdef RE_ENABLE_I18N
626 if (pstr->mb_cur_max > 1)
627 {
628 int wcs_idx;
629 wint_t wc = WEOF;
630
631 if (pstr->is_utf8)
632 {
633 const unsigned char *raw, *p, *q, *end;
634
635 /* Special case UTF-8. Multi-byte chars start with any
636 byte other than 0x80 - 0xbf. */
637 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
638 end = raw + (offset - pstr->mb_cur_max);
639 p = raw + offset - 1;
640 #ifdef _LIBC
641 /* We know the wchar_t encoding is UCS4, so for the simple
642 case, ASCII characters, skip the conversion step. */
643 if (isascii (*p) && BE (pstr->trans == NULL, 1))
644 {
645 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
646 pstr->valid_len = 0;
647 wc = (wchar_t) *p;
648 }
649 else
650 #endif
651 for (; p >= end; --p)
652 if ((*p & 0xc0) != 0x80)
653 {
654 mbstate_t cur_state;
655 wchar_t wc2;
656 int mlen = raw + pstr->len - p;
657 unsigned char buf[6];
658 size_t mbclen;
659
660 q = p;
661 if (BE (pstr->trans != NULL, 0))
662 {
663 int i = mlen < 6 ? mlen : 6;
664 while (--i >= 0)
665 buf[i] = pstr->trans[p[i]];
666 q = buf;
667 }
668 /* XXX Don't use mbrtowc, we know which conversion
669 to use (UTF-8 -> UCS4). */
670 memset (&cur_state, 0, sizeof (cur_state));
671 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
672 &cur_state);
673 if (raw + offset - p <= mbclen
674 && mbclen < (size_t) -2)
675 {
676 memset (&pstr->cur_state, '\0',
677 sizeof (mbstate_t));
678 pstr->valid_len = mbclen - (raw + offset - p);
679 wc = wc2;
680 }
681 break;
682 }
683 }
684
685 if (wc == WEOF)
686 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
687 if (wc == WEOF)
688 pstr->tip_context
689 = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
690 else
691 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
692 && IS_WIDE_WORD_CHAR (wc))
693 ? CONTEXT_WORD
694 : ((IS_WIDE_NEWLINE (wc)
695 && pstr->newline_anchor)
696 ? CONTEXT_NEWLINE : 0));
697 if (BE (pstr->valid_len, 0))
698 {
699 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
700 pstr->wcs[wcs_idx] = WEOF;
701 if (pstr->mbs_allocated)
702 memset (pstr->mbs, 255, pstr->valid_len);
703 }
704 pstr->valid_raw_len = pstr->valid_len;
705 }
706 else
707 #endif /* RE_ENABLE_I18N */
708 {
709 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
710 pstr->valid_raw_len = 0;
711 if (pstr->trans)
712 c = pstr->trans[c];
713 pstr->tip_context = (bitset_contain (pstr->word_char, c)
714 ? CONTEXT_WORD
715 : ((IS_NEWLINE (c) && pstr->newline_anchor)
716 ? CONTEXT_NEWLINE : 0));
717 }
718 }
719 if (!BE (pstr->mbs_allocated, 0))
720 pstr->mbs += offset;
721 }
722 pstr->raw_mbs_idx = idx;
723 pstr->len -= offset;
724 pstr->stop -= offset;
725
726 /* Then build the buffers. */
727 #ifdef RE_ENABLE_I18N
728 if (pstr->mb_cur_max > 1)
729 {
730 if (pstr->icase)
731 {
732 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
733 if (BE (ret != REG_NOERROR, 0))
734 return ret;
735 }
736 else
737 build_wcs_buffer (pstr);
738 }
739 else
740 #endif /* RE_ENABLE_I18N */
741 if (BE (pstr->mbs_allocated, 0))
742 {
743 if (pstr->icase)
744 build_upper_buffer (pstr);
745 else if (pstr->trans != NULL)
746 re_string_translate_buffer (pstr);
747 }
748 else
749 pstr->valid_len = pstr->len;
750
751 pstr->cur_idx = 0;
752 return REG_NOERROR;
753 }
754
755 static unsigned char
756 internal_function __attribute ((pure))
757 re_string_peek_byte_case (const re_string_t *pstr, int idx)
758 {
759 int ch, off;
760
761 /* Handle the common (easiest) cases first. */
762 if (BE (!pstr->mbs_allocated, 1))
763 return re_string_peek_byte (pstr, idx);
764
765 #ifdef RE_ENABLE_I18N
766 if (pstr->mb_cur_max > 1
767 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
768 return re_string_peek_byte (pstr, idx);
769 #endif
770
771 off = pstr->cur_idx + idx;
772 #ifdef RE_ENABLE_I18N
773 if (pstr->offsets_needed)
774 off = pstr->offsets[off];
775 #endif
776
777 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
778
779 #ifdef RE_ENABLE_I18N
780 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
781 this function returns CAPITAL LETTER I instead of first byte of
782 DOTLESS SMALL LETTER I. The latter would confuse the parser,
783 since peek_byte_case doesn't advance cur_idx in any way. */
784 if (pstr->offsets_needed && !isascii (ch))
785 return re_string_peek_byte (pstr, idx);
786 #endif
787
788 return ch;
789 }
790
791 static unsigned char
792 internal_function __attribute ((pure))
793 re_string_fetch_byte_case (re_string_t *pstr)
794 {
795 if (BE (!pstr->mbs_allocated, 1))
796 return re_string_fetch_byte (pstr);
797
798 #ifdef RE_ENABLE_I18N
799 if (pstr->offsets_needed)
800 {
801 int off, ch;
802
803 /* For tr_TR.UTF-8 [[:islower:]] there is
804 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
805 in that case the whole multi-byte character and return
806 the original letter. On the other side, with
807 [[: DOTLESS SMALL LETTER I return [[:I, as doing
808 anything else would complicate things too much. */
809
810 if (!re_string_first_byte (pstr, pstr->cur_idx))
811 return re_string_fetch_byte (pstr);
812
813 off = pstr->offsets[pstr->cur_idx];
814 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
815
816 if (! isascii (ch))
817 return re_string_fetch_byte (pstr);
818
819 re_string_skip_bytes (pstr,
820 re_string_char_size_at (pstr, pstr->cur_idx));
821 return ch;
822 }
823 #endif
824
825 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
826 }
827
828 static void
829 internal_function
830 re_string_destruct (re_string_t *pstr)
831 {
832 #ifdef RE_ENABLE_I18N
833 re_free (pstr->wcs);
834 re_free (pstr->offsets);
835 #endif /* RE_ENABLE_I18N */
836 if (pstr->mbs_allocated)
837 re_free (pstr->mbs);
838 }
839
840 /* Return the context at IDX in INPUT. */
841
842 static unsigned int
843 internal_function
844 re_string_context_at (const re_string_t *input, int idx, int eflags)
845 {
846 int c;
847 if (BE (idx < 0, 0))
848 /* In this case, we use the value stored in input->tip_context,
849 since we can't know the character in input->mbs[-1] here. */
850 return input->tip_context;
851 if (BE (idx == input->len, 0))
852 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
853 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
854 #ifdef RE_ENABLE_I18N
855 if (input->mb_cur_max > 1)
856 {
857 wint_t wc;
858 int wc_idx = idx;
859 while(input->wcs[wc_idx] == WEOF)
860 {
861 #ifdef DEBUG
862 /* It must not happen. */
863 assert (wc_idx >= 0);
864 #endif
865 --wc_idx;
866 if (wc_idx < 0)
867 return input->tip_context;
868 }
869 wc = input->wcs[wc_idx];
870 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
871 return CONTEXT_WORD;
872 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
873 ? CONTEXT_NEWLINE : 0);
874 }
875 else
876 #endif
877 {
878 c = re_string_byte_at (input, idx);
879 if (bitset_contain (input->word_char, c))
880 return CONTEXT_WORD;
881 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
882 }
883 }
884 \f
885 /* Functions for set operation. */
886
887 static reg_errcode_t
888 internal_function
889 re_node_set_alloc (re_node_set *set, int size)
890 {
891 set->alloc = size;
892 set->nelem = 0;
893 set->elems = re_malloc (int, size);
894 if (BE (set->elems == NULL, 0))
895 return REG_ESPACE;
896 return REG_NOERROR;
897 }
898
899 static reg_errcode_t
900 internal_function
901 re_node_set_init_1 (re_node_set *set, int elem)
902 {
903 set->alloc = 1;
904 set->nelem = 1;
905 set->elems = re_malloc (int, 1);
906 if (BE (set->elems == NULL, 0))
907 {
908 set->alloc = set->nelem = 0;
909 return REG_ESPACE;
910 }
911 set->elems[0] = elem;
912 return REG_NOERROR;
913 }
914
915 static reg_errcode_t
916 internal_function
917 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
918 {
919 set->alloc = 2;
920 set->elems = re_malloc (int, 2);
921 if (BE (set->elems == NULL, 0))
922 return REG_ESPACE;
923 if (elem1 == elem2)
924 {
925 set->nelem = 1;
926 set->elems[0] = elem1;
927 }
928 else
929 {
930 set->nelem = 2;
931 if (elem1 < elem2)
932 {
933 set->elems[0] = elem1;
934 set->elems[1] = elem2;
935 }
936 else
937 {
938 set->elems[0] = elem2;
939 set->elems[1] = elem1;
940 }
941 }
942 return REG_NOERROR;
943 }
944
945 static reg_errcode_t
946 internal_function
947 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
948 {
949 dest->nelem = src->nelem;
950 if (src->nelem > 0)
951 {
952 dest->alloc = dest->nelem;
953 dest->elems = re_malloc (int, dest->alloc);
954 if (BE (dest->elems == NULL, 0))
955 {
956 dest->alloc = dest->nelem = 0;
957 return REG_ESPACE;
958 }
959 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
960 }
961 else
962 re_node_set_init_empty (dest);
963 return REG_NOERROR;
964 }
965
966 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
967 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
968 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
969
970 static reg_errcode_t
971 internal_function
972 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
973 const re_node_set *src2)
974 {
975 int i1, i2, is, id, delta, sbase;
976 if (src1->nelem == 0 || src2->nelem == 0)
977 return REG_NOERROR;
978
979 /* We need dest->nelem + 2 * elems_in_intersection; this is a
980 conservative estimate. */
981 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
982 {
983 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
984 int *new_elems = re_realloc (dest->elems, int, new_alloc);
985 if (BE (new_elems == NULL, 0))
986 return REG_ESPACE;
987 dest->elems = new_elems;
988 dest->alloc = new_alloc;
989 }
990
991 /* Find the items in the intersection of SRC1 and SRC2, and copy
992 into the top of DEST those that are not already in DEST itself. */
993 sbase = dest->nelem + src1->nelem + src2->nelem;
994 i1 = src1->nelem - 1;
995 i2 = src2->nelem - 1;
996 id = dest->nelem - 1;
997 for (;;)
998 {
999 if (src1->elems[i1] == src2->elems[i2])
1000 {
1001 /* Try to find the item in DEST. Maybe we could binary search? */
1002 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1003 --id;
1004
1005 if (id < 0 || dest->elems[id] != src1->elems[i1])
1006 dest->elems[--sbase] = src1->elems[i1];
1007
1008 if (--i1 < 0 || --i2 < 0)
1009 break;
1010 }
1011
1012 /* Lower the highest of the two items. */
1013 else if (src1->elems[i1] < src2->elems[i2])
1014 {
1015 if (--i2 < 0)
1016 break;
1017 }
1018 else
1019 {
1020 if (--i1 < 0)
1021 break;
1022 }
1023 }
1024
1025 id = dest->nelem - 1;
1026 is = dest->nelem + src1->nelem + src2->nelem - 1;
1027 delta = is - sbase + 1;
1028
1029 /* Now copy. When DELTA becomes zero, the remaining
1030 DEST elements are already in place; this is more or
1031 less the same loop that is in re_node_set_merge. */
1032 dest->nelem += delta;
1033 if (delta > 0 && id >= 0)
1034 for (;;)
1035 {
1036 if (dest->elems[is] > dest->elems[id])
1037 {
1038 /* Copy from the top. */
1039 dest->elems[id + delta--] = dest->elems[is--];
1040 if (delta == 0)
1041 break;
1042 }
1043 else
1044 {
1045 /* Slide from the bottom. */
1046 dest->elems[id + delta] = dest->elems[id];
1047 if (--id < 0)
1048 break;
1049 }
1050 }
1051
1052 /* Copy remaining SRC elements. */
1053 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1054
1055 return REG_NOERROR;
1056 }
1057
1058 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1059 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1060
1061 static reg_errcode_t
1062 internal_function
1063 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1064 const re_node_set *src2)
1065 {
1066 int i1, i2, id;
1067 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1068 {
1069 dest->alloc = src1->nelem + src2->nelem;
1070 dest->elems = re_malloc (int, dest->alloc);
1071 if (BE (dest->elems == NULL, 0))
1072 return REG_ESPACE;
1073 }
1074 else
1075 {
1076 if (src1 != NULL && src1->nelem > 0)
1077 return re_node_set_init_copy (dest, src1);
1078 else if (src2 != NULL && src2->nelem > 0)
1079 return re_node_set_init_copy (dest, src2);
1080 else
1081 re_node_set_init_empty (dest);
1082 return REG_NOERROR;
1083 }
1084 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1085 {
1086 if (src1->elems[i1] > src2->elems[i2])
1087 {
1088 dest->elems[id++] = src2->elems[i2++];
1089 continue;
1090 }
1091 if (src1->elems[i1] == src2->elems[i2])
1092 ++i2;
1093 dest->elems[id++] = src1->elems[i1++];
1094 }
1095 if (i1 < src1->nelem)
1096 {
1097 memcpy (dest->elems + id, src1->elems + i1,
1098 (src1->nelem - i1) * sizeof (int));
1099 id += src1->nelem - i1;
1100 }
1101 else if (i2 < src2->nelem)
1102 {
1103 memcpy (dest->elems + id, src2->elems + i2,
1104 (src2->nelem - i2) * sizeof (int));
1105 id += src2->nelem - i2;
1106 }
1107 dest->nelem = id;
1108 return REG_NOERROR;
1109 }
1110
1111 /* Calculate the union set of the sets DEST and SRC. And store it to
1112 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1113
1114 static reg_errcode_t
1115 internal_function
1116 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1117 {
1118 int is, id, sbase, delta;
1119 if (src == NULL || src->nelem == 0)
1120 return REG_NOERROR;
1121 if (dest->alloc < 2 * src->nelem + dest->nelem)
1122 {
1123 int new_alloc = 2 * (src->nelem + dest->alloc);
1124 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1125 if (BE (new_buffer == NULL, 0))
1126 return REG_ESPACE;
1127 dest->elems = new_buffer;
1128 dest->alloc = new_alloc;
1129 }
1130
1131 if (BE (dest->nelem == 0, 0))
1132 {
1133 dest->nelem = src->nelem;
1134 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1135 return REG_NOERROR;
1136 }
1137
1138 /* Copy into the top of DEST the items of SRC that are not
1139 found in DEST. Maybe we could binary search in DEST? */
1140 for (sbase = dest->nelem + 2 * src->nelem,
1141 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1142 {
1143 if (dest->elems[id] == src->elems[is])
1144 is--, id--;
1145 else if (dest->elems[id] < src->elems[is])
1146 dest->elems[--sbase] = src->elems[is--];
1147 else /* if (dest->elems[id] > src->elems[is]) */
1148 --id;
1149 }
1150
1151 if (is >= 0)
1152 {
1153 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1154 sbase -= is + 1;
1155 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1156 }
1157
1158 id = dest->nelem - 1;
1159 is = dest->nelem + 2 * src->nelem - 1;
1160 delta = is - sbase + 1;
1161 if (delta == 0)
1162 return REG_NOERROR;
1163
1164 /* Now copy. When DELTA becomes zero, the remaining
1165 DEST elements are already in place. */
1166 dest->nelem += delta;
1167 for (;;)
1168 {
1169 if (dest->elems[is] > dest->elems[id])
1170 {
1171 /* Copy from the top. */
1172 dest->elems[id + delta--] = dest->elems[is--];
1173 if (delta == 0)
1174 break;
1175 }
1176 else
1177 {
1178 /* Slide from the bottom. */
1179 dest->elems[id + delta] = dest->elems[id];
1180 if (--id < 0)
1181 {
1182 /* Copy remaining SRC elements. */
1183 memcpy (dest->elems, dest->elems + sbase,
1184 delta * sizeof (int));
1185 break;
1186 }
1187 }
1188 }
1189
1190 return REG_NOERROR;
1191 }
1192
1193 /* Insert the new element ELEM to the re_node_set* SET.
1194 SET should not already have ELEM.
1195 return -1 if an error is occured, return 1 otherwise. */
1196
1197 static int
1198 internal_function
1199 re_node_set_insert (re_node_set *set, int elem)
1200 {
1201 int idx;
1202 /* In case the set is empty. */
1203 if (set->alloc == 0)
1204 {
1205 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1206 return 1;
1207 else
1208 return -1;
1209 }
1210
1211 if (BE (set->nelem, 0) == 0)
1212 {
1213 /* We already guaranteed above that set->alloc != 0. */
1214 set->elems[0] = elem;
1215 ++set->nelem;
1216 return 1;
1217 }
1218
1219 /* Realloc if we need. */
1220 if (set->alloc == set->nelem)
1221 {
1222 int *new_elems;
1223 set->alloc = set->alloc * 2;
1224 new_elems = re_realloc (set->elems, int, set->alloc);
1225 if (BE (new_elems == NULL, 0))
1226 return -1;
1227 set->elems = new_elems;
1228 }
1229
1230 /* Move the elements which follows the new element. Test the
1231 first element separately to skip a check in the inner loop. */
1232 if (elem < set->elems[0])
1233 {
1234 idx = 0;
1235 for (idx = set->nelem; idx > 0; idx--)
1236 set->elems[idx] = set->elems[idx - 1];
1237 }
1238 else
1239 {
1240 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1241 set->elems[idx] = set->elems[idx - 1];
1242 }
1243
1244 /* Insert the new element. */
1245 set->elems[idx] = elem;
1246 ++set->nelem;
1247 return 1;
1248 }
1249
1250 /* Insert the new element ELEM to the re_node_set* SET.
1251 SET should not already have any element greater than or equal to ELEM.
1252 Return -1 if an error is occured, return 1 otherwise. */
1253
1254 static int
1255 internal_function
1256 re_node_set_insert_last (re_node_set *set, int elem)
1257 {
1258 /* Realloc if we need. */
1259 if (set->alloc == set->nelem)
1260 {
1261 int *new_elems;
1262 set->alloc = (set->alloc + 1) * 2;
1263 new_elems = re_realloc (set->elems, int, set->alloc);
1264 if (BE (new_elems == NULL, 0))
1265 return -1;
1266 set->elems = new_elems;
1267 }
1268
1269 /* Insert the new element. */
1270 set->elems[set->nelem++] = elem;
1271 return 1;
1272 }
1273
1274 /* Compare two node sets SET1 and SET2.
1275 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1276
1277 static int
1278 internal_function __attribute ((pure))
1279 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1280 {
1281 int i;
1282 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1283 return 0;
1284 for (i = set1->nelem ; --i >= 0 ; )
1285 if (set1->elems[i] != set2->elems[i])
1286 return 0;
1287 return 1;
1288 }
1289
1290 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1291
1292 static int
1293 internal_function __attribute ((pure))
1294 re_node_set_contains (const re_node_set *set, int elem)
1295 {
1296 unsigned int idx, right, mid;
1297 if (set->nelem <= 0)
1298 return 0;
1299
1300 /* Binary search the element. */
1301 idx = 0;
1302 right = set->nelem - 1;
1303 while (idx < right)
1304 {
1305 mid = (idx + right) / 2;
1306 if (set->elems[mid] < elem)
1307 idx = mid + 1;
1308 else
1309 right = mid;
1310 }
1311 return set->elems[idx] == elem ? idx + 1 : 0;
1312 }
1313
1314 static void
1315 internal_function
1316 re_node_set_remove_at (re_node_set *set, int idx)
1317 {
1318 if (idx < 0 || idx >= set->nelem)
1319 return;
1320 --set->nelem;
1321 for (; idx < set->nelem; idx++)
1322 set->elems[idx] = set->elems[idx + 1];
1323 }
1324 \f
1325
1326 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1327 Or return -1, if an error will be occured. */
1328
1329 static int
1330 internal_function
1331 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1332 {
1333 int type = token.type;
1334 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1335 {
1336 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1337 int *new_nexts, *new_indices;
1338 re_node_set *new_edests, *new_eclosures;
1339 re_token_t *new_nodes;
1340
1341 /* Avoid overflows. */
1342 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1343 return -1;
1344
1345 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1346 if (BE (new_nodes == NULL, 0))
1347 return -1;
1348 dfa->nodes = new_nodes;
1349 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1350 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1351 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1352 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1353 if (BE (new_nexts == NULL || new_indices == NULL
1354 || new_edests == NULL || new_eclosures == NULL, 0))
1355 return -1;
1356 dfa->nexts = new_nexts;
1357 dfa->org_indices = new_indices;
1358 dfa->edests = new_edests;
1359 dfa->eclosures = new_eclosures;
1360 dfa->nodes_alloc = new_nodes_alloc;
1361 }
1362 dfa->nodes[dfa->nodes_len] = token;
1363 dfa->nodes[dfa->nodes_len].constraint = 0;
1364 #ifdef RE_ENABLE_I18N
1365 dfa->nodes[dfa->nodes_len].accept_mb =
1366 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1367 #endif
1368 dfa->nexts[dfa->nodes_len] = -1;
1369 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1370 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1371 return dfa->nodes_len++;
1372 }
1373
1374 static inline unsigned int
1375 internal_function
1376 calc_state_hash (const re_node_set *nodes, unsigned int context)
1377 {
1378 unsigned int hash = nodes->nelem + context;
1379 int i;
1380 for (i = 0 ; i < nodes->nelem ; i++)
1381 hash += nodes->elems[i];
1382 return hash;
1383 }
1384
1385 /* Search for the state whose node_set is equivalent to NODES.
1386 Return the pointer to the state, if we found it in the DFA.
1387 Otherwise create the new one and return it. In case of an error
1388 return NULL and set the error code in ERR.
1389 Note: - We assume NULL as the invalid state, then it is possible that
1390 return value is NULL and ERR is REG_NOERROR.
1391 - We never return non-NULL value in case of any errors, it is for
1392 optimization. */
1393
1394 static re_dfastate_t *
1395 internal_function
1396 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1397 const re_node_set *nodes)
1398 {
1399 unsigned int hash;
1400 re_dfastate_t *new_state;
1401 struct re_state_table_entry *spot;
1402 int i;
1403 if (BE (nodes->nelem == 0, 0))
1404 {
1405 *err = REG_NOERROR;
1406 return NULL;
1407 }
1408 hash = calc_state_hash (nodes, 0);
1409 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1410
1411 for (i = 0 ; i < spot->num ; i++)
1412 {
1413 re_dfastate_t *state = spot->array[i];
1414 if (hash != state->hash)
1415 continue;
1416 if (re_node_set_compare (&state->nodes, nodes))
1417 return state;
1418 }
1419
1420 /* There are no appropriate state in the dfa, create the new one. */
1421 new_state = create_ci_newstate (dfa, nodes, hash);
1422 if (BE (new_state == NULL, 0))
1423 *err = REG_ESPACE;
1424
1425 return new_state;
1426 }
1427
1428 /* Search for the state whose node_set is equivalent to NODES and
1429 whose context is equivalent to CONTEXT.
1430 Return the pointer to the state, if we found it in the DFA.
1431 Otherwise create the new one and return it. In case of an error
1432 return NULL and set the error code in ERR.
1433 Note: - We assume NULL as the invalid state, then it is possible that
1434 return value is NULL and ERR is REG_NOERROR.
1435 - We never return non-NULL value in case of any errors, it is for
1436 optimization. */
1437
1438 static re_dfastate_t *
1439 internal_function
1440 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1441 const re_node_set *nodes, unsigned int context)
1442 {
1443 unsigned int hash;
1444 re_dfastate_t *new_state;
1445 struct re_state_table_entry *spot;
1446 int i;
1447 if (nodes->nelem == 0)
1448 {
1449 *err = REG_NOERROR;
1450 return NULL;
1451 }
1452 hash = calc_state_hash (nodes, context);
1453 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1454
1455 for (i = 0 ; i < spot->num ; i++)
1456 {
1457 re_dfastate_t *state = spot->array[i];
1458 if (state->hash == hash
1459 && state->context == context
1460 && re_node_set_compare (state->entrance_nodes, nodes))
1461 return state;
1462 }
1463 /* There are no appropriate state in `dfa', create the new one. */
1464 new_state = create_cd_newstate (dfa, nodes, context, hash);
1465 if (BE (new_state == NULL, 0))
1466 *err = REG_ESPACE;
1467
1468 return new_state;
1469 }
1470
1471 /* Finish initialization of the new state NEWSTATE, and using its hash value
1472 HASH put in the appropriate bucket of DFA's state table. Return value
1473 indicates the error code if failed. */
1474
1475 static reg_errcode_t
1476 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1477 unsigned int hash)
1478 {
1479 struct re_state_table_entry *spot;
1480 reg_errcode_t err;
1481 int i;
1482
1483 newstate->hash = hash;
1484 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1485 if (BE (err != REG_NOERROR, 0))
1486 return REG_ESPACE;
1487 for (i = 0; i < newstate->nodes.nelem; i++)
1488 {
1489 int elem = newstate->nodes.elems[i];
1490 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1491 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1492 }
1493
1494 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495 if (BE (spot->alloc <= spot->num, 0))
1496 {
1497 int new_alloc = 2 * spot->num + 2;
1498 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1499 new_alloc);
1500 if (BE (new_array == NULL, 0))
1501 return REG_ESPACE;
1502 spot->array = new_array;
1503 spot->alloc = new_alloc;
1504 }
1505 spot->array[spot->num++] = newstate;
1506 return REG_NOERROR;
1507 }
1508
1509 static void
1510 free_state (re_dfastate_t *state)
1511 {
1512 re_node_set_free (&state->non_eps_nodes);
1513 re_node_set_free (&state->inveclosure);
1514 if (state->entrance_nodes != &state->nodes)
1515 {
1516 re_node_set_free (state->entrance_nodes);
1517 re_free (state->entrance_nodes);
1518 }
1519 re_node_set_free (&state->nodes);
1520 re_free (state->word_trtable);
1521 re_free (state->trtable);
1522 re_free (state);
1523 }
1524
1525 /* Create the new state which is independ of contexts.
1526 Return the new state if succeeded, otherwise return NULL. */
1527
1528 static re_dfastate_t *
1529 internal_function
1530 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1531 unsigned int hash)
1532 {
1533 int i;
1534 reg_errcode_t err;
1535 re_dfastate_t *newstate;
1536
1537 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1538 if (BE (newstate == NULL, 0))
1539 return NULL;
1540 err = re_node_set_init_copy (&newstate->nodes, nodes);
1541 if (BE (err != REG_NOERROR, 0))
1542 {
1543 re_free (newstate);
1544 return NULL;
1545 }
1546
1547 newstate->entrance_nodes = &newstate->nodes;
1548 for (i = 0 ; i < nodes->nelem ; i++)
1549 {
1550 re_token_t *node = dfa->nodes + nodes->elems[i];
1551 re_token_type_t type = node->type;
1552 if (type == CHARACTER && !node->constraint)
1553 continue;
1554 #ifdef RE_ENABLE_I18N
1555 newstate->accept_mb |= node->accept_mb;
1556 #endif /* RE_ENABLE_I18N */
1557
1558 /* If the state has the halt node, the state is a halt state. */
1559 if (type == END_OF_RE)
1560 newstate->halt = 1;
1561 else if (type == OP_BACK_REF)
1562 newstate->has_backref = 1;
1563 else if (type == ANCHOR || node->constraint)
1564 newstate->has_constraint = 1;
1565 }
1566 err = register_state (dfa, newstate, hash);
1567 if (BE (err != REG_NOERROR, 0))
1568 {
1569 free_state (newstate);
1570 newstate = NULL;
1571 }
1572 return newstate;
1573 }
1574
1575 /* Create the new state which is depend on the context CONTEXT.
1576 Return the new state if succeeded, otherwise return NULL. */
1577
1578 static re_dfastate_t *
1579 internal_function
1580 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1581 unsigned int context, unsigned int hash)
1582 {
1583 int i, nctx_nodes = 0;
1584 reg_errcode_t err;
1585 re_dfastate_t *newstate;
1586
1587 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1588 if (BE (newstate == NULL, 0))
1589 return NULL;
1590 err = re_node_set_init_copy (&newstate->nodes, nodes);
1591 if (BE (err != REG_NOERROR, 0))
1592 {
1593 re_free (newstate);
1594 return NULL;
1595 }
1596
1597 newstate->context = context;
1598 newstate->entrance_nodes = &newstate->nodes;
1599
1600 for (i = 0 ; i < nodes->nelem ; i++)
1601 {
1602 unsigned int constraint = 0;
1603 re_token_t *node = dfa->nodes + nodes->elems[i];
1604 re_token_type_t type = node->type;
1605 if (node->constraint)
1606 constraint = node->constraint;
1607
1608 if (type == CHARACTER && !constraint)
1609 continue;
1610 #ifdef RE_ENABLE_I18N
1611 newstate->accept_mb |= node->accept_mb;
1612 #endif /* RE_ENABLE_I18N */
1613
1614 /* If the state has the halt node, the state is a halt state. */
1615 if (type == END_OF_RE)
1616 newstate->halt = 1;
1617 else if (type == OP_BACK_REF)
1618 newstate->has_backref = 1;
1619 else if (type == ANCHOR)
1620 constraint = node->opr.ctx_type;
1621
1622 if (constraint)
1623 {
1624 if (newstate->entrance_nodes == &newstate->nodes)
1625 {
1626 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1627 if (BE (newstate->entrance_nodes == NULL, 0))
1628 {
1629 free_state (newstate);
1630 return NULL;
1631 }
1632 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1633 nctx_nodes = 0;
1634 newstate->has_constraint = 1;
1635 }
1636
1637 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1638 {
1639 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1640 ++nctx_nodes;
1641 }
1642 }
1643 }
1644 err = register_state (dfa, newstate, hash);
1645 if (BE (err != REG_NOERROR, 0))
1646 {
1647 free_state (newstate);
1648 newstate = NULL;
1649 }
1650 return newstate;
1651 }