]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regex_internal.c
[BZ #1248]
[thirdparty/glibc.git] / posix / regex_internal.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
32 \f
33 /* Functions for string operation. */
34
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
37
38 static reg_errcode_t
39 internal_function
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
42 {
43 reg_errcode_t ret;
44 int init_buf_len;
45
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
55
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
62 }
63
64 /* This function allocate the buffers, and initialize them. */
65
66 static reg_errcode_t
67 internal_function
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
70 {
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
74
75 if (len > 0)
76 {
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
80 }
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82
83 if (icase)
84 {
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
87 {
88 while (1)
89 {
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
100 }
101 }
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
105 }
106 else
107 {
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
113 {
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
117 {
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
120 }
121 }
122 }
123
124 return REG_NOERROR;
125 }
126
127 /* Helper functions for re_string_allocate, and re_string_construct. */
128
129 static reg_errcode_t
130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
131 {
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
134 {
135 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
136 if (BE (new_wcs == NULL, 0))
137 return REG_ESPACE;
138 pstr->wcs = new_wcs;
139 if (pstr->offsets != NULL)
140 {
141 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
142 if (BE (new_offsets == NULL, 0))
143 return REG_ESPACE;
144 pstr->offsets = new_offsets;
145 }
146 }
147 #endif /* RE_ENABLE_I18N */
148 if (pstr->mbs_allocated)
149 {
150 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
151 new_buf_len);
152 if (BE (new_mbs == NULL, 0))
153 return REG_ESPACE;
154 pstr->mbs = new_mbs;
155 }
156 pstr->bufs_len = new_buf_len;
157 return REG_NOERROR;
158 }
159
160
161 static void
162 re_string_construct_common (const char *str, int len, re_string_t *pstr,
163 RE_TRANSLATE_TYPE trans, int icase,
164 const re_dfa_t *dfa)
165 {
166 pstr->raw_mbs = (const unsigned char *) str;
167 pstr->len = len;
168 pstr->raw_len = len;
169 pstr->trans = trans;
170 pstr->icase = icase ? 1 : 0;
171 pstr->mbs_allocated = (trans != NULL || icase);
172 pstr->mb_cur_max = dfa->mb_cur_max;
173 pstr->is_utf8 = dfa->is_utf8;
174 pstr->map_notascii = dfa->map_notascii;
175 pstr->stop = pstr->len;
176 pstr->raw_stop = pstr->stop;
177 }
178
179 #ifdef RE_ENABLE_I18N
180
181 /* Build wide character buffer PSTR->WCS.
182 If the byte sequence of the string are:
183 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
184 Then wide character buffer will be:
185 <wc1> , WEOF , <wc2> , WEOF , <wc3>
186 We use WEOF for padding, they indicate that the position isn't
187 a first byte of a multibyte character.
188
189 Note that this function assumes PSTR->VALID_LEN elements are already
190 built and starts from PSTR->VALID_LEN. */
191
192 static void
193 build_wcs_buffer (re_string_t *pstr)
194 {
195 #ifdef _LIBC
196 unsigned char buf[MB_LEN_MAX];
197 assert (MB_LEN_MAX >= pstr->mb_cur_max);
198 #else
199 unsigned char buf[64];
200 #endif
201 mbstate_t prev_st;
202 int byte_idx, end_idx, remain_len;
203 size_t mbclen;
204
205 /* Build the buffers from pstr->valid_len to either pstr->len or
206 pstr->bufs_len. */
207 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
208 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
209 {
210 wchar_t wc;
211 const char *p;
212
213 remain_len = end_idx - byte_idx;
214 prev_st = pstr->cur_state;
215 /* Apply the translation if we need. */
216 if (BE (pstr->trans != NULL, 0))
217 {
218 int i, ch;
219
220 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
221 {
222 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
223 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
224 }
225 p = (const char *) buf;
226 }
227 else
228 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
229 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
230 if (BE (mbclen == (size_t) -2, 0))
231 {
232 /* The buffer doesn't have enough space, finish to build. */
233 pstr->cur_state = prev_st;
234 break;
235 }
236 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
237 {
238 /* We treat these cases as a singlebyte character. */
239 mbclen = 1;
240 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
241 if (BE (pstr->trans != NULL, 0))
242 wc = pstr->trans[wc];
243 pstr->cur_state = prev_st;
244 }
245
246 /* Write wide character and padding. */
247 pstr->wcs[byte_idx++] = wc;
248 /* Write paddings. */
249 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
250 pstr->wcs[byte_idx++] = WEOF;
251 }
252 pstr->valid_len = byte_idx;
253 pstr->valid_raw_len = byte_idx;
254 }
255
256 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
257 but for REG_ICASE. */
258
259 static int
260 build_wcs_upper_buffer (re_string_t *pstr)
261 {
262 mbstate_t prev_st;
263 int src_idx, byte_idx, end_idx, remain_len;
264 size_t mbclen;
265 #ifdef _LIBC
266 char buf[MB_LEN_MAX];
267 assert (MB_LEN_MAX >= pstr->mb_cur_max);
268 #else
269 char buf[64];
270 #endif
271
272 byte_idx = pstr->valid_len;
273 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
274
275 /* The following optimization assumes that ASCII characters can be
276 mapped to wide characters with a simple cast. */
277 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
278 {
279 while (byte_idx < end_idx)
280 {
281 wchar_t wc;
282
283 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
284 && mbsinit (&pstr->cur_state))
285 {
286 /* In case of a singlebyte character. */
287 pstr->mbs[byte_idx]
288 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
289 /* The next step uses the assumption that wchar_t is encoded
290 ASCII-safe: all ASCII values can be converted like this. */
291 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
292 ++byte_idx;
293 continue;
294 }
295
296 remain_len = end_idx - byte_idx;
297 prev_st = pstr->cur_state;
298 mbclen = mbrtowc (&wc,
299 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
300 + byte_idx), remain_len, &pstr->cur_state);
301 if (BE (mbclen + 2 > 2, 1))
302 {
303 wchar_t wcu = wc;
304 if (iswlower (wc))
305 {
306 size_t mbcdlen;
307
308 wcu = towupper (wc);
309 mbcdlen = wcrtomb (buf, wcu, &prev_st);
310 if (BE (mbclen == mbcdlen, 1))
311 memcpy (pstr->mbs + byte_idx, buf, mbclen);
312 else
313 {
314 src_idx = byte_idx;
315 goto offsets_needed;
316 }
317 }
318 else
319 memcpy (pstr->mbs + byte_idx,
320 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
321 pstr->wcs[byte_idx++] = wcu;
322 /* Write paddings. */
323 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
324 pstr->wcs[byte_idx++] = WEOF;
325 }
326 else if (mbclen == (size_t) -1 || mbclen == 0)
327 {
328 /* It is an invalid character or '\0'. Just use the byte. */
329 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
330 pstr->mbs[byte_idx] = ch;
331 /* And also cast it to wide char. */
332 pstr->wcs[byte_idx++] = (wchar_t) ch;
333 if (BE (mbclen == (size_t) -1, 0))
334 pstr->cur_state = prev_st;
335 }
336 else
337 {
338 /* The buffer doesn't have enough space, finish to build. */
339 pstr->cur_state = prev_st;
340 break;
341 }
342 }
343 pstr->valid_len = byte_idx;
344 pstr->valid_raw_len = byte_idx;
345 return REG_NOERROR;
346 }
347 else
348 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
349 {
350 wchar_t wc;
351 const char *p;
352 offsets_needed:
353 remain_len = end_idx - byte_idx;
354 prev_st = pstr->cur_state;
355 if (BE (pstr->trans != NULL, 0))
356 {
357 int i, ch;
358
359 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
360 {
361 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
362 buf[i] = pstr->trans[ch];
363 }
364 p = (const char *) buf;
365 }
366 else
367 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
368 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
369 if (BE (mbclen + 2 > 2, 1))
370 {
371 wchar_t wcu = wc;
372 if (iswlower (wc))
373 {
374 size_t mbcdlen;
375
376 wcu = towupper (wc);
377 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
378 if (BE (mbclen == mbcdlen, 1))
379 memcpy (pstr->mbs + byte_idx, buf, mbclen);
380 else if (mbcdlen != (size_t) -1)
381 {
382 size_t i;
383
384 if (byte_idx + mbcdlen > pstr->bufs_len)
385 {
386 pstr->cur_state = prev_st;
387 break;
388 }
389
390 if (pstr->offsets == NULL)
391 {
392 pstr->offsets = re_malloc (int, pstr->bufs_len);
393
394 if (pstr->offsets == NULL)
395 return REG_ESPACE;
396 }
397 if (!pstr->offsets_needed)
398 {
399 for (i = 0; i < (size_t) byte_idx; ++i)
400 pstr->offsets[i] = i;
401 pstr->offsets_needed = 1;
402 }
403
404 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
405 pstr->wcs[byte_idx] = wcu;
406 pstr->offsets[byte_idx] = src_idx;
407 for (i = 1; i < mbcdlen; ++i)
408 {
409 pstr->offsets[byte_idx + i]
410 = src_idx + (i < mbclen ? i : mbclen - 1);
411 pstr->wcs[byte_idx + i] = WEOF;
412 }
413 pstr->len += mbcdlen - mbclen;
414 if (pstr->raw_stop > src_idx)
415 pstr->stop += mbcdlen - mbclen;
416 end_idx = (pstr->bufs_len > pstr->len)
417 ? pstr->len : pstr->bufs_len;
418 byte_idx += mbcdlen;
419 src_idx += mbclen;
420 continue;
421 }
422 else
423 memcpy (pstr->mbs + byte_idx, p, mbclen);
424 }
425 else
426 memcpy (pstr->mbs + byte_idx, p, mbclen);
427
428 if (BE (pstr->offsets_needed != 0, 0))
429 {
430 size_t i;
431 for (i = 0; i < mbclen; ++i)
432 pstr->offsets[byte_idx + i] = src_idx + i;
433 }
434 src_idx += mbclen;
435
436 pstr->wcs[byte_idx++] = wcu;
437 /* Write paddings. */
438 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
439 pstr->wcs[byte_idx++] = WEOF;
440 }
441 else if (mbclen == (size_t) -1 || mbclen == 0)
442 {
443 /* It is an invalid character or '\0'. Just use the byte. */
444 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
445
446 if (BE (pstr->trans != NULL, 0))
447 ch = pstr->trans [ch];
448 pstr->mbs[byte_idx] = ch;
449
450 if (BE (pstr->offsets_needed != 0, 0))
451 pstr->offsets[byte_idx] = src_idx;
452 ++src_idx;
453
454 /* And also cast it to wide char. */
455 pstr->wcs[byte_idx++] = (wchar_t) ch;
456 if (BE (mbclen == (size_t) -1, 0))
457 pstr->cur_state = prev_st;
458 }
459 else
460 {
461 /* The buffer doesn't have enough space, finish to build. */
462 pstr->cur_state = prev_st;
463 break;
464 }
465 }
466 pstr->valid_len = byte_idx;
467 pstr->valid_raw_len = src_idx;
468 return REG_NOERROR;
469 }
470
471 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
472 Return the index. */
473
474 static int
475 internal_function
476 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
477 {
478 mbstate_t prev_st;
479 int rawbuf_idx;
480 size_t mbclen;
481 wchar_t wc = 0;
482
483 /* Skip the characters which are not necessary to check. */
484 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
485 rawbuf_idx < new_raw_idx;)
486 {
487 int remain_len;
488 remain_len = pstr->len - rawbuf_idx;
489 prev_st = pstr->cur_state;
490 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
491 remain_len, &pstr->cur_state);
492 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
493 {
494 /* We treat these cases as a singlebyte character. */
495 mbclen = 1;
496 pstr->cur_state = prev_st;
497 }
498 /* Then proceed the next character. */
499 rawbuf_idx += mbclen;
500 }
501 *last_wc = (wint_t) wc;
502 return rawbuf_idx;
503 }
504 #endif /* RE_ENABLE_I18N */
505
506 /* Build the buffer PSTR->MBS, and apply the translation if we need.
507 This function is used in case of REG_ICASE. */
508
509 static void
510 build_upper_buffer (re_string_t *pstr)
511 {
512 int char_idx, end_idx;
513 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
514
515 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
516 {
517 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
518 if (BE (pstr->trans != NULL, 0))
519 ch = pstr->trans[ch];
520 if (islower (ch))
521 pstr->mbs[char_idx] = toupper (ch);
522 else
523 pstr->mbs[char_idx] = ch;
524 }
525 pstr->valid_len = char_idx;
526 pstr->valid_raw_len = char_idx;
527 }
528
529 /* Apply TRANS to the buffer in PSTR. */
530
531 static void
532 re_string_translate_buffer (re_string_t *pstr)
533 {
534 int buf_idx, end_idx;
535 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
536
537 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
538 {
539 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
540 pstr->mbs[buf_idx] = pstr->trans[ch];
541 }
542
543 pstr->valid_len = buf_idx;
544 pstr->valid_raw_len = buf_idx;
545 }
546
547 /* This function re-construct the buffers.
548 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
549 convert to upper case in case of REG_ICASE, apply translation. */
550
551 static reg_errcode_t
552 internal_function
553 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
554 {
555 int offset = idx - pstr->raw_mbs_idx;
556 if (BE (offset < 0, 0))
557 {
558 /* Reset buffer. */
559 #ifdef RE_ENABLE_I18N
560 if (pstr->mb_cur_max > 1)
561 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
562 #endif /* RE_ENABLE_I18N */
563 pstr->len = pstr->raw_len;
564 pstr->stop = pstr->raw_stop;
565 pstr->valid_len = 0;
566 pstr->raw_mbs_idx = 0;
567 pstr->valid_raw_len = 0;
568 pstr->offsets_needed = 0;
569 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
570 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
571 if (!pstr->mbs_allocated)
572 pstr->mbs = (unsigned char *) pstr->raw_mbs;
573 offset = idx;
574 }
575
576 if (BE (offset != 0, 1))
577 {
578 /* Are the characters which are already checked remain? */
579 if (BE (offset < pstr->valid_raw_len, 1)
580 #ifdef RE_ENABLE_I18N
581 /* Handling this would enlarge the code too much.
582 Accept a slowdown in that case. */
583 && pstr->offsets_needed == 0
584 #endif
585 )
586 {
587 /* Yes, move them to the front of the buffer. */
588 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
589 #ifdef RE_ENABLE_I18N
590 if (pstr->mb_cur_max > 1)
591 memmove (pstr->wcs, pstr->wcs + offset,
592 (pstr->valid_len - offset) * sizeof (wint_t));
593 #endif /* RE_ENABLE_I18N */
594 if (BE (pstr->mbs_allocated, 0))
595 memmove (pstr->mbs, pstr->mbs + offset,
596 pstr->valid_len - offset);
597 pstr->valid_len -= offset;
598 pstr->valid_raw_len -= offset;
599 #if DEBUG
600 assert (pstr->valid_len > 0);
601 #endif
602 }
603 else
604 {
605 /* No, skip all characters until IDX. */
606 #ifdef RE_ENABLE_I18N
607 if (BE (pstr->offsets_needed, 0))
608 {
609 pstr->len = pstr->raw_len - idx + offset;
610 pstr->stop = pstr->raw_stop - idx + offset;
611 pstr->offsets_needed = 0;
612 }
613 #endif
614 pstr->valid_len = 0;
615 pstr->valid_raw_len = 0;
616 #ifdef RE_ENABLE_I18N
617 if (pstr->mb_cur_max > 1)
618 {
619 int wcs_idx;
620 wint_t wc = WEOF;
621
622 if (pstr->is_utf8)
623 {
624 const unsigned char *raw, *p, *q, *end;
625
626 /* Special case UTF-8. Multi-byte chars start with any
627 byte other than 0x80 - 0xbf. */
628 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
629 end = raw + (offset - pstr->mb_cur_max);
630 p = raw + offset - 1;
631 #ifdef _LIBC
632 /* We know the wchar_t encoding is UCS4, so for the simple
633 case, ASCII characters, skip the conversion step. */
634 if (isascii (*p) && BE (pstr->trans == NULL, 1))
635 {
636 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
637 pstr->valid_len = 0;
638 wc = (wchar_t) *p;
639 }
640 else
641 #endif
642 for (; p >= end; --p)
643 if ((*p & 0xc0) != 0x80)
644 {
645 mbstate_t cur_state;
646 wchar_t wc2;
647 int mlen = raw + pstr->len - p;
648 unsigned char buf[6];
649 size_t mbclen;
650
651 q = p;
652 if (BE (pstr->trans != NULL, 0))
653 {
654 int i = mlen < 6 ? mlen : 6;
655 while (--i >= 0)
656 buf[i] = pstr->trans[p[i]];
657 q = buf;
658 }
659 /* XXX Don't use mbrtowc, we know which conversion
660 to use (UTF-8 -> UCS4). */
661 memset (&cur_state, 0, sizeof (cur_state));
662 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
663 &cur_state);
664 if (raw + offset - p <= mbclen
665 && mbclen < (size_t) -2)
666 {
667 memset (&pstr->cur_state, '\0',
668 sizeof (mbstate_t));
669 pstr->valid_len = mbclen - (raw + offset - p);
670 wc = wc2;
671 }
672 break;
673 }
674 }
675
676 if (wc == WEOF)
677 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
678 if (BE (pstr->valid_len, 0))
679 {
680 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
681 pstr->wcs[wcs_idx] = WEOF;
682 if (pstr->mbs_allocated)
683 memset (pstr->mbs, 255, pstr->valid_len);
684 }
685 pstr->valid_raw_len = pstr->valid_len;
686 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
687 && IS_WIDE_WORD_CHAR (wc))
688 ? CONTEXT_WORD
689 : ((IS_WIDE_NEWLINE (wc)
690 && pstr->newline_anchor)
691 ? CONTEXT_NEWLINE : 0));
692 }
693 else
694 #endif /* RE_ENABLE_I18N */
695 {
696 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
697 if (pstr->trans)
698 c = pstr->trans[c];
699 pstr->tip_context = (bitset_contain (pstr->word_char, c)
700 ? CONTEXT_WORD
701 : ((IS_NEWLINE (c) && pstr->newline_anchor)
702 ? CONTEXT_NEWLINE : 0));
703 }
704 }
705 if (!BE (pstr->mbs_allocated, 0))
706 pstr->mbs += offset;
707 }
708 pstr->raw_mbs_idx = idx;
709 pstr->len -= offset;
710 pstr->stop -= offset;
711
712 /* Then build the buffers. */
713 #ifdef RE_ENABLE_I18N
714 if (pstr->mb_cur_max > 1)
715 {
716 if (pstr->icase)
717 {
718 int ret = build_wcs_upper_buffer (pstr);
719 if (BE (ret != REG_NOERROR, 0))
720 return ret;
721 }
722 else
723 build_wcs_buffer (pstr);
724 }
725 else
726 #endif /* RE_ENABLE_I18N */
727 if (BE (pstr->mbs_allocated, 0))
728 {
729 if (pstr->icase)
730 build_upper_buffer (pstr);
731 else if (pstr->trans != NULL)
732 re_string_translate_buffer (pstr);
733 }
734 else
735 pstr->valid_len = pstr->len;
736
737 pstr->cur_idx = 0;
738 return REG_NOERROR;
739 }
740
741 static unsigned char
742 internal_function __attribute ((pure))
743 re_string_peek_byte_case (const re_string_t *pstr, int idx)
744 {
745 int ch, off;
746
747 /* Handle the common (easiest) cases first. */
748 if (BE (!pstr->mbs_allocated, 1))
749 return re_string_peek_byte (pstr, idx);
750
751 #ifdef RE_ENABLE_I18N
752 if (pstr->mb_cur_max > 1
753 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
754 return re_string_peek_byte (pstr, idx);
755 #endif
756
757 off = pstr->cur_idx + idx;
758 #ifdef RE_ENABLE_I18N
759 if (pstr->offsets_needed)
760 off = pstr->offsets[off];
761 #endif
762
763 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
764
765 #ifdef RE_ENABLE_I18N
766 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
767 this function returns CAPITAL LETTER I instead of first byte of
768 DOTLESS SMALL LETTER I. The latter would confuse the parser,
769 since peek_byte_case doesn't advance cur_idx in any way. */
770 if (pstr->offsets_needed && !isascii (ch))
771 return re_string_peek_byte (pstr, idx);
772 #endif
773
774 return ch;
775 }
776
777 static unsigned char
778 internal_function __attribute ((pure))
779 re_string_fetch_byte_case (re_string_t *pstr)
780 {
781 if (BE (!pstr->mbs_allocated, 1))
782 return re_string_fetch_byte (pstr);
783
784 #ifdef RE_ENABLE_I18N
785 if (pstr->offsets_needed)
786 {
787 int off, ch;
788
789 /* For tr_TR.UTF-8 [[:islower:]] there is
790 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
791 in that case the whole multi-byte character and return
792 the original letter. On the other side, with
793 [[: DOTLESS SMALL LETTER I return [[:I, as doing
794 anything else would complicate things too much. */
795
796 if (!re_string_first_byte (pstr, pstr->cur_idx))
797 return re_string_fetch_byte (pstr);
798
799 off = pstr->offsets[pstr->cur_idx];
800 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
801
802 if (! isascii (ch))
803 return re_string_fetch_byte (pstr);
804
805 re_string_skip_bytes (pstr,
806 re_string_char_size_at (pstr, pstr->cur_idx));
807 return ch;
808 }
809 #endif
810
811 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
812 }
813
814 static void
815 internal_function
816 re_string_destruct (re_string_t *pstr)
817 {
818 #ifdef RE_ENABLE_I18N
819 re_free (pstr->wcs);
820 re_free (pstr->offsets);
821 #endif /* RE_ENABLE_I18N */
822 if (pstr->mbs_allocated)
823 re_free (pstr->mbs);
824 }
825
826 /* Return the context at IDX in INPUT. */
827
828 static unsigned int
829 re_string_context_at (const re_string_t *input, int idx, int eflags)
830 {
831 int c;
832 if (BE (idx < 0, 0))
833 /* In this case, we use the value stored in input->tip_context,
834 since we can't know the character in input->mbs[-1] here. */
835 return input->tip_context;
836 if (BE (idx == input->len, 0))
837 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
838 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
839 #ifdef RE_ENABLE_I18N
840 if (input->mb_cur_max > 1)
841 {
842 wint_t wc;
843 int wc_idx = idx;
844 while(input->wcs[wc_idx] == WEOF)
845 {
846 #ifdef DEBUG
847 /* It must not happen. */
848 assert (wc_idx >= 0);
849 #endif
850 --wc_idx;
851 if (wc_idx < 0)
852 return input->tip_context;
853 }
854 wc = input->wcs[wc_idx];
855 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
856 return CONTEXT_WORD;
857 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
858 ? CONTEXT_NEWLINE : 0);
859 }
860 else
861 #endif
862 {
863 c = re_string_byte_at (input, idx);
864 if (bitset_contain (input->word_char, c))
865 return CONTEXT_WORD;
866 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
867 }
868 }
869 \f
870 /* Functions for set operation. */
871
872 static reg_errcode_t
873 internal_function
874 re_node_set_alloc (re_node_set *set, int size)
875 {
876 set->alloc = size;
877 set->nelem = 0;
878 set->elems = re_malloc (int, size);
879 if (BE (set->elems == NULL, 0))
880 return REG_ESPACE;
881 return REG_NOERROR;
882 }
883
884 static reg_errcode_t
885 internal_function
886 re_node_set_init_1 (re_node_set *set, int elem)
887 {
888 set->alloc = 1;
889 set->nelem = 1;
890 set->elems = re_malloc (int, 1);
891 if (BE (set->elems == NULL, 0))
892 {
893 set->alloc = set->nelem = 0;
894 return REG_ESPACE;
895 }
896 set->elems[0] = elem;
897 return REG_NOERROR;
898 }
899
900 static reg_errcode_t
901 internal_function
902 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
903 {
904 set->alloc = 2;
905 set->elems = re_malloc (int, 2);
906 if (BE (set->elems == NULL, 0))
907 return REG_ESPACE;
908 if (elem1 == elem2)
909 {
910 set->nelem = 1;
911 set->elems[0] = elem1;
912 }
913 else
914 {
915 set->nelem = 2;
916 if (elem1 < elem2)
917 {
918 set->elems[0] = elem1;
919 set->elems[1] = elem2;
920 }
921 else
922 {
923 set->elems[0] = elem2;
924 set->elems[1] = elem1;
925 }
926 }
927 return REG_NOERROR;
928 }
929
930 static reg_errcode_t
931 internal_function
932 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
933 {
934 dest->nelem = src->nelem;
935 if (src->nelem > 0)
936 {
937 dest->alloc = dest->nelem;
938 dest->elems = re_malloc (int, dest->alloc);
939 if (BE (dest->elems == NULL, 0))
940 {
941 dest->alloc = dest->nelem = 0;
942 return REG_ESPACE;
943 }
944 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
945 }
946 else
947 re_node_set_init_empty (dest);
948 return REG_NOERROR;
949 }
950
951 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
952 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
953 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
954
955 static reg_errcode_t
956 internal_function
957 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
958 const re_node_set *src2)
959 {
960 int i1, i2, is, id, delta, sbase;
961 if (src1->nelem == 0 || src2->nelem == 0)
962 return REG_NOERROR;
963
964 /* We need dest->nelem + 2 * elems_in_intersection; this is a
965 conservative estimate. */
966 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
967 {
968 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
969 int *new_elems = re_realloc (dest->elems, int, new_alloc);
970 if (BE (new_elems == NULL, 0))
971 return REG_ESPACE;
972 dest->elems = new_elems;
973 dest->alloc = new_alloc;
974 }
975
976 /* Find the items in the intersection of SRC1 and SRC2, and copy
977 into the top of DEST those that are not already in DEST itself. */
978 sbase = dest->nelem + src1->nelem + src2->nelem;
979 i1 = src1->nelem - 1;
980 i2 = src2->nelem - 1;
981 id = dest->nelem - 1;
982 for (;;)
983 {
984 if (src1->elems[i1] == src2->elems[i2])
985 {
986 /* Try to find the item in DEST. Maybe we could binary search? */
987 while (id >= 0 && dest->elems[id] > src1->elems[i1])
988 --id;
989
990 if (id < 0 || dest->elems[id] != src1->elems[i1])
991 dest->elems[--sbase] = src1->elems[i1];
992
993 if (--i1 < 0 || --i2 < 0)
994 break;
995 }
996
997 /* Lower the highest of the two items. */
998 else if (src1->elems[i1] < src2->elems[i2])
999 {
1000 if (--i2 < 0)
1001 break;
1002 }
1003 else
1004 {
1005 if (--i1 < 0)
1006 break;
1007 }
1008 }
1009
1010 id = dest->nelem - 1;
1011 is = dest->nelem + src1->nelem + src2->nelem - 1;
1012 delta = is - sbase + 1;
1013
1014 /* Now copy. When DELTA becomes zero, the remaining
1015 DEST elements are already in place; this is more or
1016 less the same loop that is in re_node_set_merge. */
1017 dest->nelem += delta;
1018 if (delta > 0 && id >= 0)
1019 for (;;)
1020 {
1021 if (dest->elems[is] > dest->elems[id])
1022 {
1023 /* Copy from the top. */
1024 dest->elems[id + delta--] = dest->elems[is--];
1025 if (delta == 0)
1026 break;
1027 }
1028 else
1029 {
1030 /* Slide from the bottom. */
1031 dest->elems[id + delta] = dest->elems[id];
1032 if (--id < 0)
1033 break;
1034 }
1035 }
1036
1037 /* Copy remaining SRC elements. */
1038 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1039
1040 return REG_NOERROR;
1041 }
1042
1043 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1044 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1045
1046 static reg_errcode_t
1047 internal_function
1048 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1049 const re_node_set *src2)
1050 {
1051 int i1, i2, id;
1052 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1053 {
1054 dest->alloc = src1->nelem + src2->nelem;
1055 dest->elems = re_malloc (int, dest->alloc);
1056 if (BE (dest->elems == NULL, 0))
1057 return REG_ESPACE;
1058 }
1059 else
1060 {
1061 if (src1 != NULL && src1->nelem > 0)
1062 return re_node_set_init_copy (dest, src1);
1063 else if (src2 != NULL && src2->nelem > 0)
1064 return re_node_set_init_copy (dest, src2);
1065 else
1066 re_node_set_init_empty (dest);
1067 return REG_NOERROR;
1068 }
1069 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1070 {
1071 if (src1->elems[i1] > src2->elems[i2])
1072 {
1073 dest->elems[id++] = src2->elems[i2++];
1074 continue;
1075 }
1076 if (src1->elems[i1] == src2->elems[i2])
1077 ++i2;
1078 dest->elems[id++] = src1->elems[i1++];
1079 }
1080 if (i1 < src1->nelem)
1081 {
1082 memcpy (dest->elems + id, src1->elems + i1,
1083 (src1->nelem - i1) * sizeof (int));
1084 id += src1->nelem - i1;
1085 }
1086 else if (i2 < src2->nelem)
1087 {
1088 memcpy (dest->elems + id, src2->elems + i2,
1089 (src2->nelem - i2) * sizeof (int));
1090 id += src2->nelem - i2;
1091 }
1092 dest->nelem = id;
1093 return REG_NOERROR;
1094 }
1095
1096 /* Calculate the union set of the sets DEST and SRC. And store it to
1097 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1098
1099 static reg_errcode_t
1100 internal_function
1101 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1102 {
1103 int is, id, sbase, delta;
1104 if (src == NULL || src->nelem == 0)
1105 return REG_NOERROR;
1106 if (dest->alloc < 2 * src->nelem + dest->nelem)
1107 {
1108 int new_alloc = 2 * (src->nelem + dest->alloc);
1109 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1110 if (BE (new_buffer == NULL, 0))
1111 return REG_ESPACE;
1112 dest->elems = new_buffer;
1113 dest->alloc = new_alloc;
1114 }
1115
1116 if (BE (dest->nelem == 0, 0))
1117 {
1118 dest->nelem = src->nelem;
1119 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1120 return REG_NOERROR;
1121 }
1122
1123 /* Copy into the top of DEST the items of SRC that are not
1124 found in DEST. Maybe we could binary search in DEST? */
1125 for (sbase = dest->nelem + 2 * src->nelem,
1126 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1127 {
1128 if (dest->elems[id] == src->elems[is])
1129 is--, id--;
1130 else if (dest->elems[id] < src->elems[is])
1131 dest->elems[--sbase] = src->elems[is--];
1132 else /* if (dest->elems[id] > src->elems[is]) */
1133 --id;
1134 }
1135
1136 if (is >= 0)
1137 {
1138 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1139 sbase -= is + 1;
1140 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1141 }
1142
1143 id = dest->nelem - 1;
1144 is = dest->nelem + 2 * src->nelem - 1;
1145 delta = is - sbase + 1;
1146 if (delta == 0)
1147 return REG_NOERROR;
1148
1149 /* Now copy. When DELTA becomes zero, the remaining
1150 DEST elements are already in place. */
1151 dest->nelem += delta;
1152 for (;;)
1153 {
1154 if (dest->elems[is] > dest->elems[id])
1155 {
1156 /* Copy from the top. */
1157 dest->elems[id + delta--] = dest->elems[is--];
1158 if (delta == 0)
1159 break;
1160 }
1161 else
1162 {
1163 /* Slide from the bottom. */
1164 dest->elems[id + delta] = dest->elems[id];
1165 if (--id < 0)
1166 {
1167 /* Copy remaining SRC elements. */
1168 memcpy (dest->elems, dest->elems + sbase,
1169 delta * sizeof (int));
1170 break;
1171 }
1172 }
1173 }
1174
1175 return REG_NOERROR;
1176 }
1177
1178 /* Insert the new element ELEM to the re_node_set* SET.
1179 SET should not already have ELEM.
1180 return -1 if an error is occured, return 1 otherwise. */
1181
1182 static int
1183 internal_function
1184 re_node_set_insert (re_node_set *set, int elem)
1185 {
1186 int idx;
1187 /* In case the set is empty. */
1188 if (set->alloc == 0)
1189 {
1190 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1191 return 1;
1192 else
1193 return -1;
1194 }
1195
1196 if (BE (set->nelem, 0) == 0)
1197 {
1198 /* We already guaranteed above that set->alloc != 0. */
1199 set->elems[0] = elem;
1200 ++set->nelem;
1201 return 1;
1202 }
1203
1204 /* Realloc if we need. */
1205 if (set->alloc == set->nelem)
1206 {
1207 int *new_elems;
1208 set->alloc = set->alloc * 2;
1209 new_elems = re_realloc (set->elems, int, set->alloc);
1210 if (BE (new_elems == NULL, 0))
1211 return -1;
1212 set->elems = new_elems;
1213 }
1214
1215 /* Move the elements which follows the new element. Test the
1216 first element separately to skip a check in the inner loop. */
1217 if (elem < set->elems[0])
1218 {
1219 idx = 0;
1220 for (idx = set->nelem; idx > 0; idx--)
1221 set->elems[idx] = set->elems[idx - 1];
1222 }
1223 else
1224 {
1225 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1226 set->elems[idx] = set->elems[idx - 1];
1227 }
1228
1229 /* Insert the new element. */
1230 set->elems[idx] = elem;
1231 ++set->nelem;
1232 return 1;
1233 }
1234
1235 /* Insert the new element ELEM to the re_node_set* SET.
1236 SET should not already have any element greater than or equal to ELEM.
1237 Return -1 if an error is occured, return 1 otherwise. */
1238
1239 static int
1240 internal_function
1241 re_node_set_insert_last (re_node_set *set, int elem)
1242 {
1243 /* Realloc if we need. */
1244 if (set->alloc == set->nelem)
1245 {
1246 int *new_elems;
1247 set->alloc = (set->alloc + 1) * 2;
1248 new_elems = re_realloc (set->elems, int, set->alloc);
1249 if (BE (new_elems == NULL, 0))
1250 return -1;
1251 set->elems = new_elems;
1252 }
1253
1254 /* Insert the new element. */
1255 set->elems[set->nelem++] = elem;
1256 return 1;
1257 }
1258
1259 /* Compare two node sets SET1 and SET2.
1260 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1261
1262 static int
1263 internal_function __attribute ((pure))
1264 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1265 {
1266 int i;
1267 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1268 return 0;
1269 for (i = set1->nelem ; --i >= 0 ; )
1270 if (set1->elems[i] != set2->elems[i])
1271 return 0;
1272 return 1;
1273 }
1274
1275 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1276
1277 static int
1278 internal_function __attribute ((pure))
1279 re_node_set_contains (const re_node_set *set, int elem)
1280 {
1281 unsigned int idx, right, mid;
1282 if (set->nelem <= 0)
1283 return 0;
1284
1285 /* Binary search the element. */
1286 idx = 0;
1287 right = set->nelem - 1;
1288 while (idx < right)
1289 {
1290 mid = (idx + right) / 2;
1291 if (set->elems[mid] < elem)
1292 idx = mid + 1;
1293 else
1294 right = mid;
1295 }
1296 return set->elems[idx] == elem ? idx + 1 : 0;
1297 }
1298
1299 static void
1300 internal_function
1301 re_node_set_remove_at (re_node_set *set, int idx)
1302 {
1303 if (idx < 0 || idx >= set->nelem)
1304 return;
1305 --set->nelem;
1306 for (; idx < set->nelem; idx++)
1307 set->elems[idx] = set->elems[idx + 1];
1308 }
1309 \f
1310
1311 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1312 Or return -1, if an error will be occured. */
1313
1314 static int
1315 internal_function
1316 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1317 {
1318 int type = token.type;
1319 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1320 {
1321 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1322 int *new_nexts, *new_indices;
1323 re_node_set *new_edests, *new_eclosures;
1324 re_token_t *new_nodes;
1325
1326 /* Avoid overflows. */
1327 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1328 return -1;
1329
1330 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1331 if (BE (new_nodes == NULL, 0))
1332 return -1;
1333 dfa->nodes = new_nodes;
1334 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1335 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1336 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1337 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1338 if (BE (new_nexts == NULL || new_indices == NULL
1339 || new_edests == NULL || new_eclosures == NULL, 0))
1340 return -1;
1341 dfa->nexts = new_nexts;
1342 dfa->org_indices = new_indices;
1343 dfa->edests = new_edests;
1344 dfa->eclosures = new_eclosures;
1345 dfa->nodes_alloc = new_nodes_alloc;
1346 }
1347 dfa->nodes[dfa->nodes_len] = token;
1348 dfa->nodes[dfa->nodes_len].constraint = 0;
1349 #ifdef RE_ENABLE_I18N
1350 dfa->nodes[dfa->nodes_len].accept_mb =
1351 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1352 #endif
1353 dfa->nexts[dfa->nodes_len] = -1;
1354 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1355 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1356 return dfa->nodes_len++;
1357 }
1358
1359 static inline unsigned int
1360 internal_function
1361 calc_state_hash (const re_node_set *nodes, unsigned int context)
1362 {
1363 unsigned int hash = nodes->nelem + context;
1364 int i;
1365 for (i = 0 ; i < nodes->nelem ; i++)
1366 hash += nodes->elems[i];
1367 return hash;
1368 }
1369
1370 /* Search for the state whose node_set is equivalent to NODES.
1371 Return the pointer to the state, if we found it in the DFA.
1372 Otherwise create the new one and return it. In case of an error
1373 return NULL and set the error code in ERR.
1374 Note: - We assume NULL as the invalid state, then it is possible that
1375 return value is NULL and ERR is REG_NOERROR.
1376 - We never return non-NULL value in case of any errors, it is for
1377 optimization. */
1378
1379 static re_dfastate_t *
1380 internal_function
1381 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1382 const re_node_set *nodes)
1383 {
1384 unsigned int hash;
1385 re_dfastate_t *new_state;
1386 struct re_state_table_entry *spot;
1387 int i;
1388 if (BE (nodes->nelem == 0, 0))
1389 {
1390 *err = REG_NOERROR;
1391 return NULL;
1392 }
1393 hash = calc_state_hash (nodes, 0);
1394 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1395
1396 for (i = 0 ; i < spot->num ; i++)
1397 {
1398 re_dfastate_t *state = spot->array[i];
1399 if (hash != state->hash)
1400 continue;
1401 if (re_node_set_compare (&state->nodes, nodes))
1402 return state;
1403 }
1404
1405 /* There are no appropriate state in the dfa, create the new one. */
1406 new_state = create_ci_newstate (dfa, nodes, hash);
1407 if (BE (new_state == NULL, 0))
1408 *err = REG_ESPACE;
1409
1410 return new_state;
1411 }
1412
1413 /* Search for the state whose node_set is equivalent to NODES and
1414 whose context is equivalent to CONTEXT.
1415 Return the pointer to the state, if we found it in the DFA.
1416 Otherwise create the new one and return it. In case of an error
1417 return NULL and set the error code in ERR.
1418 Note: - We assume NULL as the invalid state, then it is possible that
1419 return value is NULL and ERR is REG_NOERROR.
1420 - We never return non-NULL value in case of any errors, it is for
1421 optimization. */
1422
1423 static re_dfastate_t *
1424 internal_function
1425 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1426 const re_node_set *nodes, unsigned int context)
1427 {
1428 unsigned int hash;
1429 re_dfastate_t *new_state;
1430 struct re_state_table_entry *spot;
1431 int i;
1432 if (nodes->nelem == 0)
1433 {
1434 *err = REG_NOERROR;
1435 return NULL;
1436 }
1437 hash = calc_state_hash (nodes, context);
1438 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1439
1440 for (i = 0 ; i < spot->num ; i++)
1441 {
1442 re_dfastate_t *state = spot->array[i];
1443 if (state->hash == hash
1444 && state->context == context
1445 && re_node_set_compare (state->entrance_nodes, nodes))
1446 return state;
1447 }
1448 /* There are no appropriate state in `dfa', create the new one. */
1449 new_state = create_cd_newstate (dfa, nodes, context, hash);
1450 if (BE (new_state == NULL, 0))
1451 *err = REG_ESPACE;
1452
1453 return new_state;
1454 }
1455
1456 /* Finish initialization of the new state NEWSTATE, and using its hash value
1457 HASH put in the appropriate bucket of DFA's state table. Return value
1458 indicates the error code if failed. */
1459
1460 static reg_errcode_t
1461 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1462 unsigned int hash)
1463 {
1464 struct re_state_table_entry *spot;
1465 reg_errcode_t err;
1466 int i;
1467
1468 newstate->hash = hash;
1469 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1470 if (BE (err != REG_NOERROR, 0))
1471 return REG_ESPACE;
1472 for (i = 0; i < newstate->nodes.nelem; i++)
1473 {
1474 int elem = newstate->nodes.elems[i];
1475 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1476 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1477 }
1478
1479 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1480 if (BE (spot->alloc <= spot->num, 0))
1481 {
1482 int new_alloc = 2 * spot->num + 2;
1483 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1484 new_alloc);
1485 if (BE (new_array == NULL, 0))
1486 return REG_ESPACE;
1487 spot->array = new_array;
1488 spot->alloc = new_alloc;
1489 }
1490 spot->array[spot->num++] = newstate;
1491 return REG_NOERROR;
1492 }
1493
1494 static void
1495 free_state (re_dfastate_t *state)
1496 {
1497 re_node_set_free (&state->non_eps_nodes);
1498 re_node_set_free (&state->inveclosure);
1499 if (state->entrance_nodes != &state->nodes)
1500 {
1501 re_node_set_free (state->entrance_nodes);
1502 re_free (state->entrance_nodes);
1503 }
1504 re_node_set_free (&state->nodes);
1505 re_free (state->word_trtable);
1506 re_free (state->trtable);
1507 re_free (state);
1508 }
1509
1510 /* Create the new state which is independ of contexts.
1511 Return the new state if succeeded, otherwise return NULL. */
1512
1513 static re_dfastate_t *
1514 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1515 unsigned int hash)
1516 {
1517 int i;
1518 reg_errcode_t err;
1519 re_dfastate_t *newstate;
1520
1521 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1522 if (BE (newstate == NULL, 0))
1523 return NULL;
1524 err = re_node_set_init_copy (&newstate->nodes, nodes);
1525 if (BE (err != REG_NOERROR, 0))
1526 {
1527 re_free (newstate);
1528 return NULL;
1529 }
1530
1531 newstate->entrance_nodes = &newstate->nodes;
1532 for (i = 0 ; i < nodes->nelem ; i++)
1533 {
1534 re_token_t *node = dfa->nodes + nodes->elems[i];
1535 re_token_type_t type = node->type;
1536 if (type == CHARACTER && !node->constraint)
1537 continue;
1538 #ifdef RE_ENABLE_I18N
1539 newstate->accept_mb |= node->accept_mb;
1540 #endif /* RE_ENABLE_I18N */
1541
1542 /* If the state has the halt node, the state is a halt state. */
1543 if (type == END_OF_RE)
1544 newstate->halt = 1;
1545 else if (type == OP_BACK_REF)
1546 newstate->has_backref = 1;
1547 else if (type == ANCHOR || node->constraint)
1548 newstate->has_constraint = 1;
1549 }
1550 err = register_state (dfa, newstate, hash);
1551 if (BE (err != REG_NOERROR, 0))
1552 {
1553 free_state (newstate);
1554 newstate = NULL;
1555 }
1556 return newstate;
1557 }
1558
1559 /* Create the new state which is depend on the context CONTEXT.
1560 Return the new state if succeeded, otherwise return NULL. */
1561
1562 static re_dfastate_t *
1563 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1564 unsigned int context, unsigned int hash)
1565 {
1566 int i, nctx_nodes = 0;
1567 reg_errcode_t err;
1568 re_dfastate_t *newstate;
1569
1570 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1571 if (BE (newstate == NULL, 0))
1572 return NULL;
1573 err = re_node_set_init_copy (&newstate->nodes, nodes);
1574 if (BE (err != REG_NOERROR, 0))
1575 {
1576 re_free (newstate);
1577 return NULL;
1578 }
1579
1580 newstate->context = context;
1581 newstate->entrance_nodes = &newstate->nodes;
1582
1583 for (i = 0 ; i < nodes->nelem ; i++)
1584 {
1585 unsigned int constraint = 0;
1586 re_token_t *node = dfa->nodes + nodes->elems[i];
1587 re_token_type_t type = node->type;
1588 if (node->constraint)
1589 constraint = node->constraint;
1590
1591 if (type == CHARACTER && !constraint)
1592 continue;
1593 #ifdef RE_ENABLE_I18N
1594 newstate->accept_mb |= node->accept_mb;
1595 #endif /* RE_ENABLE_I18N */
1596
1597 /* If the state has the halt node, the state is a halt state. */
1598 if (type == END_OF_RE)
1599 newstate->halt = 1;
1600 else if (type == OP_BACK_REF)
1601 newstate->has_backref = 1;
1602 else if (type == ANCHOR)
1603 constraint = node->opr.ctx_type;
1604
1605 if (constraint)
1606 {
1607 if (newstate->entrance_nodes == &newstate->nodes)
1608 {
1609 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1610 if (BE (newstate->entrance_nodes == NULL, 0))
1611 {
1612 free_state (newstate);
1613 return NULL;
1614 }
1615 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1616 nctx_nodes = 0;
1617 newstate->has_constraint = 1;
1618 }
1619
1620 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1621 {
1622 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1623 ++nctx_nodes;
1624 }
1625 }
1626 }
1627 err = register_state (dfa, newstate, hash);
1628 if (BE (err != REG_NOERROR, 0))
1629 {
1630 free_state (newstate);
1631 newstate = NULL;
1632 }
1633 return newstate;
1634 }