]> git.ipfire.org Git - thirdparty/glibc.git/blob - posix/regex_internal.c
Update.
[thirdparty/glibc.git] / posix / regex_internal.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 #ifdef RE_ENABLE_I18N
26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28 #endif /* RE_ENABLE_I18N */
29 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
30 const re_node_set *nodes,
31 unsigned int hash) internal_function;
32 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
33 unsigned int hash) internal_function;
34 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
35 const re_node_set *nodes,
36 unsigned int hash) internal_function;
37 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
38 const re_node_set *nodes,
39 unsigned int context,
40 unsigned int hash) internal_function;
41 static unsigned int inline calc_state_hash (const re_node_set *nodes,
42 unsigned int context) internal_function;
43 \f
44 /* Functions for string operation. */
45
46 /* This function allocate the buffers. It is necessary to call
47 re_string_reconstruct before using the object. */
48
49 static reg_errcode_t
50 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
51 re_string_t *pstr;
52 const char *str;
53 int len, init_len, icase;
54 RE_TRANSLATE_TYPE trans;
55 const re_dfa_t *dfa;
56 {
57 reg_errcode_t ret;
58 int init_buf_len;
59
60 /* Ensure at least one character fits into the buffers. */
61 if (init_len < dfa->mb_cur_max)
62 init_len = dfa->mb_cur_max;
63 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
64 re_string_construct_common (str, len, pstr, trans, icase, dfa);
65
66 ret = re_string_realloc_buffers (pstr, init_buf_len);
67 if (BE (ret != REG_NOERROR, 0))
68 return ret;
69
70 pstr->word_char = dfa->word_char;
71 pstr->word_ops_used = dfa->word_ops_used;
72 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
73 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
74 pstr->valid_raw_len = pstr->valid_len;
75 return REG_NOERROR;
76 }
77
78 /* This function allocate the buffers, and initialize them. */
79
80 static reg_errcode_t
81 re_string_construct (pstr, str, len, trans, icase, dfa)
82 re_string_t *pstr;
83 const char *str;
84 int len, icase;
85 RE_TRANSLATE_TYPE trans;
86 const re_dfa_t *dfa;
87 {
88 reg_errcode_t ret;
89 memset (pstr, '\0', sizeof (re_string_t));
90 re_string_construct_common (str, len, pstr, trans, icase, dfa);
91
92 if (len > 0)
93 {
94 ret = re_string_realloc_buffers (pstr, len + 1);
95 if (BE (ret != REG_NOERROR, 0))
96 return ret;
97 }
98 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
99
100 if (icase)
101 {
102 #ifdef RE_ENABLE_I18N
103 if (dfa->mb_cur_max > 1)
104 {
105 while (1)
106 {
107 ret = build_wcs_upper_buffer (pstr);
108 if (BE (ret != REG_NOERROR, 0))
109 return ret;
110 if (pstr->valid_raw_len >= len)
111 break;
112 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
113 break;
114 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
115 if (BE (ret != REG_NOERROR, 0))
116 return ret;
117 }
118 }
119 else
120 #endif /* RE_ENABLE_I18N */
121 build_upper_buffer (pstr);
122 }
123 else
124 {
125 #ifdef RE_ENABLE_I18N
126 if (dfa->mb_cur_max > 1)
127 build_wcs_buffer (pstr);
128 else
129 #endif /* RE_ENABLE_I18N */
130 {
131 if (trans != NULL)
132 re_string_translate_buffer (pstr);
133 else
134 {
135 pstr->valid_len = pstr->bufs_len;
136 pstr->valid_raw_len = pstr->bufs_len;
137 }
138 }
139 }
140
141 return REG_NOERROR;
142 }
143
144 /* Helper functions for re_string_allocate, and re_string_construct. */
145
146 static reg_errcode_t
147 re_string_realloc_buffers (pstr, new_buf_len)
148 re_string_t *pstr;
149 int new_buf_len;
150 {
151 #ifdef RE_ENABLE_I18N
152 if (pstr->mb_cur_max > 1)
153 {
154 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
155 if (BE (new_array == NULL, 0))
156 return REG_ESPACE;
157 pstr->wcs = new_array;
158 if (pstr->offsets != NULL)
159 {
160 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
161 if (BE (new_array == NULL, 0))
162 return REG_ESPACE;
163 pstr->offsets = new_array;
164 }
165 }
166 #endif /* RE_ENABLE_I18N */
167 if (pstr->mbs_allocated)
168 {
169 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
170 new_buf_len);
171 if (BE (new_array == NULL, 0))
172 return REG_ESPACE;
173 pstr->mbs = new_array;
174 }
175 pstr->bufs_len = new_buf_len;
176 return REG_NOERROR;
177 }
178
179
180 static void
181 re_string_construct_common (str, len, pstr, trans, icase, dfa)
182 const char *str;
183 int len;
184 re_string_t *pstr;
185 RE_TRANSLATE_TYPE trans;
186 int icase;
187 const re_dfa_t *dfa;
188 {
189 pstr->raw_mbs = (const unsigned char *) str;
190 pstr->len = len;
191 pstr->raw_len = len;
192 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
193 pstr->icase = icase ? 1 : 0;
194 pstr->mbs_allocated = (trans != NULL || icase);
195 pstr->mb_cur_max = dfa->mb_cur_max;
196 pstr->is_utf8 = dfa->is_utf8;
197 pstr->map_notascii = dfa->map_notascii;
198 pstr->stop = pstr->len;
199 pstr->raw_stop = pstr->stop;
200 }
201
202 #ifdef RE_ENABLE_I18N
203
204 /* Build wide character buffer PSTR->WCS.
205 If the byte sequence of the string are:
206 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
207 Then wide character buffer will be:
208 <wc1> , WEOF , <wc2> , WEOF , <wc3>
209 We use WEOF for padding, they indicate that the position isn't
210 a first byte of a multibyte character.
211
212 Note that this function assumes PSTR->VALID_LEN elements are already
213 built and starts from PSTR->VALID_LEN. */
214
215 static void
216 build_wcs_buffer (pstr)
217 re_string_t *pstr;
218 {
219 #ifdef _LIBC
220 unsigned char buf[pstr->mb_cur_max];
221 #else
222 unsigned char buf[64];
223 #endif
224 mbstate_t prev_st;
225 int byte_idx, end_idx, mbclen, remain_len;
226
227 /* Build the buffers from pstr->valid_len to either pstr->len or
228 pstr->bufs_len. */
229 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
230 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
231 {
232 wchar_t wc;
233 const char *p;
234
235 remain_len = end_idx - byte_idx;
236 prev_st = pstr->cur_state;
237 /* Apply the translation if we need. */
238 if (BE (pstr->trans != NULL, 0))
239 {
240 int i, ch;
241
242 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
243 {
244 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
245 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
246 }
247 p = (const char *) buf;
248 }
249 else
250 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
251 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
252 if (BE (mbclen == (size_t) -2, 0))
253 {
254 /* The buffer doesn't have enough space, finish to build. */
255 pstr->cur_state = prev_st;
256 break;
257 }
258 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
259 {
260 /* We treat these cases as a singlebyte character. */
261 mbclen = 1;
262 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
263 if (BE (pstr->trans != NULL, 0))
264 wc = pstr->trans[wc];
265 pstr->cur_state = prev_st;
266 }
267
268 /* Write wide character and padding. */
269 pstr->wcs[byte_idx++] = wc;
270 /* Write paddings. */
271 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
272 pstr->wcs[byte_idx++] = WEOF;
273 }
274 pstr->valid_len = byte_idx;
275 pstr->valid_raw_len = byte_idx;
276 }
277
278 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
279 but for REG_ICASE. */
280
281 static int
282 build_wcs_upper_buffer (pstr)
283 re_string_t *pstr;
284 {
285 mbstate_t prev_st;
286 int src_idx, byte_idx, end_idx, mbclen, remain_len;
287 #ifdef _LIBC
288 unsigned char buf[pstr->mb_cur_max];
289 #else
290 unsigned char buf[64];
291 #endif
292
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295
296 /* The following optimization assumes that ASCII characters can be
297 mapped to wide characters with a simple cast. */
298 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299 {
300 while (byte_idx < end_idx)
301 {
302 wchar_t wc;
303
304 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
305 && mbsinit (&pstr->cur_state))
306 {
307 /* In case of a singlebyte character. */
308 pstr->mbs[byte_idx]
309 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
310 /* The next step uses the assumption that wchar_t is encoded
311 ASCII-safe: all ASCII values can be converted like this. */
312 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
313 ++byte_idx;
314 continue;
315 }
316
317 remain_len = end_idx - byte_idx;
318 prev_st = pstr->cur_state;
319 mbclen = mbrtowc (&wc,
320 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
321 + byte_idx), remain_len, &pstr->cur_state);
322 if (BE (mbclen > 0, 1))
323 {
324 wchar_t wcu = wc;
325 if (iswlower (wc))
326 {
327 int mbcdlen;
328
329 wcu = towupper (wc);
330 mbcdlen = wcrtomb (buf, wcu, &prev_st);
331 if (BE (mbclen == mbcdlen, 1))
332 memcpy (pstr->mbs + byte_idx, buf, mbclen);
333 else
334 {
335 src_idx = byte_idx;
336 goto offsets_needed;
337 }
338 }
339 else
340 memcpy (pstr->mbs + byte_idx,
341 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
342 pstr->wcs[byte_idx++] = wcu;
343 /* Write paddings. */
344 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
345 pstr->wcs[byte_idx++] = WEOF;
346 }
347 else if (mbclen == (size_t) -1 || mbclen == 0)
348 {
349 /* It is an invalid character or '\0'. Just use the byte. */
350 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
351 pstr->mbs[byte_idx] = ch;
352 /* And also cast it to wide char. */
353 pstr->wcs[byte_idx++] = (wchar_t) ch;
354 if (BE (mbclen == (size_t) -1, 0))
355 pstr->cur_state = prev_st;
356 }
357 else
358 {
359 /* The buffer doesn't have enough space, finish to build. */
360 pstr->cur_state = prev_st;
361 break;
362 }
363 }
364 pstr->valid_len = byte_idx;
365 pstr->valid_raw_len = byte_idx;
366 return REG_NOERROR;
367 }
368 else
369 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
370 {
371 wchar_t wc;
372 const char *p;
373 offsets_needed:
374 remain_len = end_idx - byte_idx;
375 prev_st = pstr->cur_state;
376 if (BE (pstr->trans != NULL, 0))
377 {
378 int i, ch;
379
380 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
381 {
382 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
383 buf[i] = pstr->trans[ch];
384 }
385 p = (const char *) buf;
386 }
387 else
388 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
389 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
390 if (BE (mbclen > 0, 1))
391 {
392 wchar_t wcu = wc;
393 if (iswlower (wc))
394 {
395 int mbcdlen;
396
397 wcu = towupper (wc);
398 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
399 if (BE (mbclen == mbcdlen, 1))
400 memcpy (pstr->mbs + byte_idx, buf, mbclen);
401 else
402 {
403 int i;
404
405 if (byte_idx + mbcdlen > pstr->bufs_len)
406 {
407 pstr->cur_state = prev_st;
408 break;
409 }
410
411 if (pstr->offsets == NULL)
412 {
413 pstr->offsets = re_malloc (int, pstr->bufs_len);
414
415 if (pstr->offsets == NULL)
416 return REG_ESPACE;
417 }
418 if (!pstr->offsets_needed)
419 {
420 for (i = 0; i < byte_idx; ++i)
421 pstr->offsets[i] = i;
422 pstr->offsets_needed = 1;
423 }
424
425 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
426 pstr->wcs[byte_idx] = wcu;
427 pstr->offsets[byte_idx] = src_idx;
428 for (i = 1; i < mbcdlen; ++i)
429 {
430 pstr->offsets[byte_idx + i]
431 = src_idx + (i < mbclen ? i : mbclen - 1);
432 pstr->wcs[byte_idx + i] = WEOF;
433 }
434 pstr->len += mbcdlen - mbclen;
435 if (pstr->raw_stop > src_idx)
436 pstr->stop += mbcdlen - mbclen;
437 end_idx = (pstr->bufs_len > pstr->len)
438 ? pstr->len : pstr->bufs_len;
439 byte_idx += mbcdlen;
440 src_idx += mbclen;
441 continue;
442 }
443 }
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
446
447 if (BE (pstr->offsets_needed != 0, 0))
448 {
449 int i;
450 for (i = 0; i < mbclen; ++i)
451 pstr->offsets[byte_idx + i] = src_idx + i;
452 }
453 src_idx += mbclen;
454
455 pstr->wcs[byte_idx++] = wcu;
456 /* Write paddings. */
457 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
458 pstr->wcs[byte_idx++] = WEOF;
459 }
460 else if (mbclen == (size_t) -1 || mbclen == 0)
461 {
462 /* It is an invalid character or '\0'. Just use the byte. */
463 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
464
465 if (BE (pstr->trans != NULL, 0))
466 ch = pstr->trans [ch];
467 pstr->mbs[byte_idx] = ch;
468
469 if (BE (pstr->offsets_needed != 0, 0))
470 pstr->offsets[byte_idx] = src_idx;
471 ++src_idx;
472
473 /* And also cast it to wide char. */
474 pstr->wcs[byte_idx++] = (wchar_t) ch;
475 if (BE (mbclen == (size_t) -1, 0))
476 pstr->cur_state = prev_st;
477 }
478 else
479 {
480 /* The buffer doesn't have enough space, finish to build. */
481 pstr->cur_state = prev_st;
482 break;
483 }
484 }
485 pstr->valid_len = byte_idx;
486 pstr->valid_raw_len = src_idx;
487 return REG_NOERROR;
488 }
489
490 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
491 Return the index. */
492
493 static int
494 re_string_skip_chars (pstr, new_raw_idx, last_wc)
495 re_string_t *pstr;
496 int new_raw_idx;
497 wint_t *last_wc;
498 {
499 mbstate_t prev_st;
500 int rawbuf_idx, mbclen;
501 wchar_t wc = 0;
502
503 /* Skip the characters which are not necessary to check. */
504 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
505 rawbuf_idx < new_raw_idx;)
506 {
507 int remain_len;
508 remain_len = pstr->len - rawbuf_idx;
509 prev_st = pstr->cur_state;
510 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
511 remain_len, &pstr->cur_state);
512 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
513 {
514 /* We treat these cases as a singlebyte character. */
515 mbclen = 1;
516 pstr->cur_state = prev_st;
517 }
518 /* Then proceed the next character. */
519 rawbuf_idx += mbclen;
520 }
521 *last_wc = (wint_t) wc;
522 return rawbuf_idx;
523 }
524 #endif /* RE_ENABLE_I18N */
525
526 /* Build the buffer PSTR->MBS, and apply the translation if we need.
527 This function is used in case of REG_ICASE. */
528
529 static void
530 build_upper_buffer (pstr)
531 re_string_t *pstr;
532 {
533 int char_idx, end_idx;
534 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535
536 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537 {
538 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
539 if (BE (pstr->trans != NULL, 0))
540 ch = pstr->trans[ch];
541 if (islower (ch))
542 pstr->mbs[char_idx] = toupper (ch);
543 else
544 pstr->mbs[char_idx] = ch;
545 }
546 pstr->valid_len = char_idx;
547 pstr->valid_raw_len = char_idx;
548 }
549
550 /* Apply TRANS to the buffer in PSTR. */
551
552 static void
553 re_string_translate_buffer (pstr)
554 re_string_t *pstr;
555 {
556 int buf_idx, end_idx;
557 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560 {
561 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562 pstr->mbs[buf_idx] = pstr->trans[ch];
563 }
564
565 pstr->valid_len = buf_idx;
566 pstr->valid_raw_len = buf_idx;
567 }
568
569 /* This function re-construct the buffers.
570 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571 convert to upper case in case of REG_ICASE, apply translation. */
572
573 static reg_errcode_t
574 re_string_reconstruct (pstr, idx, eflags)
575 re_string_t *pstr;
576 int idx, eflags;
577 {
578 int offset = idx - pstr->raw_mbs_idx;
579 if (BE (offset < 0, 0))
580 {
581 /* Reset buffer. */
582 #ifdef RE_ENABLE_I18N
583 if (pstr->mb_cur_max > 1)
584 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585 #endif /* RE_ENABLE_I18N */
586 pstr->len = pstr->raw_len;
587 pstr->stop = pstr->raw_stop;
588 pstr->valid_len = 0;
589 pstr->raw_mbs_idx = 0;
590 pstr->valid_raw_len = 0;
591 pstr->offsets_needed = 0;
592 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
594 if (!pstr->mbs_allocated)
595 pstr->mbs = (unsigned char *) pstr->raw_mbs;
596 offset = idx;
597 }
598
599 if (BE (offset != 0, 1))
600 {
601 /* Are the characters which are already checked remain? */
602 if (BE (offset < pstr->valid_raw_len, 1)
603 #ifdef RE_ENABLE_I18N
604 /* Handling this would enlarge the code too much.
605 Accept a slowdown in that case. */
606 && pstr->offsets_needed == 0
607 #endif
608 )
609 {
610 /* Yes, move them to the front of the buffer. */
611 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
612 #ifdef RE_ENABLE_I18N
613 if (pstr->mb_cur_max > 1)
614 memmove (pstr->wcs, pstr->wcs + offset,
615 (pstr->valid_len - offset) * sizeof (wint_t));
616 #endif /* RE_ENABLE_I18N */
617 if (BE (pstr->mbs_allocated, 0))
618 memmove (pstr->mbs, pstr->mbs + offset,
619 pstr->valid_len - offset);
620 pstr->valid_len -= offset;
621 pstr->valid_raw_len -= offset;
622 #if DEBUG
623 assert (pstr->valid_len > 0);
624 #endif
625 }
626 else
627 {
628 /* No, skip all characters until IDX. */
629 #ifdef RE_ENABLE_I18N
630 if (BE (pstr->offsets_needed, 0))
631 {
632 pstr->len = pstr->raw_len - idx + offset;
633 pstr->stop = pstr->raw_stop - idx + offset;
634 pstr->offsets_needed = 0;
635 }
636 #endif
637 pstr->valid_len = 0;
638 pstr->valid_raw_len = 0;
639 #ifdef RE_ENABLE_I18N
640 if (pstr->mb_cur_max > 1)
641 {
642 int wcs_idx;
643 wint_t wc = WEOF;
644
645 if (pstr->is_utf8)
646 {
647 const unsigned char *raw, *p, *q, *end;
648
649 /* Special case UTF-8. Multi-byte chars start with any
650 byte other than 0x80 - 0xbf. */
651 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
652 end = raw + (offset - pstr->mb_cur_max);
653 for (p = raw + offset - 1; p >= end; --p)
654 if ((*p & 0xc0) != 0x80)
655 {
656 mbstate_t cur_state;
657 wchar_t wc2;
658 int mlen = raw + pstr->len - p;
659 unsigned char buf[6];
660
661 q = p;
662 if (BE (pstr->trans != NULL, 0))
663 {
664 int i = mlen < 6 ? mlen : 6;
665 while (--i >= 0)
666 buf[i] = pstr->trans[p[i]];
667 q = buf;
668 }
669 /* XXX Don't use mbrtowc, we know which conversion
670 to use (UTF-8 -> UCS4). */
671 memset (&cur_state, 0, sizeof (cur_state));
672 mlen = mbrtowc (&wc2, p, mlen, &cur_state)
673 - (raw + offset - p);
674 if (mlen >= 0)
675 {
676 memset (&pstr->cur_state, '\0',
677 sizeof (mbstate_t));
678 pstr->valid_len = mlen;
679 wc = wc2;
680 }
681 break;
682 }
683 }
684
685 if (wc == WEOF)
686 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
687 if (BE (pstr->valid_len, 0))
688 {
689 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
690 pstr->wcs[wcs_idx] = WEOF;
691 if (pstr->mbs_allocated)
692 memset (pstr->mbs, 255, pstr->valid_len);
693 }
694 pstr->valid_raw_len = pstr->valid_len;
695 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
696 && IS_WIDE_WORD_CHAR (wc))
697 ? CONTEXT_WORD
698 : ((IS_WIDE_NEWLINE (wc)
699 && pstr->newline_anchor)
700 ? CONTEXT_NEWLINE : 0));
701 }
702 else
703 #endif /* RE_ENABLE_I18N */
704 {
705 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
706 if (pstr->trans)
707 c = pstr->trans[c];
708 pstr->tip_context = (bitset_contain (pstr->word_char, c)
709 ? CONTEXT_WORD
710 : ((IS_NEWLINE (c) && pstr->newline_anchor)
711 ? CONTEXT_NEWLINE : 0));
712 }
713 }
714 if (!BE (pstr->mbs_allocated, 0))
715 pstr->mbs += offset;
716 }
717 pstr->raw_mbs_idx = idx;
718 pstr->len -= offset;
719 pstr->stop -= offset;
720
721 /* Then build the buffers. */
722 #ifdef RE_ENABLE_I18N
723 if (pstr->mb_cur_max > 1)
724 {
725 if (pstr->icase)
726 {
727 int ret = build_wcs_upper_buffer (pstr);
728 if (BE (ret != REG_NOERROR, 0))
729 return ret;
730 }
731 else
732 build_wcs_buffer (pstr);
733 }
734 else
735 #endif /* RE_ENABLE_I18N */
736 if (BE (pstr->mbs_allocated, 0))
737 {
738 if (pstr->icase)
739 build_upper_buffer (pstr);
740 else if (pstr->trans != NULL)
741 re_string_translate_buffer (pstr);
742 }
743 else
744 pstr->valid_len = pstr->len;
745
746 pstr->cur_idx = 0;
747 return REG_NOERROR;
748 }
749
750 static unsigned char
751 re_string_peek_byte_case (pstr, idx)
752 const re_string_t *pstr;
753 int idx;
754 {
755 int ch, off;
756
757 /* Handle the common (easiest) cases first. */
758 if (BE (!pstr->mbs_allocated, 1))
759 return re_string_peek_byte (pstr, idx);
760
761 #ifdef RE_ENABLE_I18N
762 if (pstr->mb_cur_max > 1
763 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
764 return re_string_peek_byte (pstr, idx);
765 #endif
766
767 off = pstr->cur_idx + idx;
768 #ifdef RE_ENABLE_I18N
769 if (pstr->offsets_needed)
770 off = pstr->offsets[off];
771 #endif
772
773 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
774
775 #ifdef RE_ENABLE_I18N
776 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
777 this function returns CAPITAL LETTER I instead of first byte of
778 DOTLESS SMALL LETTER I. The latter would confuse the parser,
779 since peek_byte_case doesn't advance cur_idx in any way. */
780 if (pstr->offsets_needed && !isascii (ch))
781 return re_string_peek_byte (pstr, idx);
782 #endif
783
784 return ch;
785 }
786
787 static unsigned char
788 re_string_fetch_byte_case (pstr)
789 re_string_t *pstr;
790 {
791 if (BE (!pstr->mbs_allocated, 1))
792 return re_string_fetch_byte (pstr);
793
794 #ifdef RE_ENABLE_I18N
795 if (pstr->offsets_needed)
796 {
797 int off, ch;
798
799 /* For tr_TR.UTF-8 [[:islower:]] there is
800 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
801 in that case the whole multi-byte character and return
802 the original letter. On the other side, with
803 [[: DOTLESS SMALL LETTER I return [[:I, as doing
804 anything else would complicate things too much. */
805
806 if (!re_string_first_byte (pstr, pstr->cur_idx))
807 return re_string_fetch_byte (pstr);
808
809 off = pstr->offsets[pstr->cur_idx];
810 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
811
812 if (! isascii (ch))
813 return re_string_fetch_byte (pstr);
814
815 re_string_skip_bytes (pstr,
816 re_string_char_size_at (pstr, pstr->cur_idx));
817 return ch;
818 }
819 #endif
820
821 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
822 }
823
824 static void
825 re_string_destruct (pstr)
826 re_string_t *pstr;
827 {
828 #ifdef RE_ENABLE_I18N
829 re_free (pstr->wcs);
830 re_free (pstr->offsets);
831 #endif /* RE_ENABLE_I18N */
832 if (pstr->mbs_allocated)
833 re_free (pstr->mbs);
834 }
835
836 /* Return the context at IDX in INPUT. */
837
838 static unsigned int
839 re_string_context_at (input, idx, eflags)
840 const re_string_t *input;
841 int idx, eflags;
842 {
843 int c;
844 if (BE (idx < 0, 0))
845 /* In this case, we use the value stored in input->tip_context,
846 since we can't know the character in input->mbs[-1] here. */
847 return input->tip_context;
848 if (BE (idx == input->len, 0))
849 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
850 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
851 #ifdef RE_ENABLE_I18N
852 if (input->mb_cur_max > 1)
853 {
854 wint_t wc;
855 int wc_idx = idx;
856 while(input->wcs[wc_idx] == WEOF)
857 {
858 #ifdef DEBUG
859 /* It must not happen. */
860 assert (wc_idx >= 0);
861 #endif
862 --wc_idx;
863 if (wc_idx < 0)
864 return input->tip_context;
865 }
866 wc = input->wcs[wc_idx];
867 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
868 return CONTEXT_WORD;
869 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
870 ? CONTEXT_NEWLINE : 0);
871 }
872 else
873 #endif
874 {
875 c = re_string_byte_at (input, idx);
876 if (bitset_contain (input->word_char, c))
877 return CONTEXT_WORD;
878 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
879 }
880 }
881 \f
882 /* Functions for set operation. */
883
884 static reg_errcode_t
885 re_node_set_alloc (set, size)
886 re_node_set *set;
887 int size;
888 {
889 set->alloc = size;
890 set->nelem = 0;
891 set->elems = re_malloc (int, size);
892 if (BE (set->elems == NULL, 0))
893 return REG_ESPACE;
894 return REG_NOERROR;
895 }
896
897 static reg_errcode_t
898 re_node_set_init_1 (set, elem)
899 re_node_set *set;
900 int elem;
901 {
902 set->alloc = 1;
903 set->nelem = 1;
904 set->elems = re_malloc (int, 1);
905 if (BE (set->elems == NULL, 0))
906 {
907 set->alloc = set->nelem = 0;
908 return REG_ESPACE;
909 }
910 set->elems[0] = elem;
911 return REG_NOERROR;
912 }
913
914 static reg_errcode_t
915 re_node_set_init_2 (set, elem1, elem2)
916 re_node_set *set;
917 int elem1, elem2;
918 {
919 set->alloc = 2;
920 set->elems = re_malloc (int, 2);
921 if (BE (set->elems == NULL, 0))
922 return REG_ESPACE;
923 if (elem1 == elem2)
924 {
925 set->nelem = 1;
926 set->elems[0] = elem1;
927 }
928 else
929 {
930 set->nelem = 2;
931 if (elem1 < elem2)
932 {
933 set->elems[0] = elem1;
934 set->elems[1] = elem2;
935 }
936 else
937 {
938 set->elems[0] = elem2;
939 set->elems[1] = elem1;
940 }
941 }
942 return REG_NOERROR;
943 }
944
945 static reg_errcode_t
946 re_node_set_init_copy (dest, src)
947 re_node_set *dest;
948 const re_node_set *src;
949 {
950 dest->nelem = src->nelem;
951 if (src->nelem > 0)
952 {
953 dest->alloc = dest->nelem;
954 dest->elems = re_malloc (int, dest->alloc);
955 if (BE (dest->elems == NULL, 0))
956 {
957 dest->alloc = dest->nelem = 0;
958 return REG_ESPACE;
959 }
960 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
961 }
962 else
963 re_node_set_init_empty (dest);
964 return REG_NOERROR;
965 }
966
967 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
968 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
969 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
970
971 static reg_errcode_t
972 re_node_set_add_intersect (dest, src1, src2)
973 re_node_set *dest;
974 const re_node_set *src1, *src2;
975 {
976 int i1, i2, is, id, delta, sbase;
977 if (src1->nelem == 0 || src2->nelem == 0)
978 return REG_NOERROR;
979
980 /* We need dest->nelem + 2 * elems_in_intersection; this is a
981 conservative estimate. */
982 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
983 {
984 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
985 int *new_elems = re_realloc (dest->elems, int, new_alloc);
986 if (BE (new_elems == NULL, 0))
987 return REG_ESPACE;
988 dest->elems = new_elems;
989 dest->alloc = new_alloc;
990 }
991
992 /* Find the items in the intersection of SRC1 and SRC2, and copy
993 into the top of DEST those that are not already in DEST itself. */
994 sbase = dest->nelem + src1->nelem + src2->nelem;
995 i1 = src1->nelem - 1;
996 i2 = src2->nelem - 1;
997 id = dest->nelem - 1;
998 for (;;)
999 {
1000 if (src1->elems[i1] == src2->elems[i2])
1001 {
1002 /* Try to find the item in DEST. Maybe we could binary search? */
1003 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1004 --id;
1005
1006 if (id < 0 || dest->elems[id] != src1->elems[i1])
1007 dest->elems[--sbase] = src1->elems[i1];
1008
1009 if (--i1 < 0 || --i2 < 0)
1010 break;
1011 }
1012
1013 /* Lower the highest of the two items. */
1014 else if (src1->elems[i1] < src2->elems[i2])
1015 {
1016 if (--i2 < 0)
1017 break;
1018 }
1019 else
1020 {
1021 if (--i1 < 0)
1022 break;
1023 }
1024 }
1025
1026 id = dest->nelem - 1;
1027 is = dest->nelem + src1->nelem + src2->nelem - 1;
1028 delta = is - sbase + 1;
1029
1030 /* Now copy. When DELTA becomes zero, the remaining
1031 DEST elements are already in place; this is more or
1032 less the same loop that is in re_node_set_merge. */
1033 dest->nelem += delta;
1034 if (delta > 0 && id >= 0)
1035 for (;;)
1036 {
1037 if (dest->elems[is] > dest->elems[id])
1038 {
1039 /* Copy from the top. */
1040 dest->elems[id + delta--] = dest->elems[is--];
1041 if (delta == 0)
1042 break;
1043 }
1044 else
1045 {
1046 /* Slide from the bottom. */
1047 dest->elems[id + delta] = dest->elems[id];
1048 if (--id < 0)
1049 break;
1050 }
1051 }
1052
1053 /* Copy remaining SRC elements. */
1054 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1055
1056 return REG_NOERROR;
1057 }
1058
1059 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1060 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1061
1062 static reg_errcode_t
1063 re_node_set_init_union (dest, src1, src2)
1064 re_node_set *dest;
1065 const re_node_set *src1, *src2;
1066 {
1067 int i1, i2, id;
1068 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1069 {
1070 dest->alloc = src1->nelem + src2->nelem;
1071 dest->elems = re_malloc (int, dest->alloc);
1072 if (BE (dest->elems == NULL, 0))
1073 return REG_ESPACE;
1074 }
1075 else
1076 {
1077 if (src1 != NULL && src1->nelem > 0)
1078 return re_node_set_init_copy (dest, src1);
1079 else if (src2 != NULL && src2->nelem > 0)
1080 return re_node_set_init_copy (dest, src2);
1081 else
1082 re_node_set_init_empty (dest);
1083 return REG_NOERROR;
1084 }
1085 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1086 {
1087 if (src1->elems[i1] > src2->elems[i2])
1088 {
1089 dest->elems[id++] = src2->elems[i2++];
1090 continue;
1091 }
1092 if (src1->elems[i1] == src2->elems[i2])
1093 ++i2;
1094 dest->elems[id++] = src1->elems[i1++];
1095 }
1096 if (i1 < src1->nelem)
1097 {
1098 memcpy (dest->elems + id, src1->elems + i1,
1099 (src1->nelem - i1) * sizeof (int));
1100 id += src1->nelem - i1;
1101 }
1102 else if (i2 < src2->nelem)
1103 {
1104 memcpy (dest->elems + id, src2->elems + i2,
1105 (src2->nelem - i2) * sizeof (int));
1106 id += src2->nelem - i2;
1107 }
1108 dest->nelem = id;
1109 return REG_NOERROR;
1110 }
1111
1112 /* Calculate the union set of the sets DEST and SRC. And store it to
1113 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1114
1115 static reg_errcode_t
1116 re_node_set_merge (dest, src)
1117 re_node_set *dest;
1118 const re_node_set *src;
1119 {
1120 int is, id, sbase, delta;
1121 if (src == NULL || src->nelem == 0)
1122 return REG_NOERROR;
1123 if (dest->alloc < 2 * src->nelem + dest->nelem)
1124 {
1125 int new_alloc = 2 * (src->nelem + dest->alloc);
1126 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1127 if (BE (new_buffer == NULL, 0))
1128 return REG_ESPACE;
1129 dest->elems = new_buffer;
1130 dest->alloc = new_alloc;
1131 }
1132
1133 if (BE (dest->nelem == 0, 0))
1134 {
1135 dest->nelem = src->nelem;
1136 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1137 return REG_NOERROR;
1138 }
1139
1140 /* Copy into the top of DEST the items of SRC that are not
1141 found in DEST. Maybe we could binary search in DEST? */
1142 for (sbase = dest->nelem + 2 * src->nelem,
1143 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1144 {
1145 if (dest->elems[id] == src->elems[is])
1146 is--, id--;
1147 else if (dest->elems[id] < src->elems[is])
1148 dest->elems[--sbase] = src->elems[is--];
1149 else /* if (dest->elems[id] > src->elems[is]) */
1150 --id;
1151 }
1152
1153 if (is >= 0)
1154 {
1155 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1156 sbase -= is + 1;
1157 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1158 }
1159
1160 id = dest->nelem - 1;
1161 is = dest->nelem + 2 * src->nelem - 1;
1162 delta = is - sbase + 1;
1163 if (delta == 0)
1164 return REG_NOERROR;
1165
1166 /* Now copy. When DELTA becomes zero, the remaining
1167 DEST elements are already in place. */
1168 dest->nelem += delta;
1169 for (;;)
1170 {
1171 if (dest->elems[is] > dest->elems[id])
1172 {
1173 /* Copy from the top. */
1174 dest->elems[id + delta--] = dest->elems[is--];
1175 if (delta == 0)
1176 break;
1177 }
1178 else
1179 {
1180 /* Slide from the bottom. */
1181 dest->elems[id + delta] = dest->elems[id];
1182 if (--id < 0)
1183 {
1184 /* Copy remaining SRC elements. */
1185 memcpy (dest->elems, dest->elems + sbase,
1186 delta * sizeof (int));
1187 break;
1188 }
1189 }
1190 }
1191
1192 return REG_NOERROR;
1193 }
1194
1195 /* Insert the new element ELEM to the re_node_set* SET.
1196 SET should not already have ELEM.
1197 return -1 if an error is occured, return 1 otherwise. */
1198
1199 static int
1200 re_node_set_insert (set, elem)
1201 re_node_set *set;
1202 int elem;
1203 {
1204 int idx;
1205 /* In case the set is empty. */
1206 if (set->alloc == 0)
1207 {
1208 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1209 return 1;
1210 else
1211 return -1;
1212 }
1213
1214 if (BE (set->nelem, 0) == 0)
1215 {
1216 /* We already guaranteed above that set->alloc != 0. */
1217 set->elems[0] = elem;
1218 ++set->nelem;
1219 return 1;
1220 }
1221
1222 /* Realloc if we need. */
1223 if (set->alloc == set->nelem)
1224 {
1225 int *new_array;
1226 set->alloc = set->alloc * 2;
1227 new_array = re_realloc (set->elems, int, set->alloc);
1228 if (BE (new_array == NULL, 0))
1229 return -1;
1230 set->elems = new_array;
1231 }
1232
1233 /* Move the elements which follows the new element. Test the
1234 first element separately to skip a check in the inner loop. */
1235 if (elem < set->elems[0])
1236 {
1237 idx = 0;
1238 for (idx = set->nelem; idx > 0; idx--)
1239 set->elems[idx] = set->elems[idx - 1];
1240 }
1241 else
1242 {
1243 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1244 set->elems[idx] = set->elems[idx - 1];
1245 }
1246
1247 /* Insert the new element. */
1248 set->elems[idx] = elem;
1249 ++set->nelem;
1250 return 1;
1251 }
1252
1253 /* Insert the new element ELEM to the re_node_set* SET.
1254 SET should not already have any element greater than or equal to ELEM.
1255 Return -1 if an error is occured, return 1 otherwise. */
1256
1257 static int
1258 re_node_set_insert_last (set, elem)
1259 re_node_set *set;
1260 int elem;
1261 {
1262 /* Realloc if we need. */
1263 if (set->alloc == set->nelem)
1264 {
1265 int *new_array;
1266 set->alloc = (set->alloc + 1) * 2;
1267 new_array = re_realloc (set->elems, int, set->alloc);
1268 if (BE (new_array == NULL, 0))
1269 return -1;
1270 set->elems = new_array;
1271 }
1272
1273 /* Insert the new element. */
1274 set->elems[set->nelem++] = elem;
1275 return 1;
1276 }
1277
1278 /* Compare two node sets SET1 and SET2.
1279 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1280
1281 static int
1282 re_node_set_compare (set1, set2)
1283 const re_node_set *set1, *set2;
1284 {
1285 int i;
1286 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1287 return 0;
1288 for (i = set1->nelem ; --i >= 0 ; )
1289 if (set1->elems[i] != set2->elems[i])
1290 return 0;
1291 return 1;
1292 }
1293
1294 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1295
1296 static int
1297 re_node_set_contains (set, elem)
1298 const re_node_set *set;
1299 int elem;
1300 {
1301 int idx, right, mid;
1302 if (set->nelem <= 0)
1303 return 0;
1304
1305 /* Binary search the element. */
1306 idx = 0;
1307 right = set->nelem - 1;
1308 while (idx < right)
1309 {
1310 mid = (idx + right) / 2;
1311 if (set->elems[mid] < elem)
1312 idx = mid + 1;
1313 else
1314 right = mid;
1315 }
1316 return set->elems[idx] == elem ? idx + 1 : 0;
1317 }
1318
1319 static void
1320 re_node_set_remove_at (set, idx)
1321 re_node_set *set;
1322 int idx;
1323 {
1324 if (idx < 0 || idx >= set->nelem)
1325 return;
1326 --set->nelem;
1327 for (; idx < set->nelem; idx++)
1328 set->elems[idx] = set->elems[idx + 1];
1329 }
1330 \f
1331
1332 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1333 Or return -1, if an error will be occured. */
1334
1335 static int
1336 re_dfa_add_node (dfa, token, mode)
1337 re_dfa_t *dfa;
1338 re_token_t token;
1339 int mode;
1340 {
1341 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1342 {
1343 int new_nodes_alloc = dfa->nodes_alloc * 2;
1344 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1345 new_nodes_alloc);
1346 if (BE (new_array == NULL, 0))
1347 return -1;
1348 dfa->nodes = new_array;
1349 if (mode)
1350 {
1351 int *new_nexts, *new_indices;
1352 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1353
1354 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1355 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1356 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1357 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1358 new_nodes_alloc);
1359 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1360 new_nodes_alloc);
1361 if (BE (new_nexts == NULL || new_indices == NULL
1362 || new_edests == NULL || new_eclosures == NULL
1363 || new_inveclosures == NULL, 0))
1364 return -1;
1365 dfa->nexts = new_nexts;
1366 dfa->org_indices = new_indices;
1367 dfa->edests = new_edests;
1368 dfa->eclosures = new_eclosures;
1369 dfa->inveclosures = new_inveclosures;
1370 }
1371 dfa->nodes_alloc = new_nodes_alloc;
1372 }
1373 dfa->nodes[dfa->nodes_len] = token;
1374 dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1375 dfa->nodes[dfa->nodes_len].duplicated = 0;
1376 dfa->nodes[dfa->nodes_len].constraint = 0;
1377 return dfa->nodes_len++;
1378 }
1379
1380 static unsigned int inline
1381 calc_state_hash (nodes, context)
1382 const re_node_set *nodes;
1383 unsigned int context;
1384 {
1385 unsigned int hash = nodes->nelem + context;
1386 int i;
1387 for (i = 0 ; i < nodes->nelem ; i++)
1388 hash += nodes->elems[i];
1389 return hash;
1390 }
1391
1392 /* Search for the state whose node_set is equivalent to NODES.
1393 Return the pointer to the state, if we found it in the DFA.
1394 Otherwise create the new one and return it. In case of an error
1395 return NULL and set the error code in ERR.
1396 Note: - We assume NULL as the invalid state, then it is possible that
1397 return value is NULL and ERR is REG_NOERROR.
1398 - We never return non-NULL value in case of any errors, it is for
1399 optimization. */
1400
1401 static re_dfastate_t*
1402 re_acquire_state (err, dfa, nodes)
1403 reg_errcode_t *err;
1404 re_dfa_t *dfa;
1405 const re_node_set *nodes;
1406 {
1407 unsigned int hash;
1408 re_dfastate_t *new_state;
1409 struct re_state_table_entry *spot;
1410 int i;
1411 if (BE (nodes->nelem == 0, 0))
1412 {
1413 *err = REG_NOERROR;
1414 return NULL;
1415 }
1416 hash = calc_state_hash (nodes, 0);
1417 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1418
1419 for (i = 0 ; i < spot->num ; i++)
1420 {
1421 re_dfastate_t *state = spot->array[i];
1422 if (hash != state->hash)
1423 continue;
1424 if (re_node_set_compare (&state->nodes, nodes))
1425 return state;
1426 }
1427
1428 /* There are no appropriate state in the dfa, create the new one. */
1429 new_state = create_ci_newstate (dfa, nodes, hash);
1430 if (BE (new_state != NULL, 1))
1431 return new_state;
1432 else
1433 {
1434 *err = REG_ESPACE;
1435 return NULL;
1436 }
1437 }
1438
1439 /* Search for the state whose node_set is equivalent to NODES and
1440 whose context is equivalent to CONTEXT.
1441 Return the pointer to the state, if we found it in the DFA.
1442 Otherwise create the new one and return it. In case of an error
1443 return NULL and set the error code in ERR.
1444 Note: - We assume NULL as the invalid state, then it is possible that
1445 return value is NULL and ERR is REG_NOERROR.
1446 - We never return non-NULL value in case of any errors, it is for
1447 optimization. */
1448
1449 static re_dfastate_t*
1450 re_acquire_state_context (err, dfa, nodes, context)
1451 reg_errcode_t *err;
1452 re_dfa_t *dfa;
1453 const re_node_set *nodes;
1454 unsigned int context;
1455 {
1456 unsigned int hash;
1457 re_dfastate_t *new_state;
1458 struct re_state_table_entry *spot;
1459 int i;
1460 if (nodes->nelem == 0)
1461 {
1462 *err = REG_NOERROR;
1463 return NULL;
1464 }
1465 hash = calc_state_hash (nodes, context);
1466 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1467
1468 for (i = 0 ; i < spot->num ; i++)
1469 {
1470 re_dfastate_t *state = spot->array[i];
1471 if (state->hash == hash
1472 && state->context == context
1473 && re_node_set_compare (state->entrance_nodes, nodes))
1474 return state;
1475 }
1476 /* There are no appropriate state in `dfa', create the new one. */
1477 new_state = create_cd_newstate (dfa, nodes, context, hash);
1478 if (BE (new_state != NULL, 1))
1479 return new_state;
1480 else
1481 {
1482 *err = REG_ESPACE;
1483 return NULL;
1484 }
1485 }
1486
1487 /* Allocate memory for DFA state and initialize common properties.
1488 Return the new state if succeeded, otherwise return NULL. */
1489
1490 static re_dfastate_t *
1491 create_newstate_common (dfa, nodes, hash)
1492 re_dfa_t *dfa;
1493 const re_node_set *nodes;
1494 unsigned int hash;
1495 {
1496 re_dfastate_t *newstate;
1497 reg_errcode_t err;
1498 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1499 if (BE (newstate == NULL, 0))
1500 return NULL;
1501 err = re_node_set_init_copy (&newstate->nodes, nodes);
1502 if (BE (err != REG_NOERROR, 0))
1503 {
1504 re_free (newstate);
1505 return NULL;
1506 }
1507 newstate->trtable = NULL;
1508 newstate->hash = hash;
1509 return newstate;
1510 }
1511
1512 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1513 position. Return value indicate the error code if failed. */
1514
1515 static reg_errcode_t
1516 register_state (dfa, newstate, hash)
1517 re_dfa_t *dfa;
1518 re_dfastate_t *newstate;
1519 unsigned int hash;
1520 {
1521 struct re_state_table_entry *spot;
1522 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1523
1524 if (BE (spot->alloc <= spot->num, 0))
1525 {
1526 int new_alloc = 2 * spot->num + 2;
1527 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1528 new_alloc);
1529 if (BE (new_array == NULL, 0))
1530 return REG_ESPACE;
1531 spot->array = new_array;
1532 spot->alloc = new_alloc;
1533 }
1534 spot->array[spot->num++] = newstate;
1535 return REG_NOERROR;
1536 }
1537
1538 /* Create the new state which is independ of contexts.
1539 Return the new state if succeeded, otherwise return NULL. */
1540
1541 static re_dfastate_t *
1542 create_ci_newstate (dfa, nodes, hash)
1543 re_dfa_t *dfa;
1544 const re_node_set *nodes;
1545 unsigned int hash;
1546 {
1547 int i;
1548 reg_errcode_t err;
1549 re_dfastate_t *newstate;
1550 newstate = create_newstate_common (dfa, nodes, hash);
1551 if (BE (newstate == NULL, 0))
1552 return NULL;
1553 newstate->entrance_nodes = &newstate->nodes;
1554
1555 for (i = 0 ; i < nodes->nelem ; i++)
1556 {
1557 re_token_t *node = dfa->nodes + nodes->elems[i];
1558 re_token_type_t type = node->type;
1559 if (type == CHARACTER && !node->constraint)
1560 continue;
1561
1562 /* If the state has the halt node, the state is a halt state. */
1563 else if (type == END_OF_RE)
1564 newstate->halt = 1;
1565 #ifdef RE_ENABLE_I18N
1566 else if (type == COMPLEX_BRACKET
1567 || type == OP_UTF8_PERIOD
1568 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1569 newstate->accept_mb = 1;
1570 #endif /* RE_ENABLE_I18N */
1571 else if (type == OP_BACK_REF)
1572 newstate->has_backref = 1;
1573 else if (type == ANCHOR || node->constraint)
1574 newstate->has_constraint = 1;
1575 }
1576 err = register_state (dfa, newstate, hash);
1577 if (BE (err != REG_NOERROR, 0))
1578 {
1579 free_state (newstate);
1580 newstate = NULL;
1581 }
1582 return newstate;
1583 }
1584
1585 /* Create the new state which is depend on the context CONTEXT.
1586 Return the new state if succeeded, otherwise return NULL. */
1587
1588 static re_dfastate_t *
1589 create_cd_newstate (dfa, nodes, context, hash)
1590 re_dfa_t *dfa;
1591 const re_node_set *nodes;
1592 unsigned int context, hash;
1593 {
1594 int i, nctx_nodes = 0;
1595 reg_errcode_t err;
1596 re_dfastate_t *newstate;
1597
1598 newstate = create_newstate_common (dfa, nodes, hash);
1599 if (BE (newstate == NULL, 0))
1600 return NULL;
1601 newstate->context = context;
1602 newstate->entrance_nodes = &newstate->nodes;
1603
1604 for (i = 0 ; i < nodes->nelem ; i++)
1605 {
1606 unsigned int constraint = 0;
1607 re_token_t *node = dfa->nodes + nodes->elems[i];
1608 re_token_type_t type = node->type;
1609 if (node->constraint)
1610 constraint = node->constraint;
1611
1612 if (type == CHARACTER && !constraint)
1613 continue;
1614 /* If the state has the halt node, the state is a halt state. */
1615 else if (type == END_OF_RE)
1616 newstate->halt = 1;
1617 #ifdef RE_ENABLE_I18N
1618 else if (type == COMPLEX_BRACKET
1619 || type == OP_UTF8_PERIOD
1620 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1621 newstate->accept_mb = 1;
1622 #endif /* RE_ENABLE_I18N */
1623 else if (type == OP_BACK_REF)
1624 newstate->has_backref = 1;
1625 else if (type == ANCHOR)
1626 constraint = node->opr.ctx_type;
1627
1628 if (constraint)
1629 {
1630 if (newstate->entrance_nodes == &newstate->nodes)
1631 {
1632 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1633 if (BE (newstate->entrance_nodes == NULL, 0))
1634 {
1635 free_state (newstate);
1636 return NULL;
1637 }
1638 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1639 nctx_nodes = 0;
1640 newstate->has_constraint = 1;
1641 }
1642
1643 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1644 {
1645 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1646 ++nctx_nodes;
1647 }
1648 }
1649 }
1650 err = register_state (dfa, newstate, hash);
1651 if (BE (err != REG_NOERROR, 0))
1652 {
1653 free_state (newstate);
1654 newstate = NULL;
1655 }
1656 return newstate;
1657 }
1658
1659 static void
1660 free_state (state)
1661 re_dfastate_t *state;
1662 {
1663 if (state->entrance_nodes != &state->nodes)
1664 {
1665 re_node_set_free (state->entrance_nodes);
1666 re_free (state->entrance_nodes);
1667 }
1668 re_node_set_free (&state->nodes);
1669 re_free (state->trtable);
1670 re_free (state);
1671 }