]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regex_internal.c
misc/test-errno-linux: Handle EINVAL from quotactl
[thirdparty/glibc.git] / posix / regex_internal.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
688903eb 2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6 17 License along with the GNU C Library; if not, see
eb04c213 18 <https://www.gnu.org/licenses/>. */
3b0bdc72 19
eb04c213 20static void re_string_construct_common (const char *str, Idx len,
1b2c2628 21 re_string_t *pstr,
eb04c213 22 RE_TRANSLATE_TYPE trans, bool icase,
b41bd5bc 23 const re_dfa_t *dfa);
01ed6ceb 24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
1b2c2628 25 const re_node_set *nodes,
eb04c213 26 re_hashval_t hash);
01ed6ceb 27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
1b2c2628
UD
28 const re_node_set *nodes,
29 unsigned int context,
eb04c213
AZ
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33#ifdef RE_ENABLE_I18N
34static void build_wcs_buffer (re_string_t *pstr);
35static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36#endif /* RE_ENABLE_I18N */
37static void build_upper_buffer (re_string_t *pstr);
38static void re_string_translate_buffer (re_string_t *pstr);
39static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
3b0bdc72
UD
41\f
42/* Functions for string operation. */
43
612546c6
UD
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
3b0bdc72 47static reg_errcode_t
b41bd5bc 48__attribute_warn_unused_result__
eb04c213
AZ
49re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
3b0bdc72
UD
51{
52 reg_errcode_t ret;
eb04c213 53 Idx init_buf_len;
97fd3a30
UD
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
f0c7c524 59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
612546c6
UD
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
62 if (BE (ret != REG_NOERROR, 0))
63 return ret;
64
56b168be
UD
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
37369d1c
UD
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
612546c6
UD
70 return REG_NOERROR;
71}
72
73/* This function allocate the buffers, and initialize them. */
74
75static reg_errcode_t
b41bd5bc 76__attribute_warn_unused_result__
eb04c213
AZ
77re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
612546c6
UD
79{
80 reg_errcode_t ret;
56b168be 81 memset (pstr, '\0', sizeof (re_string_t));
f0c7c524 82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
612546c6
UD
83
84 if (len > 0)
3b0bdc72 85 {
612546c6 86 ret = re_string_realloc_buffers (pstr, len + 1);
bc15410e 87 if (BE (ret != REG_NOERROR, 0))
1b2c2628 88 return ret;
3b0bdc72 89 }
37369d1c 90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
612546c6
UD
91
92 if (icase)
93 {
94#ifdef RE_ENABLE_I18N
f0c7c524 95 if (dfa->mb_cur_max > 1)
37369d1c
UD
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
100 if (BE (ret != REG_NOERROR, 0))
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 if (BE (ret != REG_NOERROR, 0))
108 return ret;
109 }
110 }
612546c6 111 else
3b0bdc72 112#endif /* RE_ENABLE_I18N */
1b2c2628 113 build_upper_buffer (pstr);
612546c6
UD
114 }
115 else
3b0bdc72 116 {
612546c6 117#ifdef RE_ENABLE_I18N
f0c7c524 118 if (dfa->mb_cur_max > 1)
1b2c2628 119 build_wcs_buffer (pstr);
612546c6
UD
120 else
121#endif /* RE_ENABLE_I18N */
1b2c2628
UD
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
37369d1c
UD
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
1b2c2628 130 }
3b0bdc72 131 }
612546c6 132
3b0bdc72
UD
133 return REG_NOERROR;
134}
135
612546c6 136/* Helper functions for re_string_allocate, and re_string_construct. */
3b0bdc72
UD
137
138static reg_errcode_t
b41bd5bc 139__attribute_warn_unused_result__
eb04c213 140re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
3b0bdc72 141{
3b0bdc72 142#ifdef RE_ENABLE_I18N
3c0fb574 143 if (pstr->mb_cur_max > 1)
3b0bdc72 144 {
54dd0ab3
UD
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
eb04c213
AZ
148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
54dd0ab3
UD
150 return REG_ESPACE;
151
152 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
2d87db5b 153 if (BE (new_wcs == NULL, 0))
1b2c2628 154 return REG_ESPACE;
2d87db5b 155 pstr->wcs = new_wcs;
37369d1c
UD
156 if (pstr->offsets != NULL)
157 {
eb04c213 158 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
2d87db5b 159 if (BE (new_offsets == NULL, 0))
37369d1c 160 return REG_ESPACE;
2d87db5b 161 pstr->offsets = new_offsets;
37369d1c 162 }
3b0bdc72 163 }
3b0bdc72 164#endif /* RE_ENABLE_I18N */
37369d1c 165 if (pstr->mbs_allocated)
3b0bdc72 166 {
2d87db5b
UD
167 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
168 new_buf_len);
169 if (BE (new_mbs == NULL, 0))
1b2c2628 170 return REG_ESPACE;
2d87db5b 171 pstr->mbs = new_mbs;
3b0bdc72 172 }
612546c6 173 pstr->bufs_len = new_buf_len;
3b0bdc72
UD
174 return REG_NOERROR;
175}
176
612546c6 177
3b0bdc72 178static void
eb04c213
AZ
179re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
180 RE_TRANSLATE_TYPE trans, bool icase,
e2f55264 181 const re_dfa_t *dfa)
3b0bdc72 182{
c202c2c5 183 pstr->raw_mbs = (const unsigned char *) str;
3b0bdc72 184 pstr->len = len;
37369d1c 185 pstr->raw_len = len;
997470b3 186 pstr->trans = trans;
eb04c213 187 pstr->icase = icase;
37369d1c 188 pstr->mbs_allocated = (trans != NULL || icase);
f0c7c524
UD
189 pstr->mb_cur_max = dfa->mb_cur_max;
190 pstr->is_utf8 = dfa->is_utf8;
191 pstr->map_notascii = dfa->map_notascii;
37369d1c
UD
192 pstr->stop = pstr->len;
193 pstr->raw_stop = pstr->stop;
3b0bdc72
UD
194}
195
196#ifdef RE_ENABLE_I18N
197
612546c6 198/* Build wide character buffer PSTR->WCS.
3b0bdc72
UD
199 If the byte sequence of the string are:
200 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201 Then wide character buffer will be:
202 <wc1> , WEOF , <wc2> , WEOF , <wc3>
203 We use WEOF for padding, they indicate that the position isn't
612546c6 204 a first byte of a multibyte character.
3b0bdc72 205
612546c6
UD
206 Note that this function assumes PSTR->VALID_LEN elements are already
207 built and starts from PSTR->VALID_LEN. */
208
209static void
e2f55264 210build_wcs_buffer (re_string_t *pstr)
3b0bdc72 211{
37369d1c 212#ifdef _LIBC
ec73fd87
UD
213 unsigned char buf[MB_LEN_MAX];
214 assert (MB_LEN_MAX >= pstr->mb_cur_max);
37369d1c
UD
215#else
216 unsigned char buf[64];
217#endif
612546c6 218 mbstate_t prev_st;
eb04c213 219 Idx byte_idx, end_idx, remain_len;
88764ae2 220 size_t mbclen;
37369d1c 221
612546c6
UD
222 /* Build the buffers from pstr->valid_len to either pstr->len or
223 pstr->bufs_len. */
37369d1c 224 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
612546c6
UD
225 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
226 {
227 wchar_t wc;
37369d1c
UD
228 const char *p;
229
612546c6
UD
230 remain_len = end_idx - byte_idx;
231 prev_st = pstr->cur_state;
37369d1c
UD
232 /* Apply the translation if we need. */
233 if (BE (pstr->trans != NULL, 0))
234 {
235 int i, ch;
236
237 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
238 {
239 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
d40eb37a 240 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
37369d1c
UD
241 }
242 p = (const char *) buf;
243 }
244 else
245 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
b3918c7d 246 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
8887a920
UD
247 if (BE (mbclen == (size_t) -1 || mbclen == 0
248 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
1b2c2628
UD
249 {
250 /* We treat these cases as a singlebyte character. */
251 mbclen = 1;
252 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
37369d1c
UD
253 if (BE (pstr->trans != NULL, 0))
254 wc = pstr->trans[wc];
1b2c2628
UD
255 pstr->cur_state = prev_st;
256 }
8887a920
UD
257 else if (BE (mbclen == (size_t) -2, 0))
258 {
259 /* The buffer doesn't have enough space, finish to build. */
260 pstr->cur_state = prev_st;
261 break;
262 }
612546c6 263
3b0bdc72 264 /* Write wide character and padding. */
612546c6
UD
265 pstr->wcs[byte_idx++] = wc;
266 /* Write paddings. */
267 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1b2c2628 268 pstr->wcs[byte_idx++] = WEOF;
3b0bdc72 269 }
612546c6 270 pstr->valid_len = byte_idx;
37369d1c 271 pstr->valid_raw_len = byte_idx;
3b0bdc72
UD
272}
273
612546c6
UD
274/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
275 but for REG_ICASE. */
276
4f7e7f8e 277static reg_errcode_t
b41bd5bc 278__attribute_warn_unused_result__
e2f55264 279build_wcs_upper_buffer (re_string_t *pstr)
3b0bdc72 280{
612546c6 281 mbstate_t prev_st;
eb04c213 282 Idx src_idx, byte_idx, end_idx, remain_len;
88764ae2 283 size_t mbclen;
37369d1c 284#ifdef _LIBC
ec73fd87
UD
285 char buf[MB_LEN_MAX];
286 assert (MB_LEN_MAX >= pstr->mb_cur_max);
37369d1c 287#else
1c99f950 288 char buf[64];
37369d1c
UD
289#endif
290
291 byte_idx = pstr->valid_len;
292 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
f0c7c524 293
e40a38b3
UD
294 /* The following optimization assumes that ASCII characters can be
295 mapped to wide characters with a simple cast. */
37369d1c
UD
296 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297 {
298 while (byte_idx < end_idx)
1b2c2628 299 {
f0c7c524 300 wchar_t wc;
37369d1c
UD
301
302 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303 && mbsinit (&pstr->cur_state))
304 {
305 /* In case of a singlebyte character. */
306 pstr->mbs[byte_idx]
307 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308 /* The next step uses the assumption that wchar_t is encoded
e40a38b3 309 ASCII-safe: all ASCII values can be converted like this. */
37369d1c
UD
310 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311 ++byte_idx;
312 continue;
313 }
314
f0c7c524
UD
315 remain_len = end_idx - byte_idx;
316 prev_st = pstr->cur_state;
b3918c7d
UD
317 mbclen = __mbrtowc (&wc,
318 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319 + byte_idx), remain_len, &pstr->cur_state);
58559f14 320 if (BE (0 < mbclen && mbclen < (size_t) -2, 1))
1b2c2628 321 {
eb04c213
AZ
322 wchar_t wcu = __towupper (wc);
323 if (wcu != wc)
37369d1c 324 {
88764ae2 325 size_t mbcdlen;
37369d1c 326
a5f0adb3 327 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
37369d1c
UD
328 if (BE (mbclen == mbcdlen, 1))
329 memcpy (pstr->mbs + byte_idx, buf, mbclen);
330 else
331 {
332 src_idx = byte_idx;
333 goto offsets_needed;
334 }
335 }
f0c7c524
UD
336 else
337 memcpy (pstr->mbs + byte_idx,
338 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
37369d1c 339 pstr->wcs[byte_idx++] = wcu;
f0c7c524
UD
340 /* Write paddings. */
341 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
342 pstr->wcs[byte_idx++] = WEOF;
343 }
8887a920
UD
344 else if (mbclen == (size_t) -1 || mbclen == 0
345 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
f0c7c524 346 {
8887a920
UD
347 /* It is an invalid character, an incomplete character
348 at the end of the string, or '\0'. Just use the byte. */
f0c7c524 349 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
02b50340
UD
350 pstr->mbs[byte_idx] = ch;
351 /* And also cast it to wide char. */
352 pstr->wcs[byte_idx++] = (wchar_t) ch;
f0c7c524
UD
353 if (BE (mbclen == (size_t) -1, 0))
354 pstr->cur_state = prev_st;
1b2c2628 355 }
1b2c2628 356 else
f0c7c524
UD
357 {
358 /* The buffer doesn't have enough space, finish to build. */
359 pstr->cur_state = prev_st;
360 break;
361 }
1b2c2628 362 }
37369d1c
UD
363 pstr->valid_len = byte_idx;
364 pstr->valid_raw_len = byte_idx;
365 return REG_NOERROR;
366 }
f0c7c524 367 else
37369d1c 368 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
f0c7c524
UD
369 {
370 wchar_t wc;
37369d1c 371 const char *p;
e40a38b3 372 offsets_needed:
f0c7c524
UD
373 remain_len = end_idx - byte_idx;
374 prev_st = pstr->cur_state;
37369d1c 375 if (BE (pstr->trans != NULL, 0))
f0c7c524 376 {
37369d1c
UD
377 int i, ch;
378
379 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
f0c7c524 380 {
37369d1c
UD
381 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382 buf[i] = pstr->trans[ch];
f0c7c524 383 }
37369d1c 384 p = (const char *) buf;
f0c7c524 385 }
37369d1c
UD
386 else
387 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
b3918c7d 388 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
58559f14 389 if (BE (0 < mbclen && mbclen < (size_t) -2, 1))
f0c7c524 390 {
eb04c213
AZ
391 wchar_t wcu = __towupper (wc);
392 if (wcu != wc)
37369d1c 393 {
88764ae2 394 size_t mbcdlen;
37369d1c 395
a5f0adb3 396 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
37369d1c
UD
397 if (BE (mbclen == mbcdlen, 1))
398 memcpy (pstr->mbs + byte_idx, buf, mbclen);
88764ae2 399 else if (mbcdlen != (size_t) -1)
37369d1c 400 {
88764ae2 401 size_t i;
37369d1c
UD
402
403 if (byte_idx + mbcdlen > pstr->bufs_len)
404 {
405 pstr->cur_state = prev_st;
406 break;
407 }
408
409 if (pstr->offsets == NULL)
410 {
eb04c213 411 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
37369d1c
UD
412
413 if (pstr->offsets == NULL)
414 return REG_ESPACE;
415 }
416 if (!pstr->offsets_needed)
417 {
88764ae2 418 for (i = 0; i < (size_t) byte_idx; ++i)
37369d1c
UD
419 pstr->offsets[i] = i;
420 pstr->offsets_needed = 1;
421 }
422
423 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
424 pstr->wcs[byte_idx] = wcu;
425 pstr->offsets[byte_idx] = src_idx;
426 for (i = 1; i < mbcdlen; ++i)
427 {
428 pstr->offsets[byte_idx + i]
429 = src_idx + (i < mbclen ? i : mbclen - 1);
430 pstr->wcs[byte_idx + i] = WEOF;
431 }
432 pstr->len += mbcdlen - mbclen;
433 if (pstr->raw_stop > src_idx)
434 pstr->stop += mbcdlen - mbclen;
435 end_idx = (pstr->bufs_len > pstr->len)
436 ? pstr->len : pstr->bufs_len;
437 byte_idx += mbcdlen;
438 src_idx += mbclen;
439 continue;
440 }
2da42bc0
UD
441 else
442 memcpy (pstr->mbs + byte_idx, p, mbclen);
37369d1c 443 }
f0c7c524 444 else
37369d1c
UD
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
446
447 if (BE (pstr->offsets_needed != 0, 0))
448 {
88764ae2 449 size_t i;
37369d1c
UD
450 for (i = 0; i < mbclen; ++i)
451 pstr->offsets[byte_idx + i] = src_idx + i;
452 }
453 src_idx += mbclen;
454
455 pstr->wcs[byte_idx++] = wcu;
f0c7c524
UD
456 /* Write paddings. */
457 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
458 pstr->wcs[byte_idx++] = WEOF;
459 }
8887a920
UD
460 else if (mbclen == (size_t) -1 || mbclen == 0
461 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
02b50340 462 {
37369d1c
UD
463 /* It is an invalid character or '\0'. Just use the byte. */
464 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
465
466 if (BE (pstr->trans != NULL, 0))
467 ch = pstr->trans [ch];
02b50340 468 pstr->mbs[byte_idx] = ch;
37369d1c
UD
469
470 if (BE (pstr->offsets_needed != 0, 0))
471 pstr->offsets[byte_idx] = src_idx;
472 ++src_idx;
473
02b50340
UD
474 /* And also cast it to wide char. */
475 pstr->wcs[byte_idx++] = (wchar_t) ch;
476 if (BE (mbclen == (size_t) -1, 0))
477 pstr->cur_state = prev_st;
478 }
f0c7c524
UD
479 else
480 {
481 /* The buffer doesn't have enough space, finish to build. */
482 pstr->cur_state = prev_st;
483 break;
484 }
485 }
612546c6 486 pstr->valid_len = byte_idx;
37369d1c
UD
487 pstr->valid_raw_len = src_idx;
488 return REG_NOERROR;
612546c6
UD
489}
490
491/* Skip characters until the index becomes greater than NEW_RAW_IDX.
492 Return the index. */
493
eb04c213
AZ
494static Idx
495re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
612546c6
UD
496{
497 mbstate_t prev_st;
eb04c213 498 Idx rawbuf_idx;
88764ae2 499 size_t mbclen;
4f08104c 500 wint_t wc = WEOF;
612546c6
UD
501
502 /* Skip the characters which are not necessary to check. */
37369d1c 503 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
612546c6
UD
504 rawbuf_idx < new_raw_idx;)
505 {
4f08104c 506 wchar_t wc2;
eb04c213 507 Idx remain_len = pstr->raw_len - rawbuf_idx;
612546c6 508 prev_st = pstr->cur_state;
4f08104c 509 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
b3918c7d 510 remain_len, &pstr->cur_state);
eb04c213 511 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
1b2c2628 512 {
33e63e79
UD
513 /* We treat these cases as a single byte character. */
514 if (mbclen == 0 || remain_len == 0)
515 wc = L'\0';
516 else
517 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
1b2c2628
UD
518 mbclen = 1;
519 pstr->cur_state = prev_st;
520 }
4f08104c 521 else
eb04c213 522 wc = wc2;
612546c6
UD
523 /* Then proceed the next character. */
524 rawbuf_idx += mbclen;
525 }
4f08104c 526 *last_wc = wc;
612546c6 527 return rawbuf_idx;
3b0bdc72
UD
528}
529#endif /* RE_ENABLE_I18N */
530
1b2c2628 531/* Build the buffer PSTR->MBS, and apply the translation if we need.
612546c6
UD
532 This function is used in case of REG_ICASE. */
533
534static void
e2f55264 535build_upper_buffer (re_string_t *pstr)
3b0bdc72 536{
eb04c213 537 Idx char_idx, end_idx;
612546c6 538 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
3b0bdc72 539
612546c6 540 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
3b0bdc72 541 {
612546c6 542 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
37369d1c
UD
543 if (BE (pstr->trans != NULL, 0))
544 ch = pstr->trans[ch];
eb04c213 545 pstr->mbs[char_idx] = toupper (ch);
3b0bdc72 546 }
612546c6 547 pstr->valid_len = char_idx;
37369d1c 548 pstr->valid_raw_len = char_idx;
3b0bdc72
UD
549}
550
612546c6 551/* Apply TRANS to the buffer in PSTR. */
3b0bdc72 552
612546c6 553static void
e2f55264 554re_string_translate_buffer (re_string_t *pstr)
3b0bdc72 555{
eb04c213 556 Idx buf_idx, end_idx;
612546c6
UD
557 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
3b0bdc72 560 {
612546c6 561 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
37369d1c 562 pstr->mbs[buf_idx] = pstr->trans[ch];
3b0bdc72 563 }
612546c6
UD
564
565 pstr->valid_len = buf_idx;
37369d1c 566 pstr->valid_raw_len = buf_idx;
612546c6
UD
567}
568
569/* This function re-construct the buffers.
3c0fb574 570 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
612546c6
UD
571 convert to upper case in case of REG_ICASE, apply translation. */
572
573static reg_errcode_t
b41bd5bc 574__attribute_warn_unused_result__
eb04c213 575re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
612546c6 576{
eb04c213
AZ
577 Idx offset;
578
579 if (BE (pstr->raw_mbs_idx <= idx, 0))
580 offset = idx - pstr->raw_mbs_idx;
581 else
612546c6
UD
582 {
583 /* Reset buffer. */
434d3784 584#ifdef RE_ENABLE_I18N
3c0fb574 585 if (pstr->mb_cur_max > 1)
1b2c2628 586 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
434d3784 587#endif /* RE_ENABLE_I18N */
37369d1c
UD
588 pstr->len = pstr->raw_len;
589 pstr->stop = pstr->raw_stop;
590 pstr->valid_len = 0;
591 pstr->raw_mbs_idx = 0;
592 pstr->valid_raw_len = 0;
593 pstr->offsets_needed = 0;
612546c6 594 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1b2c2628 595 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
37369d1c 596 if (!pstr->mbs_allocated)
1b2c2628 597 pstr->mbs = (unsigned char *) pstr->raw_mbs;
612546c6
UD
598 offset = idx;
599 }
600
bb677c95 601 if (BE (offset != 0, 1))
612546c6 602 {
ba40cc15
UD
603 /* Should the already checked characters be kept? */
604 if (BE (offset < pstr->valid_raw_len, 1))
a334319f
UD
605 {
606 /* Yes, move them to the front of the buffer. */
0ecb606c 607#ifdef RE_ENABLE_I18N
ba40cc15
UD
608 if (BE (pstr->offsets_needed, 0))
609 {
eb04c213 610 Idx low = 0, high = pstr->valid_len, mid;
ba40cc15
UD
611 do
612 {
613 mid = (high + low) / 2;
614 if (pstr->offsets[mid] > offset)
615 high = mid;
616 else if (pstr->offsets[mid] < offset)
617 low = mid + 1;
618 else
619 break;
620 }
621 while (low < high);
622 if (pstr->offsets[mid] < offset)
623 ++mid;
624 pstr->tip_context = re_string_context_at (pstr, mid - 1,
625 eflags);
626 /* This can be quite complicated, so handle specially
627 only the common and easy case where the character with
628 different length representation of lower and upper
629 case is present at or after offset. */
630 if (pstr->valid_len > offset
631 && mid == offset && pstr->offsets[mid] == offset)
632 {
633 memmove (pstr->wcs, pstr->wcs + offset,
634 (pstr->valid_len - offset) * sizeof (wint_t));
635 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636 pstr->valid_len -= offset;
637 pstr->valid_raw_len -= offset;
638 for (low = 0; low < pstr->valid_len; low++)
639 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640 }
641 else
642 {
643 /* Otherwise, just find out how long the partial multibyte
644 character at offset is and fill it with WEOF/255. */
645 pstr->len = pstr->raw_len - idx + offset;
646 pstr->stop = pstr->raw_stop - idx + offset;
647 pstr->offsets_needed = 0;
648 while (mid > 0 && pstr->offsets[mid - 1] == offset)
649 --mid;
650 while (mid < pstr->valid_len)
651 if (pstr->wcs[mid] != WEOF)
652 break;
653 else
654 ++mid;
655 if (mid == pstr->valid_len)
656 pstr->valid_len = 0;
657 else
658 {
659 pstr->valid_len = pstr->offsets[mid] - offset;
660 if (pstr->valid_len)
661 {
662 for (low = 0; low < pstr->valid_len; ++low)
663 pstr->wcs[low] = WEOF;
664 memset (pstr->mbs, 255, pstr->valid_len);
665 }
666 }
667 pstr->valid_raw_len = pstr->valid_len;
668 }
669 }
670 else
671#endif
672 {
673 pstr->tip_context = re_string_context_at (pstr, offset - 1,
674 eflags);
675#ifdef RE_ENABLE_I18N
676 if (pstr->mb_cur_max > 1)
677 memmove (pstr->wcs, pstr->wcs + offset,
678 (pstr->valid_len - offset) * sizeof (wint_t));
612546c6 679#endif /* RE_ENABLE_I18N */
ba40cc15
UD
680 if (BE (pstr->mbs_allocated, 0))
681 memmove (pstr->mbs, pstr->mbs + offset,
682 pstr->valid_len - offset);
683 pstr->valid_len -= offset;
684 pstr->valid_raw_len -= offset;
78dd658a 685#if defined DEBUG && DEBUG
ba40cc15 686 assert (pstr->valid_len > 0);
3b0bdc72 687#endif
ba40cc15 688 }
1b2c2628 689 }
612546c6 690 else
1b2c2628 691 {
ed8ae46b 692#ifdef RE_ENABLE_I18N
1b2c2628 693 /* No, skip all characters until IDX. */
eb04c213 694 Idx prev_valid_len = pstr->valid_len;
ba40cc15 695
37369d1c
UD
696 if (BE (pstr->offsets_needed, 0))
697 {
698 pstr->len = pstr->raw_len - idx + offset;
699 pstr->stop = pstr->raw_stop - idx + offset;
700 pstr->offsets_needed = 0;
701 }
702#endif
1b2c2628 703 pstr->valid_len = 0;
3b0bdc72 704#ifdef RE_ENABLE_I18N
3c0fb574 705 if (pstr->mb_cur_max > 1)
1b2c2628 706 {
eb04c213 707 Idx wcs_idx;
3c0fb574
UD
708 wint_t wc = WEOF;
709
710 if (pstr->is_utf8)
711 {
0dae5d4e 712 const unsigned char *raw, *p, *end;
3c0fb574
UD
713
714 /* Special case UTF-8. Multi-byte chars start with any
715 byte other than 0x80 - 0xbf. */
716 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
97fd3a30 717 end = raw + (offset - pstr->mb_cur_max);
ba40cc15
UD
718 if (end < pstr->raw_mbs)
719 end = pstr->raw_mbs;
01ed6ceb
UD
720 p = raw + offset - 1;
721#ifdef _LIBC
722 /* We know the wchar_t encoding is UCS4, so for the simple
723 case, ASCII characters, skip the conversion step. */
724 if (isascii (*p) && BE (pstr->trans == NULL, 1))
725 {
726 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
ba40cc15 727 /* pstr->valid_len = 0; */
01ed6ceb
UD
728 wc = (wchar_t) *p;
729 }
730 else
731#endif
732 for (; p >= end; --p)
733 if ((*p & 0xc0) != 0x80)
734 {
735 mbstate_t cur_state;
736 wchar_t wc2;
eb04c213 737 Idx mlen = raw + pstr->len - p;
01ed6ceb
UD
738 unsigned char buf[6];
739 size_t mbclen;
740
5e2b63c6 741 const unsigned char *pp = p;
01ed6ceb
UD
742 if (BE (pstr->trans != NULL, 0))
743 {
744 int i = mlen < 6 ? mlen : 6;
745 while (--i >= 0)
746 buf[i] = pstr->trans[p[i]];
5e2b63c6 747 pp = buf;
01ed6ceb
UD
748 }
749 /* XXX Don't use mbrtowc, we know which conversion
750 to use (UTF-8 -> UCS4). */
751 memset (&cur_state, 0, sizeof (cur_state));
5e2b63c6 752 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
b3918c7d 753 &cur_state);
01ed6ceb
UD
754 if (raw + offset - p <= mbclen
755 && mbclen < (size_t) -2)
756 {
757 memset (&pstr->cur_state, '\0',
758 sizeof (mbstate_t));
759 pstr->valid_len = mbclen - (raw + offset - p);
760 wc = wc2;
761 }
762 break;
763 }
3c0fb574 764 }
e40a38b3 765
3c0fb574 766 if (wc == WEOF)
97fd3a30 767 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
33e63e79
UD
768 if (wc == WEOF)
769 pstr->tip_context
ba40cc15 770 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
33e63e79
UD
771 else
772 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 && IS_WIDE_WORD_CHAR (wc))
774 ? CONTEXT_WORD
775 : ((IS_WIDE_NEWLINE (wc)
776 && pstr->newline_anchor)
777 ? CONTEXT_NEWLINE : 0));
37369d1c
UD
778 if (BE (pstr->valid_len, 0))
779 {
780 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 pstr->wcs[wcs_idx] = WEOF;
782 if (pstr->mbs_allocated)
783 memset (pstr->mbs, 255, pstr->valid_len);
784 }
785 pstr->valid_raw_len = pstr->valid_len;
1b2c2628 786 }
1843975c 787 else
612546c6 788#endif /* RE_ENABLE_I18N */
1843975c
RM
789 {
790 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
33e63e79 791 pstr->valid_raw_len = 0;
1843975c
RM
792 if (pstr->trans)
793 c = pstr->trans[c];
56b168be
UD
794 pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 ? CONTEXT_WORD
796 : ((IS_NEWLINE (c) && pstr->newline_anchor)
1843975c
RM
797 ? CONTEXT_NEWLINE : 0));
798 }
1b2c2628 799 }
bb677c95 800 if (!BE (pstr->mbs_allocated, 0))
37369d1c 801 pstr->mbs += offset;
3b0bdc72 802 }
612546c6
UD
803 pstr->raw_mbs_idx = idx;
804 pstr->len -= offset;
ac3d553b 805 pstr->stop -= offset;
612546c6
UD
806
807 /* Then build the buffers. */
808#ifdef RE_ENABLE_I18N
3c0fb574 809 if (pstr->mb_cur_max > 1)
3b0bdc72 810 {
612546c6 811 if (pstr->icase)
37369d1c 812 {
4f7e7f8e 813 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
37369d1c
UD
814 if (BE (ret != REG_NOERROR, 0))
815 return ret;
816 }
612546c6 817 else
1b2c2628 818 build_wcs_buffer (pstr);
3b0bdc72
UD
819 }
820 else
612546c6 821#endif /* RE_ENABLE_I18N */
01ed6ceb
UD
822 if (BE (pstr->mbs_allocated, 0))
823 {
824 if (pstr->icase)
825 build_upper_buffer (pstr);
826 else if (pstr->trans != NULL)
827 re_string_translate_buffer (pstr);
828 }
829 else
830 pstr->valid_len = pstr->len;
612546c6 831
bb677c95 832 pstr->cur_idx = 0;
3b0bdc72
UD
833 return REG_NOERROR;
834}
835
37369d1c 836static unsigned char
eb04c213
AZ
837__attribute__ ((pure))
838re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
37369d1c 839{
eb04c213
AZ
840 int ch;
841 Idx off;
37369d1c
UD
842
843 /* Handle the common (easiest) cases first. */
f39eef7b 844 if (BE (!pstr->mbs_allocated, 1))
37369d1c
UD
845 return re_string_peek_byte (pstr, idx);
846
847#ifdef RE_ENABLE_I18N
848 if (pstr->mb_cur_max > 1
849 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850 return re_string_peek_byte (pstr, idx);
851#endif
852
853 off = pstr->cur_idx + idx;
854#ifdef RE_ENABLE_I18N
855 if (pstr->offsets_needed)
856 off = pstr->offsets[off];
857#endif
858
859 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
37369d1c
UD
860
861#ifdef RE_ENABLE_I18N
862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863 this function returns CAPITAL LETTER I instead of first byte of
864 DOTLESS SMALL LETTER I. The latter would confuse the parser,
865 since peek_byte_case doesn't advance cur_idx in any way. */
866 if (pstr->offsets_needed && !isascii (ch))
867 return re_string_peek_byte (pstr, idx);
868#endif
869
870 return ch;
871}
872
873static unsigned char
e2f55264 874re_string_fetch_byte_case (re_string_t *pstr)
37369d1c 875{
f39eef7b 876 if (BE (!pstr->mbs_allocated, 1))
37369d1c
UD
877 return re_string_fetch_byte (pstr);
878
879#ifdef RE_ENABLE_I18N
880 if (pstr->offsets_needed)
881 {
eb04c213
AZ
882 Idx off;
883 int ch;
c0d5034e 884
37369d1c
UD
885 /* For tr_TR.UTF-8 [[:islower:]] there is
886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 in that case the whole multi-byte character and return
888 the original letter. On the other side, with
889 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 anything else would complicate things too much. */
891
892 if (!re_string_first_byte (pstr, pstr->cur_idx))
893 return re_string_fetch_byte (pstr);
894
895 off = pstr->offsets[pstr->cur_idx];
896 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
37369d1c
UD
897
898 if (! isascii (ch))
899 return re_string_fetch_byte (pstr);
900
901 re_string_skip_bytes (pstr,
902 re_string_char_size_at (pstr, pstr->cur_idx));
903 return ch;
904 }
905#endif
906
f39eef7b 907 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
37369d1c
UD
908}
909
3b0bdc72 910static void
e2f55264 911re_string_destruct (re_string_t *pstr)
3b0bdc72
UD
912{
913#ifdef RE_ENABLE_I18N
914 re_free (pstr->wcs);
37369d1c 915 re_free (pstr->offsets);
3b0bdc72 916#endif /* RE_ENABLE_I18N */
37369d1c 917 if (pstr->mbs_allocated)
612546c6 918 re_free (pstr->mbs);
3b0bdc72
UD
919}
920
921/* Return the context at IDX in INPUT. */
612546c6 922
3b0bdc72 923static unsigned int
eb04c213 924re_string_context_at (const re_string_t *input, Idx idx, int eflags)
3b0bdc72
UD
925{
926 int c;
bb677c95
UD
927 if (BE (idx < 0, 0))
928 /* In this case, we use the value stored in input->tip_context,
929 since we can't know the character in input->mbs[-1] here. */
930 return input->tip_context;
931 if (BE (idx == input->len, 0))
932 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
933 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
a0756231 934#ifdef RE_ENABLE_I18N
3c0fb574 935 if (input->mb_cur_max > 1)
1843975c
RM
936 {
937 wint_t wc;
eb04c213 938 Idx wc_idx = idx;
1843975c
RM
939 while(input->wcs[wc_idx] == WEOF)
940 {
78dd658a 941#if defined DEBUG && DEBUG
1843975c
RM
942 /* It must not happen. */
943 assert (wc_idx >= 0);
944#endif
945 --wc_idx;
946 if (wc_idx < 0)
947 return input->tip_context;
948 }
949 wc = input->wcs[wc_idx];
56b168be 950 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
1843975c 951 return CONTEXT_WORD;
56b168be
UD
952 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 ? CONTEXT_NEWLINE : 0);
1843975c 954 }
a0756231
RM
955 else
956#endif
957 {
958 c = re_string_byte_at (input, idx);
56b168be 959 if (bitset_contain (input->word_char, c))
a0756231 960 return CONTEXT_WORD;
56b168be 961 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
a0756231 962 }
3b0bdc72
UD
963}
964\f
965/* Functions for set operation. */
966
967static reg_errcode_t
b41bd5bc 968__attribute_warn_unused_result__
eb04c213 969re_node_set_alloc (re_node_set *set, Idx size)
3b0bdc72
UD
970{
971 set->alloc = size;
972 set->nelem = 0;
eb04c213
AZ
973 set->elems = re_malloc (Idx, size);
974 if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
3b0bdc72
UD
975 return REG_ESPACE;
976 return REG_NOERROR;
977}
978
979static reg_errcode_t
b41bd5bc 980__attribute_warn_unused_result__
eb04c213 981re_node_set_init_1 (re_node_set *set, Idx elem)
3b0bdc72
UD
982{
983 set->alloc = 1;
984 set->nelem = 1;
eb04c213 985 set->elems = re_malloc (Idx, 1);
bc15410e 986 if (BE (set->elems == NULL, 0))
6291ee3c
UD
987 {
988 set->alloc = set->nelem = 0;
989 return REG_ESPACE;
990 }
3b0bdc72
UD
991 set->elems[0] = elem;
992 return REG_NOERROR;
993}
994
995static reg_errcode_t
b41bd5bc 996__attribute_warn_unused_result__
eb04c213 997re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
3b0bdc72
UD
998{
999 set->alloc = 2;
eb04c213 1000 set->elems = re_malloc (Idx, 2);
bc15410e 1001 if (BE (set->elems == NULL, 0))
3b0bdc72
UD
1002 return REG_ESPACE;
1003 if (elem1 == elem2)
1004 {
1005 set->nelem = 1;
1006 set->elems[0] = elem1;
1007 }
1008 else
1009 {
1010 set->nelem = 2;
1011 if (elem1 < elem2)
1b2c2628
UD
1012 {
1013 set->elems[0] = elem1;
1014 set->elems[1] = elem2;
1015 }
3b0bdc72 1016 else
1b2c2628
UD
1017 {
1018 set->elems[0] = elem2;
1019 set->elems[1] = elem1;
1020 }
3b0bdc72
UD
1021 }
1022 return REG_NOERROR;
1023}
1024
1025static reg_errcode_t
b41bd5bc 1026__attribute_warn_unused_result__
e2f55264 1027re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
3b0bdc72
UD
1028{
1029 dest->nelem = src->nelem;
1030 if (src->nelem > 0)
1031 {
1032 dest->alloc = dest->nelem;
eb04c213 1033 dest->elems = re_malloc (Idx, dest->alloc);
bc15410e 1034 if (BE (dest->elems == NULL, 0))
6291ee3c
UD
1035 {
1036 dest->alloc = dest->nelem = 0;
1037 return REG_ESPACE;
1038 }
eb04c213 1039 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
3b0bdc72
UD
1040 }
1041 else
1042 re_node_set_init_empty (dest);
1043 return REG_NOERROR;
1044}
1045
a9388965
UD
1046/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1047 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1049
3b0bdc72 1050static reg_errcode_t
b41bd5bc 1051__attribute_warn_unused_result__
e2f55264
UD
1052re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1053 const re_node_set *src2)
3b0bdc72 1054{
eb04c213 1055 Idx i1, i2, is, id, delta, sbase;
59e7ebcc
UD
1056 if (src1->nelem == 0 || src2->nelem == 0)
1057 return REG_NOERROR;
1058
1059 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1060 conservative estimate. */
1061 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
3b0bdc72 1062 {
eb04c213
AZ
1063 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
59e7ebcc 1065 if (BE (new_elems == NULL, 0))
2da42bc0 1066 return REG_ESPACE;
59e7ebcc
UD
1067 dest->elems = new_elems;
1068 dest->alloc = new_alloc;
3b0bdc72 1069 }
3b0bdc72 1070
59e7ebcc
UD
1071 /* Find the items in the intersection of SRC1 and SRC2, and copy
1072 into the top of DEST those that are not already in DEST itself. */
1073 sbase = dest->nelem + src1->nelem + src2->nelem;
1074 i1 = src1->nelem - 1;
1075 i2 = src2->nelem - 1;
1076 id = dest->nelem - 1;
1077 for (;;)
3b0bdc72 1078 {
59e7ebcc 1079 if (src1->elems[i1] == src2->elems[i2])
1b2c2628 1080 {
59e7ebcc
UD
1081 /* Try to find the item in DEST. Maybe we could binary search? */
1082 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1083 --id;
1084
2da42bc0 1085 if (id < 0 || dest->elems[id] != src1->elems[i1])
eb04c213 1086 dest->elems[--sbase] = src1->elems[i1];
59e7ebcc
UD
1087
1088 if (--i1 < 0 || --i2 < 0)
1089 break;
1b2c2628 1090 }
59e7ebcc
UD
1091
1092 /* Lower the highest of the two items. */
1093 else if (src1->elems[i1] < src2->elems[i2])
1b2c2628 1094 {
59e7ebcc
UD
1095 if (--i2 < 0)
1096 break;
1097 }
1098 else
1099 {
1100 if (--i1 < 0)
1101 break;
1b2c2628 1102 }
3b0bdc72 1103 }
59e7ebcc
UD
1104
1105 id = dest->nelem - 1;
1106 is = dest->nelem + src1->nelem + src2->nelem - 1;
1107 delta = is - sbase + 1;
1108
1109 /* Now copy. When DELTA becomes zero, the remaining
1110 DEST elements are already in place; this is more or
1111 less the same loop that is in re_node_set_merge. */
1112 dest->nelem += delta;
1113 if (delta > 0 && id >= 0)
1114 for (;;)
1115 {
2da42bc0
UD
1116 if (dest->elems[is] > dest->elems[id])
1117 {
1118 /* Copy from the top. */
1119 dest->elems[id + delta--] = dest->elems[is--];
1120 if (delta == 0)
1121 break;
1122 }
1123 else
1124 {
1125 /* Slide from the bottom. */
1126 dest->elems[id + delta] = dest->elems[id];
1127 if (--id < 0)
1128 break;
1129 }
59e7ebcc
UD
1130 }
1131
1132 /* Copy remaining SRC elements. */
eb04c213 1133 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
59e7ebcc 1134
3b0bdc72
UD
1135 return REG_NOERROR;
1136}
1137
a9388965
UD
1138/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1139 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1140
3b0bdc72 1141static reg_errcode_t
b41bd5bc 1142__attribute_warn_unused_result__
e2f55264
UD
1143re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1144 const re_node_set *src2)
3b0bdc72 1145{
eb04c213 1146 Idx i1, i2, id;
3b0bdc72
UD
1147 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1148 {
1149 dest->alloc = src1->nelem + src2->nelem;
eb04c213 1150 dest->elems = re_malloc (Idx, dest->alloc);
bc15410e 1151 if (BE (dest->elems == NULL, 0))
1b2c2628 1152 return REG_ESPACE;
3b0bdc72
UD
1153 }
1154 else
1155 {
1156 if (src1 != NULL && src1->nelem > 0)
1b2c2628 1157 return re_node_set_init_copy (dest, src1);
3b0bdc72 1158 else if (src2 != NULL && src2->nelem > 0)
1b2c2628 1159 return re_node_set_init_copy (dest, src2);
3b0bdc72 1160 else
1b2c2628 1161 re_node_set_init_empty (dest);
3b0bdc72
UD
1162 return REG_NOERROR;
1163 }
1164 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1165 {
1166 if (src1->elems[i1] > src2->elems[i2])
1b2c2628
UD
1167 {
1168 dest->elems[id++] = src2->elems[i2++];
1169 continue;
1170 }
3b0bdc72 1171 if (src1->elems[i1] == src2->elems[i2])
1b2c2628 1172 ++i2;
3b0bdc72
UD
1173 dest->elems[id++] = src1->elems[i1++];
1174 }
1175 if (i1 < src1->nelem)
1176 {
1177 memcpy (dest->elems + id, src1->elems + i1,
eb04c213 1178 (src1->nelem - i1) * sizeof (Idx));
3b0bdc72
UD
1179 id += src1->nelem - i1;
1180 }
1181 else if (i2 < src2->nelem)
1182 {
1183 memcpy (dest->elems + id, src2->elems + i2,
eb04c213 1184 (src2->nelem - i2) * sizeof (Idx));
3b0bdc72
UD
1185 id += src2->nelem - i2;
1186 }
1187 dest->nelem = id;
1188 return REG_NOERROR;
1189}
1190
a9388965
UD
1191/* Calculate the union set of the sets DEST and SRC. And store it to
1192 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1193
3b0bdc72 1194static reg_errcode_t
b41bd5bc 1195__attribute_warn_unused_result__
e2f55264 1196re_node_set_merge (re_node_set *dest, const re_node_set *src)
3b0bdc72 1197{
eb04c213 1198 Idx is, id, sbase, delta;
3b0bdc72
UD
1199 if (src == NULL || src->nelem == 0)
1200 return REG_NOERROR;
59e7ebcc 1201 if (dest->alloc < 2 * src->nelem + dest->nelem)
3b0bdc72 1202 {
eb04c213
AZ
1203 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1204 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1b2c2628
UD
1205 if (BE (new_buffer == NULL, 0))
1206 return REG_ESPACE;
1207 dest->elems = new_buffer;
951d6408 1208 dest->alloc = new_alloc;
3b0bdc72
UD
1209 }
1210
59e7ebcc 1211 if (BE (dest->nelem == 0, 0))
3b0bdc72 1212 {
59e7ebcc 1213 dest->nelem = src->nelem;
eb04c213 1214 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
59e7ebcc
UD
1215 return REG_NOERROR;
1216 }
3b0bdc72 1217
59e7ebcc
UD
1218 /* Copy into the top of DEST the items of SRC that are not
1219 found in DEST. Maybe we could binary search in DEST? */
1220 for (sbase = dest->nelem + 2 * src->nelem,
1221 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1222 {
1223 if (dest->elems[id] == src->elems[is])
2da42bc0 1224 is--, id--;
59e7ebcc 1225 else if (dest->elems[id] < src->elems[is])
2da42bc0 1226 dest->elems[--sbase] = src->elems[is--];
59e7ebcc 1227 else /* if (dest->elems[id] > src->elems[is]) */
2da42bc0 1228 --id;
59e7ebcc 1229 }
3b0bdc72 1230
59e7ebcc
UD
1231 if (is >= 0)
1232 {
1233 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1234 sbase -= is + 1;
eb04c213 1235 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
3b0bdc72
UD
1236 }
1237
59e7ebcc
UD
1238 id = dest->nelem - 1;
1239 is = dest->nelem + 2 * src->nelem - 1;
1240 delta = is - sbase + 1;
1241 if (delta == 0)
1242 return REG_NOERROR;
1243
1244 /* Now copy. When DELTA becomes zero, the remaining
1245 DEST elements are already in place. */
1246 dest->nelem += delta;
1247 for (;;)
3b0bdc72 1248 {
59e7ebcc 1249 if (dest->elems[is] > dest->elems[id])
2da42bc0 1250 {
59e7ebcc 1251 /* Copy from the top. */
2da42bc0 1252 dest->elems[id + delta--] = dest->elems[is--];
59e7ebcc
UD
1253 if (delta == 0)
1254 break;
1255 }
1256 else
2da42bc0
UD
1257 {
1258 /* Slide from the bottom. */
1259 dest->elems[id + delta] = dest->elems[id];
59e7ebcc
UD
1260 if (--id < 0)
1261 {
1262 /* Copy remaining SRC elements. */
1263 memcpy (dest->elems, dest->elems + sbase,
eb04c213 1264 delta * sizeof (Idx));
59e7ebcc
UD
1265 break;
1266 }
1267 }
3b0bdc72 1268 }
59e7ebcc 1269
3b0bdc72
UD
1270 return REG_NOERROR;
1271}
1272
1273/* Insert the new element ELEM to the re_node_set* SET.
8503c987 1274 SET should not already have ELEM.
eb04c213 1275 Return true if successful. */
3b0bdc72 1276
eb04c213 1277static bool
b41bd5bc 1278__attribute_warn_unused_result__
eb04c213 1279re_node_set_insert (re_node_set *set, Idx elem)
3b0bdc72 1280{
eb04c213 1281 Idx idx;
8503c987
UD
1282 /* In case the set is empty. */
1283 if (set->alloc == 0)
eb04c213 1284 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
3b0bdc72 1285
8503c987 1286 if (BE (set->nelem, 0) == 0)
3b0bdc72 1287 {
8503c987
UD
1288 /* We already guaranteed above that set->alloc != 0. */
1289 set->elems[0] = elem;
1290 ++set->nelem;
eb04c213 1291 return true;
3b0bdc72
UD
1292 }
1293
1294 /* Realloc if we need. */
8503c987 1295 if (set->alloc == set->nelem)
3b0bdc72 1296 {
eb04c213 1297 Idx *new_elems;
3b0bdc72 1298 set->alloc = set->alloc * 2;
eb04c213 1299 new_elems = re_realloc (set->elems, Idx, set->alloc);
2d87db5b 1300 if (BE (new_elems == NULL, 0))
eb04c213 1301 return false;
2d87db5b 1302 set->elems = new_elems;
3b0bdc72 1303 }
8503c987
UD
1304
1305 /* Move the elements which follows the new element. Test the
1306 first element separately to skip a check in the inner loop. */
1307 if (elem < set->elems[0])
1308 {
1309 idx = 0;
59e7ebcc 1310 for (idx = set->nelem; idx > 0; idx--)
2da42bc0 1311 set->elems[idx] = set->elems[idx - 1];
8503c987 1312 }
3b0bdc72
UD
1313 else
1314 {
8503c987 1315 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
2da42bc0 1316 set->elems[idx] = set->elems[idx - 1];
3b0bdc72 1317 }
8503c987 1318
3b0bdc72
UD
1319 /* Insert the new element. */
1320 set->elems[idx] = elem;
1321 ++set->nelem;
eb04c213 1322 return true;
3b0bdc72
UD
1323}
1324
20dc2f79
UD
1325/* Insert the new element ELEM to the re_node_set* SET.
1326 SET should not already have any element greater than or equal to ELEM.
eb04c213 1327 Return true if successful. */
20dc2f79 1328
eb04c213 1329static bool
b41bd5bc 1330__attribute_warn_unused_result__
eb04c213 1331re_node_set_insert_last (re_node_set *set, Idx elem)
20dc2f79
UD
1332{
1333 /* Realloc if we need. */
1334 if (set->alloc == set->nelem)
1335 {
eb04c213 1336 Idx *new_elems;
20dc2f79 1337 set->alloc = (set->alloc + 1) * 2;
eb04c213 1338 new_elems = re_realloc (set->elems, Idx, set->alloc);
2d87db5b 1339 if (BE (new_elems == NULL, 0))
eb04c213 1340 return false;
2d87db5b 1341 set->elems = new_elems;
20dc2f79
UD
1342 }
1343
1344 /* Insert the new element. */
1345 set->elems[set->nelem++] = elem;
eb04c213 1346 return true;
20dc2f79
UD
1347}
1348
3b0bdc72 1349/* Compare two node sets SET1 and SET2.
eb04c213 1350 Return true if SET1 and SET2 are equivalent. */
3b0bdc72 1351
eb04c213
AZ
1352static bool
1353__attribute__ ((pure))
e2f55264 1354re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
3b0bdc72 1355{
eb04c213 1356 Idx i;
3b0bdc72 1357 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
eb04c213 1358 return false;
59e7ebcc 1359 for (i = set1->nelem ; --i >= 0 ; )
3b0bdc72 1360 if (set1->elems[i] != set2->elems[i])
eb04c213
AZ
1361 return false;
1362 return true;
3b0bdc72
UD
1363}
1364
0742e48e 1365/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
3b0bdc72 1366
eb04c213
AZ
1367static Idx
1368__attribute__ ((pure))
1369re_node_set_contains (const re_node_set *set, Idx elem)
3b0bdc72 1370{
eb04c213 1371 __re_size_t idx, right, mid;
3b0bdc72
UD
1372 if (set->nelem <= 0)
1373 return 0;
1374
1375 /* Binary search the element. */
1376 idx = 0;
1377 right = set->nelem - 1;
1378 while (idx < right)
1379 {
1380 mid = (idx + right) / 2;
1381 if (set->elems[mid] < elem)
1b2c2628 1382 idx = mid + 1;
3b0bdc72 1383 else
1b2c2628 1384 right = mid;
3b0bdc72 1385 }
0742e48e 1386 return set->elems[idx] == elem ? idx + 1 : 0;
3b0bdc72
UD
1387}
1388
1389static void
eb04c213 1390re_node_set_remove_at (re_node_set *set, Idx idx)
3b0bdc72
UD
1391{
1392 if (idx < 0 || idx >= set->nelem)
1393 return;
3b0bdc72 1394 --set->nelem;
59e7ebcc
UD
1395 for (; idx < set->nelem; idx++)
1396 set->elems[idx] = set->elems[idx + 1];
3b0bdc72
UD
1397}
1398\f
1399
1400/* Add the token TOKEN to dfa->nodes, and return the index of the token.
eb04c213 1401 Or return -1 if an error occurred. */
3b0bdc72 1402
eb04c213 1403static Idx
e2f55264 1404re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
3b0bdc72 1405{
18e3dc56 1406 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
3b0bdc72 1407 {
01ed6ceb 1408 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
eb04c213 1409 Idx *new_nexts, *new_indices;
963d8d78 1410 re_node_set *new_edests, *new_eclosures;
01ed6ceb
UD
1411 re_token_t *new_nodes;
1412
22364644
UD
1413 /* Avoid overflows in realloc. */
1414 const size_t max_object_size = MAX (sizeof (re_token_t),
1415 MAX (sizeof (re_node_set),
eb04c213
AZ
1416 sizeof (Idx)));
1417 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
01ed6ceb 1418 return -1;
02f3550c 1419
01ed6ceb 1420 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
2d87db5b 1421 if (BE (new_nodes == NULL, 0))
0ecb606c 1422 return -1;
2d87db5b 1423 dfa->nodes = new_nodes;
eb04c213
AZ
1424 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
02f3550c
UD
1426 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
02f3550c 1428 if (BE (new_nexts == NULL || new_indices == NULL
963d8d78 1429 || new_edests == NULL || new_eclosures == NULL, 0))
eb04c213
AZ
1430 {
1431 re_free (new_nexts);
1432 re_free (new_indices);
1433 re_free (new_edests);
1434 re_free (new_eclosures);
1435 return -1;
1436 }
02f3550c
UD
1437 dfa->nexts = new_nexts;
1438 dfa->org_indices = new_indices;
1439 dfa->edests = new_edests;
1440 dfa->eclosures = new_eclosures;
951d6408 1441 dfa->nodes_alloc = new_nodes_alloc;
3b0bdc72
UD
1442 }
1443 dfa->nodes[dfa->nodes_len] = token;
485d775d 1444 dfa->nodes[dfa->nodes_len].constraint = 0;
02f3550c
UD
1445#ifdef RE_ENABLE_I18N
1446 dfa->nodes[dfa->nodes_len].accept_mb =
eb04c213
AZ
1447 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448 || token.type == COMPLEX_BRACKET);
02f3550c
UD
1449#endif
1450 dfa->nexts[dfa->nodes_len] = -1;
1451 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
3b0bdc72
UD
1453 return dfa->nodes_len++;
1454}
1455
eb04c213 1456static re_hashval_t
e2f55264 1457calc_state_hash (const re_node_set *nodes, unsigned int context)
3b0bdc72 1458{
eb04c213
AZ
1459 re_hashval_t hash = nodes->nelem + context;
1460 Idx i;
3b0bdc72
UD
1461 for (i = 0 ; i < nodes->nelem ; i++)
1462 hash += nodes->elems[i];
1463 return hash;
1464}
1465
1466/* Search for the state whose node_set is equivalent to NODES.
1467 Return the pointer to the state, if we found it in the DFA.
a9388965
UD
1468 Otherwise create the new one and return it. In case of an error
1469 return NULL and set the error code in ERR.
1470 Note: - We assume NULL as the invalid state, then it is possible that
1b2c2628
UD
1471 return value is NULL and ERR is REG_NOERROR.
1472 - We never return non-NULL value in case of any errors, it is for
1473 optimization. */
a9388965 1474
e2f55264 1475static re_dfastate_t *
b41bd5bc 1476__attribute_warn_unused_result__
e2f55264
UD
1477re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478 const re_node_set *nodes)
3b0bdc72 1479{
eb04c213 1480 re_hashval_t hash;
a9388965 1481 re_dfastate_t *new_state;
3b0bdc72 1482 struct re_state_table_entry *spot;
eb04c213
AZ
1483 Idx i;
1484#if defined GCC_LINT || defined lint
1485 /* Suppress bogus uninitialized-variable warnings. */
1486 *err = REG_NOERROR;
1487#endif
bc15410e 1488 if (BE (nodes->nelem == 0, 0))
a9388965
UD
1489 {
1490 *err = REG_NOERROR;
1491 return NULL;
1492 }
3b0bdc72
UD
1493 hash = calc_state_hash (nodes, 0);
1494 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
bc15410e 1496 for (i = 0 ; i < spot->num ; i++)
3b0bdc72 1497 {
bc15410e
UD
1498 re_dfastate_t *state = spot->array[i];
1499 if (hash != state->hash)
1b2c2628 1500 continue;
bc15410e 1501 if (re_node_set_compare (&state->nodes, nodes))
1b2c2628 1502 return state;
3b0bdc72 1503 }
3b0bdc72
UD
1504
1505 /* There are no appropriate state in the dfa, create the new one. */
a9388965 1506 new_state = create_ci_newstate (dfa, nodes, hash);
2d87db5b
UD
1507 if (BE (new_state == NULL, 0))
1508 *err = REG_ESPACE;
1509
1510 return new_state;
3b0bdc72
UD
1511}
1512
1513/* Search for the state whose node_set is equivalent to NODES and
1514 whose context is equivalent to CONTEXT.
1515 Return the pointer to the state, if we found it in the DFA.
a9388965
UD
1516 Otherwise create the new one and return it. In case of an error
1517 return NULL and set the error code in ERR.
1518 Note: - We assume NULL as the invalid state, then it is possible that
1b2c2628
UD
1519 return value is NULL and ERR is REG_NOERROR.
1520 - We never return non-NULL value in case of any errors, it is for
1521 optimization. */
a9388965 1522
db26cb75 1523static re_dfastate_t *
b41bd5bc 1524__attribute_warn_unused_result__
e2f55264
UD
1525re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526 const re_node_set *nodes, unsigned int context)
3b0bdc72 1527{
eb04c213 1528 re_hashval_t hash;
a9388965 1529 re_dfastate_t *new_state;
3b0bdc72 1530 struct re_state_table_entry *spot;
eb04c213
AZ
1531 Idx i;
1532#if defined GCC_LINT || defined lint
1533 /* Suppress bogus uninitialized-variable warnings. */
1534 *err = REG_NOERROR;
1535#endif
3b0bdc72 1536 if (nodes->nelem == 0)
a9388965
UD
1537 {
1538 *err = REG_NOERROR;
1539 return NULL;
1540 }
3b0bdc72
UD
1541 hash = calc_state_hash (nodes, context);
1542 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
bc15410e 1544 for (i = 0 ; i < spot->num ; i++)
3b0bdc72 1545 {
bc15410e 1546 re_dfastate_t *state = spot->array[i];
a0a8461c
UD
1547 if (state->hash == hash
1548 && state->context == context
1549 && re_node_set_compare (state->entrance_nodes, nodes))
1b2c2628 1550 return state;
3b0bdc72 1551 }
eb04c213 1552 /* There are no appropriate state in 'dfa', create the new one. */
a9388965 1553 new_state = create_cd_newstate (dfa, nodes, context, hash);
2d87db5b
UD
1554 if (BE (new_state == NULL, 0))
1555 *err = REG_ESPACE;
1556
1557 return new_state;
3b0bdc72
UD
1558}
1559
5cf1ec52
UD
1560/* Finish initialization of the new state NEWSTATE, and using its hash value
1561 HASH put in the appropriate bucket of DFA's state table. Return value
1562 indicates the error code if failed. */
a9388965 1563
5cf1ec52 1564static reg_errcode_t
2da42bc0 1565__attribute_warn_unused_result__
e2f55264 1566register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
eb04c213 1567 re_hashval_t hash)
3b0bdc72 1568{
5cf1ec52 1569 struct re_state_table_entry *spot;
1b2c2628 1570 reg_errcode_t err;
eb04c213 1571 Idx i;
5cf1ec52
UD
1572
1573 newstate->hash = hash;
1574 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1b2c2628 1575 if (BE (err != REG_NOERROR, 0))
5cf1ec52
UD
1576 return REG_ESPACE;
1577 for (i = 0; i < newstate->nodes.nelem; i++)
1b2c2628 1578 {
eb04c213 1579 Idx elem = newstate->nodes.elems[i];
5cf1ec52 1580 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
eb04c213 1581 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
2da42bc0 1582 return REG_ESPACE;
1b2c2628 1583 }
3b0bdc72 1584
3b0bdc72 1585 spot = dfa->state_table + (hash & dfa->state_hash_mask);
951d6408 1586 if (BE (spot->alloc <= spot->num, 0))
3b0bdc72 1587 {
eb04c213 1588 Idx new_alloc = 2 * spot->num + 2;
951d6408
UD
1589 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590 new_alloc);
1b2c2628
UD
1591 if (BE (new_array == NULL, 0))
1592 return REG_ESPACE;
1593 spot->array = new_array;
951d6408 1594 spot->alloc = new_alloc;
3b0bdc72 1595 }
bc15410e 1596 spot->array[spot->num++] = newstate;
a9388965 1597 return REG_NOERROR;
3b0bdc72
UD
1598}
1599
db26cb75
UD
1600static void
1601free_state (re_dfastate_t *state)
1602{
1603 re_node_set_free (&state->non_eps_nodes);
1604 re_node_set_free (&state->inveclosure);
1605 if (state->entrance_nodes != &state->nodes)
1606 {
1607 re_node_set_free (state->entrance_nodes);
1608 re_free (state->entrance_nodes);
1609 }
1610 re_node_set_free (&state->nodes);
1611 re_free (state->word_trtable);
1612 re_free (state->trtable);
1613 re_free (state);
1614}
1615
5069ff32 1616/* Create the new state which is independent of contexts.
a9388965
UD
1617 Return the new state if succeeded, otherwise return NULL. */
1618
3b0bdc72 1619static re_dfastate_t *
b41bd5bc 1620__attribute_warn_unused_result__
e2f55264 1621create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
eb04c213 1622 re_hashval_t hash)
3b0bdc72 1623{
eb04c213 1624 Idx i;
a9388965 1625 reg_errcode_t err;
3b0bdc72 1626 re_dfastate_t *newstate;
5cf1ec52
UD
1627
1628 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
bc15410e 1629 if (BE (newstate == NULL, 0))
a9388965 1630 return NULL;
5cf1ec52
UD
1631 err = re_node_set_init_copy (&newstate->nodes, nodes);
1632 if (BE (err != REG_NOERROR, 0))
1633 {
1634 re_free (newstate);
1635 return NULL;
1636 }
3b0bdc72 1637
5cf1ec52 1638 newstate->entrance_nodes = &newstate->nodes;
3b0bdc72
UD
1639 for (i = 0 ; i < nodes->nelem ; i++)
1640 {
1641 re_token_t *node = dfa->nodes + nodes->elems[i];
1642 re_token_type_t type = node->type;
485d775d 1643 if (type == CHARACTER && !node->constraint)
1b2c2628 1644 continue;
02f3550c
UD
1645#ifdef RE_ENABLE_I18N
1646 newstate->accept_mb |= node->accept_mb;
1647#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1648
1649 /* If the state has the halt node, the state is a halt state. */
02f3550c 1650 if (type == END_OF_RE)
1b2c2628 1651 newstate->halt = 1;
3b0bdc72 1652 else if (type == OP_BACK_REF)
1b2c2628 1653 newstate->has_backref = 1;
485d775d 1654 else if (type == ANCHOR || node->constraint)
1b2c2628 1655 newstate->has_constraint = 1;
3b0bdc72 1656 }
a9388965 1657 err = register_state (dfa, newstate, hash);
1b2c2628
UD
1658 if (BE (err != REG_NOERROR, 0))
1659 {
1660 free_state (newstate);
1661 newstate = NULL;
1662 }
1663 return newstate;
3b0bdc72
UD
1664}
1665
a9388965
UD
1666/* Create the new state which is depend on the context CONTEXT.
1667 Return the new state if succeeded, otherwise return NULL. */
1668
3b0bdc72 1669static re_dfastate_t *
b41bd5bc 1670__attribute_warn_unused_result__
e2f55264 1671create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
eb04c213 1672 unsigned int context, re_hashval_t hash)
3b0bdc72 1673{
eb04c213 1674 Idx i, nctx_nodes = 0;
a9388965 1675 reg_errcode_t err;
3b0bdc72
UD
1676 re_dfastate_t *newstate;
1677
5cf1ec52 1678 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
bc15410e 1679 if (BE (newstate == NULL, 0))
a9388965 1680 return NULL;
5cf1ec52
UD
1681 err = re_node_set_init_copy (&newstate->nodes, nodes);
1682 if (BE (err != REG_NOERROR, 0))
1683 {
1684 re_free (newstate);
1685 return NULL;
1686 }
1687
3b0bdc72
UD
1688 newstate->context = context;
1689 newstate->entrance_nodes = &newstate->nodes;
1690
1691 for (i = 0 ; i < nodes->nelem ; i++)
1692 {
3b0bdc72
UD
1693 re_token_t *node = dfa->nodes + nodes->elems[i];
1694 re_token_type_t type = node->type;
0caca71a 1695 unsigned int constraint = node->constraint;
3b0bdc72 1696
485d775d 1697 if (type == CHARACTER && !constraint)
1b2c2628 1698 continue;
a334319f 1699#ifdef RE_ENABLE_I18N
02f3550c 1700 newstate->accept_mb |= node->accept_mb;
a334319f 1701#endif /* RE_ENABLE_I18N */
02f3550c
UD
1702
1703 /* If the state has the halt node, the state is a halt state. */
1704 if (type == END_OF_RE)
1705 newstate->halt = 1;
3b0bdc72 1706 else if (type == OP_BACK_REF)
1b2c2628 1707 newstate->has_backref = 1;
3b0bdc72
UD
1708
1709 if (constraint)
1b2c2628
UD
1710 {
1711 if (newstate->entrance_nodes == &newstate->nodes)
1712 {
1713 newstate->entrance_nodes = re_malloc (re_node_set, 1);
bc15410e 1714 if (BE (newstate->entrance_nodes == NULL, 0))
1b2c2628
UD
1715 {
1716 free_state (newstate);
1717 return NULL;
1718 }
2da42bc0
UD
1719 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1720 != REG_NOERROR)
1721 return NULL;
1b2c2628
UD
1722 nctx_nodes = 0;
1723 newstate->has_constraint = 1;
1724 }
1725
1726 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1727 {
1728 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1729 ++nctx_nodes;
1730 }
1731 }
3b0bdc72 1732 }
a9388965 1733 err = register_state (dfa, newstate, hash);
1b2c2628
UD
1734 if (BE (err != REG_NOERROR, 0))
1735 {
1736 free_state (newstate);
1737 newstate = NULL;
1738 }
1739 return newstate;
1740}