]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regex_internal.c
Use --enable-obsolete in build-many-glibcs.py for nios2-linux-gnu
[thirdparty/glibc.git] / posix / regex_internal.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
dff8da6b 2 Copyright (C) 2002-2024 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6 17 License along with the GNU C Library; if not, see
eb04c213 18 <https://www.gnu.org/licenses/>. */
3b0bdc72 19
eb04c213 20static void re_string_construct_common (const char *str, Idx len,
1b2c2628 21 re_string_t *pstr,
eb04c213 22 RE_TRANSLATE_TYPE trans, bool icase,
b41bd5bc 23 const re_dfa_t *dfa);
01ed6ceb 24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
1b2c2628 25 const re_node_set *nodes,
eb04c213 26 re_hashval_t hash);
01ed6ceb 27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
1b2c2628
UD
28 const re_node_set *nodes,
29 unsigned int context,
eb04c213
AZ
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33#ifdef RE_ENABLE_I18N
34static void build_wcs_buffer (re_string_t *pstr);
35static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36#endif /* RE_ENABLE_I18N */
37static void build_upper_buffer (re_string_t *pstr);
38static void re_string_translate_buffer (re_string_t *pstr);
39static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
3b0bdc72
UD
41\f
42/* Functions for string operation. */
43
612546c6
UD
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
3b0bdc72 47static reg_errcode_t
b41bd5bc 48__attribute_warn_unused_result__
eb04c213
AZ
49re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
3b0bdc72
UD
51{
52 reg_errcode_t ret;
eb04c213 53 Idx init_buf_len;
97fd3a30
UD
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
f0c7c524 59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
612546c6
UD
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
f4efbdfb 62 if (__glibc_unlikely (ret != REG_NOERROR))
612546c6
UD
63 return ret;
64
56b168be
UD
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
37369d1c
UD
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
612546c6
UD
70 return REG_NOERROR;
71}
72
73/* This function allocate the buffers, and initialize them. */
74
75static reg_errcode_t
b41bd5bc 76__attribute_warn_unused_result__
eb04c213
AZ
77re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
612546c6
UD
79{
80 reg_errcode_t ret;
56b168be 81 memset (pstr, '\0', sizeof (re_string_t));
f0c7c524 82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
612546c6
UD
83
84 if (len > 0)
3b0bdc72 85 {
612546c6 86 ret = re_string_realloc_buffers (pstr, len + 1);
f4efbdfb 87 if (__glibc_unlikely (ret != REG_NOERROR))
1b2c2628 88 return ret;
3b0bdc72 89 }
37369d1c 90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
612546c6
UD
91
92 if (icase)
93 {
94#ifdef RE_ENABLE_I18N
f0c7c524 95 if (dfa->mb_cur_max > 1)
37369d1c
UD
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
f4efbdfb 100 if (__glibc_unlikely (ret != REG_NOERROR))
37369d1c
UD
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
f4efbdfb 107 if (__glibc_unlikely (ret != REG_NOERROR))
37369d1c
UD
108 return ret;
109 }
110 }
612546c6 111 else
3b0bdc72 112#endif /* RE_ENABLE_I18N */
1b2c2628 113 build_upper_buffer (pstr);
612546c6
UD
114 }
115 else
3b0bdc72 116 {
612546c6 117#ifdef RE_ENABLE_I18N
f0c7c524 118 if (dfa->mb_cur_max > 1)
1b2c2628 119 build_wcs_buffer (pstr);
612546c6
UD
120 else
121#endif /* RE_ENABLE_I18N */
1b2c2628
UD
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
37369d1c
UD
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
1b2c2628 130 }
3b0bdc72 131 }
612546c6 132
3b0bdc72
UD
133 return REG_NOERROR;
134}
135
612546c6 136/* Helper functions for re_string_allocate, and re_string_construct. */
3b0bdc72
UD
137
138static reg_errcode_t
b41bd5bc 139__attribute_warn_unused_result__
eb04c213 140re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
3b0bdc72 141{
3b0bdc72 142#ifdef RE_ENABLE_I18N
3c0fb574 143 if (pstr->mb_cur_max > 1)
3b0bdc72 144 {
54dd0ab3
UD
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
eb04c213 148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
f4efbdfb
PE
149 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150 < new_buf_len))
54dd0ab3
UD
151 return REG_ESPACE;
152
153 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
f4efbdfb 154 if (__glibc_unlikely (new_wcs == NULL))
1b2c2628 155 return REG_ESPACE;
2d87db5b 156 pstr->wcs = new_wcs;
37369d1c
UD
157 if (pstr->offsets != NULL)
158 {
eb04c213 159 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
f4efbdfb 160 if (__glibc_unlikely (new_offsets == NULL))
37369d1c 161 return REG_ESPACE;
2d87db5b 162 pstr->offsets = new_offsets;
37369d1c 163 }
3b0bdc72 164 }
3b0bdc72 165#endif /* RE_ENABLE_I18N */
37369d1c 166 if (pstr->mbs_allocated)
3b0bdc72 167 {
2d87db5b
UD
168 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
169 new_buf_len);
f4efbdfb 170 if (__glibc_unlikely (new_mbs == NULL))
1b2c2628 171 return REG_ESPACE;
2d87db5b 172 pstr->mbs = new_mbs;
3b0bdc72 173 }
612546c6 174 pstr->bufs_len = new_buf_len;
3b0bdc72
UD
175 return REG_NOERROR;
176}
177
612546c6 178
3b0bdc72 179static void
eb04c213
AZ
180re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
181 RE_TRANSLATE_TYPE trans, bool icase,
e2f55264 182 const re_dfa_t *dfa)
3b0bdc72 183{
c202c2c5 184 pstr->raw_mbs = (const unsigned char *) str;
3b0bdc72 185 pstr->len = len;
37369d1c 186 pstr->raw_len = len;
997470b3 187 pstr->trans = trans;
eb04c213 188 pstr->icase = icase;
37369d1c 189 pstr->mbs_allocated = (trans != NULL || icase);
f0c7c524
UD
190 pstr->mb_cur_max = dfa->mb_cur_max;
191 pstr->is_utf8 = dfa->is_utf8;
192 pstr->map_notascii = dfa->map_notascii;
37369d1c
UD
193 pstr->stop = pstr->len;
194 pstr->raw_stop = pstr->stop;
3b0bdc72
UD
195}
196
197#ifdef RE_ENABLE_I18N
198
612546c6 199/* Build wide character buffer PSTR->WCS.
3b0bdc72
UD
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
612546c6 205 a first byte of a multibyte character.
3b0bdc72 206
612546c6
UD
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
209
210static void
e2f55264 211build_wcs_buffer (re_string_t *pstr)
3b0bdc72 212{
37369d1c 213#ifdef _LIBC
ec73fd87 214 unsigned char buf[MB_LEN_MAX];
2a0356e1 215 DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
37369d1c
UD
216#else
217 unsigned char buf[64];
218#endif
612546c6 219 mbstate_t prev_st;
eb04c213 220 Idx byte_idx, end_idx, remain_len;
88764ae2 221 size_t mbclen;
37369d1c 222
612546c6
UD
223 /* Build the buffers from pstr->valid_len to either pstr->len or
224 pstr->bufs_len. */
37369d1c 225 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
612546c6
UD
226 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227 {
228 wchar_t wc;
37369d1c
UD
229 const char *p;
230
612546c6
UD
231 remain_len = end_idx - byte_idx;
232 prev_st = pstr->cur_state;
37369d1c 233 /* Apply the translation if we need. */
f4efbdfb 234 if (__glibc_unlikely (pstr->trans != NULL))
37369d1c
UD
235 {
236 int i, ch;
237
238 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 {
240 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
d40eb37a 241 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
37369d1c
UD
242 }
243 p = (const char *) buf;
244 }
245 else
246 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
b3918c7d 247 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
f4efbdfb
PE
248 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249 || (mbclen == (size_t) -2
250 && pstr->bufs_len >= pstr->len)))
1b2c2628
UD
251 {
252 /* We treat these cases as a singlebyte character. */
253 mbclen = 1;
254 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
f4efbdfb 255 if (__glibc_unlikely (pstr->trans != NULL))
37369d1c 256 wc = pstr->trans[wc];
1b2c2628
UD
257 pstr->cur_state = prev_st;
258 }
f4efbdfb 259 else if (__glibc_unlikely (mbclen == (size_t) -2))
8887a920
UD
260 {
261 /* The buffer doesn't have enough space, finish to build. */
262 pstr->cur_state = prev_st;
263 break;
264 }
612546c6 265
3b0bdc72 266 /* Write wide character and padding. */
612546c6
UD
267 pstr->wcs[byte_idx++] = wc;
268 /* Write paddings. */
269 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1b2c2628 270 pstr->wcs[byte_idx++] = WEOF;
3b0bdc72 271 }
612546c6 272 pstr->valid_len = byte_idx;
37369d1c 273 pstr->valid_raw_len = byte_idx;
3b0bdc72
UD
274}
275
612546c6
UD
276/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
278
4f7e7f8e 279static reg_errcode_t
b41bd5bc 280__attribute_warn_unused_result__
e2f55264 281build_wcs_upper_buffer (re_string_t *pstr)
3b0bdc72 282{
612546c6 283 mbstate_t prev_st;
eb04c213 284 Idx src_idx, byte_idx, end_idx, remain_len;
88764ae2 285 size_t mbclen;
37369d1c 286#ifdef _LIBC
ec73fd87 287 char buf[MB_LEN_MAX];
2a0356e1 288 DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
37369d1c 289#else
1c99f950 290 char buf[64];
37369d1c
UD
291#endif
292
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
f0c7c524 295
e40a38b3
UD
296 /* The following optimization assumes that ASCII characters can be
297 mapped to wide characters with a simple cast. */
37369d1c
UD
298 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299 {
300 while (byte_idx < end_idx)
1b2c2628 301 {
f0c7c524 302 wchar_t wc;
c2a150d0 303 unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
37369d1c 304
c2a150d0 305 if (isascii (ch) && mbsinit (&pstr->cur_state))
37369d1c 306 {
37369d1c 307 /* The next step uses the assumption that wchar_t is encoded
e40a38b3 308 ASCII-safe: all ASCII values can be converted like this. */
c2a150d0
AZ
309 wchar_t wcu = __towupper (ch);
310 if (isascii (wcu))
311 {
312 pstr->mbs[byte_idx] = wcu;
313 pstr->wcs[byte_idx] = wcu;
314 byte_idx++;
315 continue;
316 }
37369d1c
UD
317 }
318
f0c7c524
UD
319 remain_len = end_idx - byte_idx;
320 prev_st = pstr->cur_state;
b3918c7d
UD
321 mbclen = __mbrtowc (&wc,
322 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 + byte_idx), remain_len, &pstr->cur_state);
f4efbdfb 324 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
1b2c2628 325 {
eb04c213
AZ
326 wchar_t wcu = __towupper (wc);
327 if (wcu != wc)
37369d1c 328 {
88764ae2 329 size_t mbcdlen;
37369d1c 330
a5f0adb3 331 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
f4efbdfb 332 if (__glibc_likely (mbclen == mbcdlen))
37369d1c
UD
333 memcpy (pstr->mbs + byte_idx, buf, mbclen);
334 else
335 {
336 src_idx = byte_idx;
337 goto offsets_needed;
338 }
339 }
f0c7c524
UD
340 else
341 memcpy (pstr->mbs + byte_idx,
342 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
37369d1c 343 pstr->wcs[byte_idx++] = wcu;
f0c7c524
UD
344 /* Write paddings. */
345 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 pstr->wcs[byte_idx++] = WEOF;
347 }
8887a920
UD
348 else if (mbclen == (size_t) -1 || mbclen == 0
349 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
f0c7c524 350 {
8887a920
UD
351 /* It is an invalid character, an incomplete character
352 at the end of the string, or '\0'. Just use the byte. */
02b50340
UD
353 pstr->mbs[byte_idx] = ch;
354 /* And also cast it to wide char. */
355 pstr->wcs[byte_idx++] = (wchar_t) ch;
f4efbdfb 356 if (__glibc_unlikely (mbclen == (size_t) -1))
f0c7c524 357 pstr->cur_state = prev_st;
1b2c2628 358 }
1b2c2628 359 else
f0c7c524
UD
360 {
361 /* The buffer doesn't have enough space, finish to build. */
362 pstr->cur_state = prev_st;
363 break;
364 }
1b2c2628 365 }
37369d1c
UD
366 pstr->valid_len = byte_idx;
367 pstr->valid_raw_len = byte_idx;
368 return REG_NOERROR;
369 }
f0c7c524 370 else
37369d1c 371 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
f0c7c524
UD
372 {
373 wchar_t wc;
37369d1c 374 const char *p;
e40a38b3 375 offsets_needed:
f0c7c524
UD
376 remain_len = end_idx - byte_idx;
377 prev_st = pstr->cur_state;
f4efbdfb 378 if (__glibc_unlikely (pstr->trans != NULL))
f0c7c524 379 {
37369d1c
UD
380 int i, ch;
381
382 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
f0c7c524 383 {
37369d1c
UD
384 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
385 buf[i] = pstr->trans[ch];
f0c7c524 386 }
37369d1c 387 p = (const char *) buf;
f0c7c524 388 }
37369d1c
UD
389 else
390 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
b3918c7d 391 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
f4efbdfb 392 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
f0c7c524 393 {
eb04c213
AZ
394 wchar_t wcu = __towupper (wc);
395 if (wcu != wc)
37369d1c 396 {
88764ae2 397 size_t mbcdlen;
37369d1c 398
a5f0adb3 399 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
f4efbdfb 400 if (__glibc_likely (mbclen == mbcdlen))
37369d1c 401 memcpy (pstr->mbs + byte_idx, buf, mbclen);
88764ae2 402 else if (mbcdlen != (size_t) -1)
37369d1c 403 {
88764ae2 404 size_t i;
37369d1c
UD
405
406 if (byte_idx + mbcdlen > pstr->bufs_len)
407 {
408 pstr->cur_state = prev_st;
409 break;
410 }
411
412 if (pstr->offsets == NULL)
413 {
eb04c213 414 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
37369d1c
UD
415
416 if (pstr->offsets == NULL)
417 return REG_ESPACE;
418 }
419 if (!pstr->offsets_needed)
420 {
88764ae2 421 for (i = 0; i < (size_t) byte_idx; ++i)
37369d1c
UD
422 pstr->offsets[i] = i;
423 pstr->offsets_needed = 1;
424 }
425
426 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427 pstr->wcs[byte_idx] = wcu;
428 pstr->offsets[byte_idx] = src_idx;
429 for (i = 1; i < mbcdlen; ++i)
430 {
431 pstr->offsets[byte_idx + i]
432 = src_idx + (i < mbclen ? i : mbclen - 1);
433 pstr->wcs[byte_idx + i] = WEOF;
434 }
435 pstr->len += mbcdlen - mbclen;
436 if (pstr->raw_stop > src_idx)
437 pstr->stop += mbcdlen - mbclen;
438 end_idx = (pstr->bufs_len > pstr->len)
439 ? pstr->len : pstr->bufs_len;
440 byte_idx += mbcdlen;
441 src_idx += mbclen;
442 continue;
443 }
2da42bc0
UD
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
37369d1c 446 }
f0c7c524 447 else
37369d1c
UD
448 memcpy (pstr->mbs + byte_idx, p, mbclen);
449
f4efbdfb 450 if (__glibc_unlikely (pstr->offsets_needed != 0))
37369d1c 451 {
88764ae2 452 size_t i;
37369d1c
UD
453 for (i = 0; i < mbclen; ++i)
454 pstr->offsets[byte_idx + i] = src_idx + i;
455 }
456 src_idx += mbclen;
457
458 pstr->wcs[byte_idx++] = wcu;
f0c7c524
UD
459 /* Write paddings. */
460 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
461 pstr->wcs[byte_idx++] = WEOF;
462 }
8887a920
UD
463 else if (mbclen == (size_t) -1 || mbclen == 0
464 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
02b50340 465 {
37369d1c
UD
466 /* It is an invalid character or '\0'. Just use the byte. */
467 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
468
f4efbdfb 469 if (__glibc_unlikely (pstr->trans != NULL))
37369d1c 470 ch = pstr->trans [ch];
02b50340 471 pstr->mbs[byte_idx] = ch;
37369d1c 472
f4efbdfb 473 if (__glibc_unlikely (pstr->offsets_needed != 0))
37369d1c
UD
474 pstr->offsets[byte_idx] = src_idx;
475 ++src_idx;
476
02b50340
UD
477 /* And also cast it to wide char. */
478 pstr->wcs[byte_idx++] = (wchar_t) ch;
f4efbdfb 479 if (__glibc_unlikely (mbclen == (size_t) -1))
02b50340
UD
480 pstr->cur_state = prev_st;
481 }
f0c7c524
UD
482 else
483 {
484 /* The buffer doesn't have enough space, finish to build. */
485 pstr->cur_state = prev_st;
486 break;
487 }
488 }
612546c6 489 pstr->valid_len = byte_idx;
37369d1c
UD
490 pstr->valid_raw_len = src_idx;
491 return REG_NOERROR;
612546c6
UD
492}
493
494/* Skip characters until the index becomes greater than NEW_RAW_IDX.
495 Return the index. */
496
eb04c213
AZ
497static Idx
498re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
612546c6
UD
499{
500 mbstate_t prev_st;
eb04c213 501 Idx rawbuf_idx;
88764ae2 502 size_t mbclen;
4f08104c 503 wint_t wc = WEOF;
612546c6
UD
504
505 /* Skip the characters which are not necessary to check. */
37369d1c 506 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
612546c6
UD
507 rawbuf_idx < new_raw_idx;)
508 {
4f08104c 509 wchar_t wc2;
eb04c213 510 Idx remain_len = pstr->raw_len - rawbuf_idx;
612546c6 511 prev_st = pstr->cur_state;
4f08104c 512 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
b3918c7d 513 remain_len, &pstr->cur_state);
f4efbdfb
PE
514 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
515 || mbclen == 0))
1b2c2628 516 {
33e63e79
UD
517 /* We treat these cases as a single byte character. */
518 if (mbclen == 0 || remain_len == 0)
519 wc = L'\0';
520 else
521 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
1b2c2628
UD
522 mbclen = 1;
523 pstr->cur_state = prev_st;
524 }
4f08104c 525 else
eb04c213 526 wc = wc2;
612546c6
UD
527 /* Then proceed the next character. */
528 rawbuf_idx += mbclen;
529 }
4f08104c 530 *last_wc = wc;
612546c6 531 return rawbuf_idx;
3b0bdc72
UD
532}
533#endif /* RE_ENABLE_I18N */
534
1b2c2628 535/* Build the buffer PSTR->MBS, and apply the translation if we need.
612546c6
UD
536 This function is used in case of REG_ICASE. */
537
538static void
e2f55264 539build_upper_buffer (re_string_t *pstr)
3b0bdc72 540{
eb04c213 541 Idx char_idx, end_idx;
612546c6 542 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
3b0bdc72 543
612546c6 544 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
3b0bdc72 545 {
612546c6 546 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
f4efbdfb 547 if (__glibc_unlikely (pstr->trans != NULL))
37369d1c 548 ch = pstr->trans[ch];
eb04c213 549 pstr->mbs[char_idx] = toupper (ch);
3b0bdc72 550 }
612546c6 551 pstr->valid_len = char_idx;
37369d1c 552 pstr->valid_raw_len = char_idx;
3b0bdc72
UD
553}
554
612546c6 555/* Apply TRANS to the buffer in PSTR. */
3b0bdc72 556
612546c6 557static void
e2f55264 558re_string_translate_buffer (re_string_t *pstr)
3b0bdc72 559{
eb04c213 560 Idx buf_idx, end_idx;
612546c6
UD
561 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562
563 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
3b0bdc72 564 {
612546c6 565 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
37369d1c 566 pstr->mbs[buf_idx] = pstr->trans[ch];
3b0bdc72 567 }
612546c6
UD
568
569 pstr->valid_len = buf_idx;
37369d1c 570 pstr->valid_raw_len = buf_idx;
612546c6
UD
571}
572
573/* This function re-construct the buffers.
3c0fb574 574 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
612546c6
UD
575 convert to upper case in case of REG_ICASE, apply translation. */
576
577static reg_errcode_t
b41bd5bc 578__attribute_warn_unused_result__
eb04c213 579re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
612546c6 580{
eb04c213
AZ
581 Idx offset;
582
f4efbdfb 583 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
eb04c213
AZ
584 offset = idx - pstr->raw_mbs_idx;
585 else
612546c6
UD
586 {
587 /* Reset buffer. */
434d3784 588#ifdef RE_ENABLE_I18N
3c0fb574 589 if (pstr->mb_cur_max > 1)
1b2c2628 590 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
434d3784 591#endif /* RE_ENABLE_I18N */
37369d1c
UD
592 pstr->len = pstr->raw_len;
593 pstr->stop = pstr->raw_stop;
594 pstr->valid_len = 0;
595 pstr->raw_mbs_idx = 0;
596 pstr->valid_raw_len = 0;
597 pstr->offsets_needed = 0;
612546c6 598 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1b2c2628 599 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
37369d1c 600 if (!pstr->mbs_allocated)
1b2c2628 601 pstr->mbs = (unsigned char *) pstr->raw_mbs;
612546c6
UD
602 offset = idx;
603 }
604
f4efbdfb 605 if (__glibc_likely (offset != 0))
612546c6 606 {
ba40cc15 607 /* Should the already checked characters be kept? */
f4efbdfb 608 if (__glibc_likely (offset < pstr->valid_raw_len))
a334319f
UD
609 {
610 /* Yes, move them to the front of the buffer. */
0ecb606c 611#ifdef RE_ENABLE_I18N
f4efbdfb 612 if (__glibc_unlikely (pstr->offsets_needed))
ba40cc15 613 {
eb04c213 614 Idx low = 0, high = pstr->valid_len, mid;
ba40cc15
UD
615 do
616 {
617 mid = (high + low) / 2;
618 if (pstr->offsets[mid] > offset)
619 high = mid;
620 else if (pstr->offsets[mid] < offset)
621 low = mid + 1;
622 else
623 break;
624 }
625 while (low < high);
626 if (pstr->offsets[mid] < offset)
627 ++mid;
628 pstr->tip_context = re_string_context_at (pstr, mid - 1,
629 eflags);
630 /* This can be quite complicated, so handle specially
631 only the common and easy case where the character with
632 different length representation of lower and upper
633 case is present at or after offset. */
634 if (pstr->valid_len > offset
635 && mid == offset && pstr->offsets[mid] == offset)
636 {
637 memmove (pstr->wcs, pstr->wcs + offset,
638 (pstr->valid_len - offset) * sizeof (wint_t));
639 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
640 pstr->valid_len -= offset;
641 pstr->valid_raw_len -= offset;
642 for (low = 0; low < pstr->valid_len; low++)
643 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
644 }
645 else
646 {
647 /* Otherwise, just find out how long the partial multibyte
648 character at offset is and fill it with WEOF/255. */
649 pstr->len = pstr->raw_len - idx + offset;
650 pstr->stop = pstr->raw_stop - idx + offset;
651 pstr->offsets_needed = 0;
652 while (mid > 0 && pstr->offsets[mid - 1] == offset)
653 --mid;
654 while (mid < pstr->valid_len)
655 if (pstr->wcs[mid] != WEOF)
656 break;
657 else
658 ++mid;
659 if (mid == pstr->valid_len)
660 pstr->valid_len = 0;
661 else
662 {
663 pstr->valid_len = pstr->offsets[mid] - offset;
664 if (pstr->valid_len)
665 {
666 for (low = 0; low < pstr->valid_len; ++low)
667 pstr->wcs[low] = WEOF;
668 memset (pstr->mbs, 255, pstr->valid_len);
669 }
670 }
671 pstr->valid_raw_len = pstr->valid_len;
672 }
673 }
674 else
675#endif
676 {
677 pstr->tip_context = re_string_context_at (pstr, offset - 1,
678 eflags);
679#ifdef RE_ENABLE_I18N
680 if (pstr->mb_cur_max > 1)
681 memmove (pstr->wcs, pstr->wcs + offset,
682 (pstr->valid_len - offset) * sizeof (wint_t));
612546c6 683#endif /* RE_ENABLE_I18N */
f4efbdfb 684 if (__glibc_unlikely (pstr->mbs_allocated))
ba40cc15
UD
685 memmove (pstr->mbs, pstr->mbs + offset,
686 pstr->valid_len - offset);
687 pstr->valid_len -= offset;
688 pstr->valid_raw_len -= offset;
2a0356e1 689 DEBUG_ASSERT (pstr->valid_len > 0);
ba40cc15 690 }
1b2c2628 691 }
612546c6 692 else
1b2c2628 693 {
ed8ae46b 694#ifdef RE_ENABLE_I18N
1b2c2628 695 /* No, skip all characters until IDX. */
eb04c213 696 Idx prev_valid_len = pstr->valid_len;
ba40cc15 697
f4efbdfb 698 if (__glibc_unlikely (pstr->offsets_needed))
37369d1c
UD
699 {
700 pstr->len = pstr->raw_len - idx + offset;
701 pstr->stop = pstr->raw_stop - idx + offset;
702 pstr->offsets_needed = 0;
703 }
704#endif
1b2c2628 705 pstr->valid_len = 0;
3b0bdc72 706#ifdef RE_ENABLE_I18N
3c0fb574 707 if (pstr->mb_cur_max > 1)
1b2c2628 708 {
eb04c213 709 Idx wcs_idx;
3c0fb574
UD
710 wint_t wc = WEOF;
711
712 if (pstr->is_utf8)
713 {
0dae5d4e 714 const unsigned char *raw, *p, *end;
3c0fb574
UD
715
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
97fd3a30 719 end = raw + (offset - pstr->mb_cur_max);
ba40cc15
UD
720 if (end < pstr->raw_mbs)
721 end = pstr->raw_mbs;
01ed6ceb
UD
722 p = raw + offset - 1;
723#ifdef _LIBC
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
f4efbdfb 726 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
01ed6ceb
UD
727 {
728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
ba40cc15 729 /* pstr->valid_len = 0; */
01ed6ceb
UD
730 wc = (wchar_t) *p;
731 }
732 else
733#endif
734 for (; p >= end; --p)
735 if ((*p & 0xc0) != 0x80)
736 {
737 mbstate_t cur_state;
738 wchar_t wc2;
eb04c213 739 Idx mlen = raw + pstr->len - p;
01ed6ceb
UD
740 unsigned char buf[6];
741 size_t mbclen;
742
5e2b63c6 743 const unsigned char *pp = p;
f4efbdfb 744 if (__glibc_unlikely (pstr->trans != NULL))
01ed6ceb
UD
745 {
746 int i = mlen < 6 ? mlen : 6;
747 while (--i >= 0)
748 buf[i] = pstr->trans[p[i]];
5e2b63c6 749 pp = buf;
01ed6ceb
UD
750 }
751 /* XXX Don't use mbrtowc, we know which conversion
752 to use (UTF-8 -> UCS4). */
753 memset (&cur_state, 0, sizeof (cur_state));
5e2b63c6 754 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
b3918c7d 755 &cur_state);
01ed6ceb
UD
756 if (raw + offset - p <= mbclen
757 && mbclen < (size_t) -2)
758 {
759 memset (&pstr->cur_state, '\0',
760 sizeof (mbstate_t));
761 pstr->valid_len = mbclen - (raw + offset - p);
762 wc = wc2;
763 }
764 break;
765 }
3c0fb574 766 }
e40a38b3 767
3c0fb574 768 if (wc == WEOF)
97fd3a30 769 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
33e63e79
UD
770 if (wc == WEOF)
771 pstr->tip_context
ba40cc15 772 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
33e63e79 773 else
f4efbdfb 774 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
33e63e79
UD
775 && IS_WIDE_WORD_CHAR (wc))
776 ? CONTEXT_WORD
777 : ((IS_WIDE_NEWLINE (wc)
778 && pstr->newline_anchor)
779 ? CONTEXT_NEWLINE : 0));
f4efbdfb 780 if (__glibc_unlikely (pstr->valid_len))
37369d1c
UD
781 {
782 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783 pstr->wcs[wcs_idx] = WEOF;
784 if (pstr->mbs_allocated)
785 memset (pstr->mbs, 255, pstr->valid_len);
786 }
787 pstr->valid_raw_len = pstr->valid_len;
1b2c2628 788 }
1843975c 789 else
612546c6 790#endif /* RE_ENABLE_I18N */
1843975c
RM
791 {
792 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
33e63e79 793 pstr->valid_raw_len = 0;
1843975c
RM
794 if (pstr->trans)
795 c = pstr->trans[c];
56b168be
UD
796 pstr->tip_context = (bitset_contain (pstr->word_char, c)
797 ? CONTEXT_WORD
798 : ((IS_NEWLINE (c) && pstr->newline_anchor)
1843975c
RM
799 ? CONTEXT_NEWLINE : 0));
800 }
1b2c2628 801 }
f4efbdfb 802 if (!__glibc_unlikely (pstr->mbs_allocated))
37369d1c 803 pstr->mbs += offset;
3b0bdc72 804 }
612546c6
UD
805 pstr->raw_mbs_idx = idx;
806 pstr->len -= offset;
ac3d553b 807 pstr->stop -= offset;
612546c6
UD
808
809 /* Then build the buffers. */
810#ifdef RE_ENABLE_I18N
3c0fb574 811 if (pstr->mb_cur_max > 1)
3b0bdc72 812 {
612546c6 813 if (pstr->icase)
37369d1c 814 {
4f7e7f8e 815 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
f4efbdfb 816 if (__glibc_unlikely (ret != REG_NOERROR))
37369d1c
UD
817 return ret;
818 }
612546c6 819 else
1b2c2628 820 build_wcs_buffer (pstr);
3b0bdc72
UD
821 }
822 else
612546c6 823#endif /* RE_ENABLE_I18N */
f4efbdfb 824 if (__glibc_unlikely (pstr->mbs_allocated))
01ed6ceb
UD
825 {
826 if (pstr->icase)
827 build_upper_buffer (pstr);
828 else if (pstr->trans != NULL)
829 re_string_translate_buffer (pstr);
830 }
831 else
832 pstr->valid_len = pstr->len;
612546c6 833
bb677c95 834 pstr->cur_idx = 0;
3b0bdc72
UD
835 return REG_NOERROR;
836}
837
37369d1c 838static unsigned char
eb04c213
AZ
839__attribute__ ((pure))
840re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
37369d1c 841{
eb04c213
AZ
842 int ch;
843 Idx off;
37369d1c
UD
844
845 /* Handle the common (easiest) cases first. */
f4efbdfb 846 if (__glibc_likely (!pstr->mbs_allocated))
37369d1c
UD
847 return re_string_peek_byte (pstr, idx);
848
849#ifdef RE_ENABLE_I18N
850 if (pstr->mb_cur_max > 1
851 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852 return re_string_peek_byte (pstr, idx);
853#endif
854
855 off = pstr->cur_idx + idx;
856#ifdef RE_ENABLE_I18N
857 if (pstr->offsets_needed)
858 off = pstr->offsets[off];
859#endif
860
861 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
37369d1c
UD
862
863#ifdef RE_ENABLE_I18N
864 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865 this function returns CAPITAL LETTER I instead of first byte of
866 DOTLESS SMALL LETTER I. The latter would confuse the parser,
867 since peek_byte_case doesn't advance cur_idx in any way. */
868 if (pstr->offsets_needed && !isascii (ch))
869 return re_string_peek_byte (pstr, idx);
870#endif
871
872 return ch;
873}
874
875static unsigned char
e2f55264 876re_string_fetch_byte_case (re_string_t *pstr)
37369d1c 877{
f4efbdfb 878 if (__glibc_likely (!pstr->mbs_allocated))
37369d1c
UD
879 return re_string_fetch_byte (pstr);
880
881#ifdef RE_ENABLE_I18N
882 if (pstr->offsets_needed)
883 {
eb04c213
AZ
884 Idx off;
885 int ch;
c0d5034e 886
37369d1c
UD
887 /* For tr_TR.UTF-8 [[:islower:]] there is
888 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
889 in that case the whole multi-byte character and return
890 the original letter. On the other side, with
891 [[: DOTLESS SMALL LETTER I return [[:I, as doing
892 anything else would complicate things too much. */
893
894 if (!re_string_first_byte (pstr, pstr->cur_idx))
895 return re_string_fetch_byte (pstr);
896
897 off = pstr->offsets[pstr->cur_idx];
898 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
37369d1c
UD
899
900 if (! isascii (ch))
901 return re_string_fetch_byte (pstr);
902
903 re_string_skip_bytes (pstr,
904 re_string_char_size_at (pstr, pstr->cur_idx));
905 return ch;
906 }
907#endif
908
f39eef7b 909 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
37369d1c
UD
910}
911
3b0bdc72 912static void
e2f55264 913re_string_destruct (re_string_t *pstr)
3b0bdc72
UD
914{
915#ifdef RE_ENABLE_I18N
916 re_free (pstr->wcs);
37369d1c 917 re_free (pstr->offsets);
3b0bdc72 918#endif /* RE_ENABLE_I18N */
37369d1c 919 if (pstr->mbs_allocated)
612546c6 920 re_free (pstr->mbs);
3b0bdc72
UD
921}
922
923/* Return the context at IDX in INPUT. */
612546c6 924
3b0bdc72 925static unsigned int
eb04c213 926re_string_context_at (const re_string_t *input, Idx idx, int eflags)
3b0bdc72
UD
927{
928 int c;
f4efbdfb 929 if (__glibc_unlikely (idx < 0))
bb677c95
UD
930 /* In this case, we use the value stored in input->tip_context,
931 since we can't know the character in input->mbs[-1] here. */
932 return input->tip_context;
f4efbdfb 933 if (__glibc_unlikely (idx == input->len))
bb677c95
UD
934 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
a0756231 936#ifdef RE_ENABLE_I18N
3c0fb574 937 if (input->mb_cur_max > 1)
1843975c
RM
938 {
939 wint_t wc;
eb04c213 940 Idx wc_idx = idx;
1843975c
RM
941 while(input->wcs[wc_idx] == WEOF)
942 {
2a0356e1 943 DEBUG_ASSERT (wc_idx >= 0);
1843975c
RM
944 --wc_idx;
945 if (wc_idx < 0)
946 return input->tip_context;
947 }
948 wc = input->wcs[wc_idx];
f4efbdfb
PE
949 if (__glibc_unlikely (input->word_ops_used != 0)
950 && IS_WIDE_WORD_CHAR (wc))
1843975c 951 return CONTEXT_WORD;
56b168be
UD
952 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 ? CONTEXT_NEWLINE : 0);
1843975c 954 }
a0756231
RM
955 else
956#endif
957 {
958 c = re_string_byte_at (input, idx);
56b168be 959 if (bitset_contain (input->word_char, c))
a0756231 960 return CONTEXT_WORD;
56b168be 961 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
a0756231 962 }
3b0bdc72
UD
963}
964\f
965/* Functions for set operation. */
966
967static reg_errcode_t
b41bd5bc 968__attribute_warn_unused_result__
eb04c213 969re_node_set_alloc (re_node_set *set, Idx size)
3b0bdc72
UD
970{
971 set->alloc = size;
972 set->nelem = 0;
eb04c213 973 set->elems = re_malloc (Idx, size);
f4efbdfb
PE
974 if (__glibc_unlikely (set->elems == NULL)
975 && (MALLOC_0_IS_NONNULL || size != 0))
3b0bdc72
UD
976 return REG_ESPACE;
977 return REG_NOERROR;
978}
979
980static reg_errcode_t
b41bd5bc 981__attribute_warn_unused_result__
eb04c213 982re_node_set_init_1 (re_node_set *set, Idx elem)
3b0bdc72
UD
983{
984 set->alloc = 1;
985 set->nelem = 1;
eb04c213 986 set->elems = re_malloc (Idx, 1);
f4efbdfb 987 if (__glibc_unlikely (set->elems == NULL))
6291ee3c
UD
988 {
989 set->alloc = set->nelem = 0;
990 return REG_ESPACE;
991 }
3b0bdc72
UD
992 set->elems[0] = elem;
993 return REG_NOERROR;
994}
995
996static reg_errcode_t
b41bd5bc 997__attribute_warn_unused_result__
eb04c213 998re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
3b0bdc72
UD
999{
1000 set->alloc = 2;
eb04c213 1001 set->elems = re_malloc (Idx, 2);
f4efbdfb 1002 if (__glibc_unlikely (set->elems == NULL))
3b0bdc72
UD
1003 return REG_ESPACE;
1004 if (elem1 == elem2)
1005 {
1006 set->nelem = 1;
1007 set->elems[0] = elem1;
1008 }
1009 else
1010 {
1011 set->nelem = 2;
1012 if (elem1 < elem2)
1b2c2628
UD
1013 {
1014 set->elems[0] = elem1;
1015 set->elems[1] = elem2;
1016 }
3b0bdc72 1017 else
1b2c2628
UD
1018 {
1019 set->elems[0] = elem2;
1020 set->elems[1] = elem1;
1021 }
3b0bdc72
UD
1022 }
1023 return REG_NOERROR;
1024}
1025
1026static reg_errcode_t
b41bd5bc 1027__attribute_warn_unused_result__
e2f55264 1028re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
3b0bdc72
UD
1029{
1030 dest->nelem = src->nelem;
1031 if (src->nelem > 0)
1032 {
1033 dest->alloc = dest->nelem;
eb04c213 1034 dest->elems = re_malloc (Idx, dest->alloc);
f4efbdfb 1035 if (__glibc_unlikely (dest->elems == NULL))
6291ee3c
UD
1036 {
1037 dest->alloc = dest->nelem = 0;
1038 return REG_ESPACE;
1039 }
eb04c213 1040 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
3b0bdc72
UD
1041 }
1042 else
1043 re_node_set_init_empty (dest);
1044 return REG_NOERROR;
1045}
1046
a9388965
UD
1047/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1050
3b0bdc72 1051static reg_errcode_t
b41bd5bc 1052__attribute_warn_unused_result__
e2f55264
UD
1053re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054 const re_node_set *src2)
3b0bdc72 1055{
eb04c213 1056 Idx i1, i2, is, id, delta, sbase;
59e7ebcc
UD
1057 if (src1->nelem == 0 || src2->nelem == 0)
1058 return REG_NOERROR;
1059
1060 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061 conservative estimate. */
1062 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
3b0bdc72 1063 {
eb04c213
AZ
1064 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
f4efbdfb 1066 if (__glibc_unlikely (new_elems == NULL))
2da42bc0 1067 return REG_ESPACE;
59e7ebcc
UD
1068 dest->elems = new_elems;
1069 dest->alloc = new_alloc;
3b0bdc72 1070 }
3b0bdc72 1071
59e7ebcc
UD
1072 /* Find the items in the intersection of SRC1 and SRC2, and copy
1073 into the top of DEST those that are not already in DEST itself. */
1074 sbase = dest->nelem + src1->nelem + src2->nelem;
1075 i1 = src1->nelem - 1;
1076 i2 = src2->nelem - 1;
1077 id = dest->nelem - 1;
1078 for (;;)
3b0bdc72 1079 {
59e7ebcc 1080 if (src1->elems[i1] == src2->elems[i2])
1b2c2628 1081 {
59e7ebcc
UD
1082 /* Try to find the item in DEST. Maybe we could binary search? */
1083 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1084 --id;
1085
2da42bc0 1086 if (id < 0 || dest->elems[id] != src1->elems[i1])
eb04c213 1087 dest->elems[--sbase] = src1->elems[i1];
59e7ebcc
UD
1088
1089 if (--i1 < 0 || --i2 < 0)
1090 break;
1b2c2628 1091 }
59e7ebcc
UD
1092
1093 /* Lower the highest of the two items. */
1094 else if (src1->elems[i1] < src2->elems[i2])
1b2c2628 1095 {
59e7ebcc
UD
1096 if (--i2 < 0)
1097 break;
1098 }
1099 else
1100 {
1101 if (--i1 < 0)
1102 break;
1b2c2628 1103 }
3b0bdc72 1104 }
59e7ebcc
UD
1105
1106 id = dest->nelem - 1;
1107 is = dest->nelem + src1->nelem + src2->nelem - 1;
1108 delta = is - sbase + 1;
1109
1110 /* Now copy. When DELTA becomes zero, the remaining
1111 DEST elements are already in place; this is more or
1112 less the same loop that is in re_node_set_merge. */
1113 dest->nelem += delta;
1114 if (delta > 0 && id >= 0)
1115 for (;;)
1116 {
2da42bc0
UD
1117 if (dest->elems[is] > dest->elems[id])
1118 {
1119 /* Copy from the top. */
1120 dest->elems[id + delta--] = dest->elems[is--];
1121 if (delta == 0)
1122 break;
1123 }
1124 else
1125 {
1126 /* Slide from the bottom. */
1127 dest->elems[id + delta] = dest->elems[id];
1128 if (--id < 0)
1129 break;
1130 }
59e7ebcc
UD
1131 }
1132
1133 /* Copy remaining SRC elements. */
eb04c213 1134 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
59e7ebcc 1135
3b0bdc72
UD
1136 return REG_NOERROR;
1137}
1138
a9388965
UD
1139/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1141
3b0bdc72 1142static reg_errcode_t
b41bd5bc 1143__attribute_warn_unused_result__
e2f55264
UD
1144re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145 const re_node_set *src2)
3b0bdc72 1146{
eb04c213 1147 Idx i1, i2, id;
3b0bdc72
UD
1148 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149 {
1150 dest->alloc = src1->nelem + src2->nelem;
eb04c213 1151 dest->elems = re_malloc (Idx, dest->alloc);
f4efbdfb 1152 if (__glibc_unlikely (dest->elems == NULL))
1b2c2628 1153 return REG_ESPACE;
3b0bdc72
UD
1154 }
1155 else
1156 {
1157 if (src1 != NULL && src1->nelem > 0)
1b2c2628 1158 return re_node_set_init_copy (dest, src1);
3b0bdc72 1159 else if (src2 != NULL && src2->nelem > 0)
1b2c2628 1160 return re_node_set_init_copy (dest, src2);
3b0bdc72 1161 else
1b2c2628 1162 re_node_set_init_empty (dest);
3b0bdc72
UD
1163 return REG_NOERROR;
1164 }
1165 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166 {
1167 if (src1->elems[i1] > src2->elems[i2])
1b2c2628
UD
1168 {
1169 dest->elems[id++] = src2->elems[i2++];
1170 continue;
1171 }
3b0bdc72 1172 if (src1->elems[i1] == src2->elems[i2])
1b2c2628 1173 ++i2;
3b0bdc72
UD
1174 dest->elems[id++] = src1->elems[i1++];
1175 }
1176 if (i1 < src1->nelem)
1177 {
1178 memcpy (dest->elems + id, src1->elems + i1,
eb04c213 1179 (src1->nelem - i1) * sizeof (Idx));
3b0bdc72
UD
1180 id += src1->nelem - i1;
1181 }
1182 else if (i2 < src2->nelem)
1183 {
1184 memcpy (dest->elems + id, src2->elems + i2,
eb04c213 1185 (src2->nelem - i2) * sizeof (Idx));
3b0bdc72
UD
1186 id += src2->nelem - i2;
1187 }
1188 dest->nelem = id;
1189 return REG_NOERROR;
1190}
1191
a9388965
UD
1192/* Calculate the union set of the sets DEST and SRC. And store it to
1193 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1194
3b0bdc72 1195static reg_errcode_t
b41bd5bc 1196__attribute_warn_unused_result__
e2f55264 1197re_node_set_merge (re_node_set *dest, const re_node_set *src)
3b0bdc72 1198{
eb04c213 1199 Idx is, id, sbase, delta;
3b0bdc72
UD
1200 if (src == NULL || src->nelem == 0)
1201 return REG_NOERROR;
59e7ebcc 1202 if (dest->alloc < 2 * src->nelem + dest->nelem)
3b0bdc72 1203 {
eb04c213
AZ
1204 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
f4efbdfb 1206 if (__glibc_unlikely (new_buffer == NULL))
1b2c2628
UD
1207 return REG_ESPACE;
1208 dest->elems = new_buffer;
951d6408 1209 dest->alloc = new_alloc;
3b0bdc72
UD
1210 }
1211
f4efbdfb 1212 if (__glibc_unlikely (dest->nelem == 0))
3b0bdc72 1213 {
0b5ca7c3
PE
1214 /* Although we already guaranteed above that dest->alloc != 0 and
1215 therefore dest->elems != NULL, add a debug assertion to pacify
1216 GCC 11.2.1's -fanalyzer. */
1217 DEBUG_ASSERT (dest->elems);
59e7ebcc 1218 dest->nelem = src->nelem;
eb04c213 1219 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
59e7ebcc
UD
1220 return REG_NOERROR;
1221 }
3b0bdc72 1222
59e7ebcc
UD
1223 /* Copy into the top of DEST the items of SRC that are not
1224 found in DEST. Maybe we could binary search in DEST? */
1225 for (sbase = dest->nelem + 2 * src->nelem,
1226 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1227 {
1228 if (dest->elems[id] == src->elems[is])
2da42bc0 1229 is--, id--;
59e7ebcc 1230 else if (dest->elems[id] < src->elems[is])
2da42bc0 1231 dest->elems[--sbase] = src->elems[is--];
59e7ebcc 1232 else /* if (dest->elems[id] > src->elems[is]) */
2da42bc0 1233 --id;
59e7ebcc 1234 }
3b0bdc72 1235
59e7ebcc
UD
1236 if (is >= 0)
1237 {
1238 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1239 sbase -= is + 1;
eb04c213 1240 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
3b0bdc72
UD
1241 }
1242
59e7ebcc
UD
1243 id = dest->nelem - 1;
1244 is = dest->nelem + 2 * src->nelem - 1;
1245 delta = is - sbase + 1;
1246 if (delta == 0)
1247 return REG_NOERROR;
1248
1249 /* Now copy. When DELTA becomes zero, the remaining
1250 DEST elements are already in place. */
1251 dest->nelem += delta;
1252 for (;;)
3b0bdc72 1253 {
59e7ebcc 1254 if (dest->elems[is] > dest->elems[id])
2da42bc0 1255 {
59e7ebcc 1256 /* Copy from the top. */
2da42bc0 1257 dest->elems[id + delta--] = dest->elems[is--];
59e7ebcc
UD
1258 if (delta == 0)
1259 break;
1260 }
1261 else
2da42bc0
UD
1262 {
1263 /* Slide from the bottom. */
1264 dest->elems[id + delta] = dest->elems[id];
59e7ebcc
UD
1265 if (--id < 0)
1266 {
1267 /* Copy remaining SRC elements. */
1268 memcpy (dest->elems, dest->elems + sbase,
eb04c213 1269 delta * sizeof (Idx));
59e7ebcc
UD
1270 break;
1271 }
1272 }
3b0bdc72 1273 }
59e7ebcc 1274
3b0bdc72
UD
1275 return REG_NOERROR;
1276}
1277
1278/* Insert the new element ELEM to the re_node_set* SET.
8503c987 1279 SET should not already have ELEM.
eb04c213 1280 Return true if successful. */
3b0bdc72 1281
eb04c213 1282static bool
b41bd5bc 1283__attribute_warn_unused_result__
eb04c213 1284re_node_set_insert (re_node_set *set, Idx elem)
3b0bdc72 1285{
eb04c213 1286 Idx idx;
8503c987
UD
1287 /* In case the set is empty. */
1288 if (set->alloc == 0)
f4efbdfb 1289 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
3b0bdc72 1290
f4efbdfb 1291 if (__glibc_unlikely (set->nelem) == 0)
3b0bdc72 1292 {
0b5ca7c3
PE
1293 /* Although we already guaranteed above that set->alloc != 0 and
1294 therefore set->elems != NULL, add a debug assertion to pacify
1295 GCC 11.2 -fanalyzer. */
1296 DEBUG_ASSERT (set->elems);
8503c987
UD
1297 set->elems[0] = elem;
1298 ++set->nelem;
eb04c213 1299 return true;
3b0bdc72
UD
1300 }
1301
1302 /* Realloc if we need. */
8503c987 1303 if (set->alloc == set->nelem)
3b0bdc72 1304 {
eb04c213 1305 Idx *new_elems;
3b0bdc72 1306 set->alloc = set->alloc * 2;
eb04c213 1307 new_elems = re_realloc (set->elems, Idx, set->alloc);
f4efbdfb 1308 if (__glibc_unlikely (new_elems == NULL))
eb04c213 1309 return false;
2d87db5b 1310 set->elems = new_elems;
3b0bdc72 1311 }
8503c987
UD
1312
1313 /* Move the elements which follows the new element. Test the
1314 first element separately to skip a check in the inner loop. */
1315 if (elem < set->elems[0])
1316 {
59e7ebcc 1317 for (idx = set->nelem; idx > 0; idx--)
2da42bc0 1318 set->elems[idx] = set->elems[idx - 1];
8503c987 1319 }
3b0bdc72
UD
1320 else
1321 {
8503c987 1322 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
2da42bc0 1323 set->elems[idx] = set->elems[idx - 1];
0b5ca7c3 1324 DEBUG_ASSERT (set->elems[idx - 1] < elem);
3b0bdc72 1325 }
8503c987 1326
3b0bdc72
UD
1327 /* Insert the new element. */
1328 set->elems[idx] = elem;
1329 ++set->nelem;
eb04c213 1330 return true;
3b0bdc72
UD
1331}
1332
20dc2f79
UD
1333/* Insert the new element ELEM to the re_node_set* SET.
1334 SET should not already have any element greater than or equal to ELEM.
eb04c213 1335 Return true if successful. */
20dc2f79 1336
eb04c213 1337static bool
b41bd5bc 1338__attribute_warn_unused_result__
eb04c213 1339re_node_set_insert_last (re_node_set *set, Idx elem)
20dc2f79
UD
1340{
1341 /* Realloc if we need. */
1342 if (set->alloc == set->nelem)
1343 {
eb04c213 1344 Idx *new_elems;
20dc2f79 1345 set->alloc = (set->alloc + 1) * 2;
eb04c213 1346 new_elems = re_realloc (set->elems, Idx, set->alloc);
f4efbdfb 1347 if (__glibc_unlikely (new_elems == NULL))
eb04c213 1348 return false;
2d87db5b 1349 set->elems = new_elems;
20dc2f79
UD
1350 }
1351
1352 /* Insert the new element. */
1353 set->elems[set->nelem++] = elem;
eb04c213 1354 return true;
20dc2f79
UD
1355}
1356
3b0bdc72 1357/* Compare two node sets SET1 and SET2.
eb04c213 1358 Return true if SET1 and SET2 are equivalent. */
3b0bdc72 1359
eb04c213
AZ
1360static bool
1361__attribute__ ((pure))
e2f55264 1362re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
3b0bdc72 1363{
eb04c213 1364 Idx i;
3b0bdc72 1365 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
eb04c213 1366 return false;
59e7ebcc 1367 for (i = set1->nelem ; --i >= 0 ; )
3b0bdc72 1368 if (set1->elems[i] != set2->elems[i])
eb04c213
AZ
1369 return false;
1370 return true;
3b0bdc72
UD
1371}
1372
0742e48e 1373/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
3b0bdc72 1374
eb04c213
AZ
1375static Idx
1376__attribute__ ((pure))
1377re_node_set_contains (const re_node_set *set, Idx elem)
3b0bdc72 1378{
eb04c213 1379 __re_size_t idx, right, mid;
3b0bdc72
UD
1380 if (set->nelem <= 0)
1381 return 0;
1382
1383 /* Binary search the element. */
1384 idx = 0;
1385 right = set->nelem - 1;
1386 while (idx < right)
1387 {
1388 mid = (idx + right) / 2;
1389 if (set->elems[mid] < elem)
1b2c2628 1390 idx = mid + 1;
3b0bdc72 1391 else
1b2c2628 1392 right = mid;
3b0bdc72 1393 }
0742e48e 1394 return set->elems[idx] == elem ? idx + 1 : 0;
3b0bdc72
UD
1395}
1396
1397static void
eb04c213 1398re_node_set_remove_at (re_node_set *set, Idx idx)
3b0bdc72
UD
1399{
1400 if (idx < 0 || idx >= set->nelem)
1401 return;
3b0bdc72 1402 --set->nelem;
59e7ebcc
UD
1403 for (; idx < set->nelem; idx++)
1404 set->elems[idx] = set->elems[idx + 1];
3b0bdc72
UD
1405}
1406\f
1407
1408/* Add the token TOKEN to dfa->nodes, and return the index of the token.
eb04c213 1409 Or return -1 if an error occurred. */
3b0bdc72 1410
eb04c213 1411static Idx
e2f55264 1412re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
3b0bdc72 1413{
f4efbdfb 1414 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
3b0bdc72 1415 {
01ed6ceb 1416 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
eb04c213 1417 Idx *new_nexts, *new_indices;
963d8d78 1418 re_node_set *new_edests, *new_eclosures;
01ed6ceb
UD
1419 re_token_t *new_nodes;
1420
22364644
UD
1421 /* Avoid overflows in realloc. */
1422 const size_t max_object_size = MAX (sizeof (re_token_t),
1423 MAX (sizeof (re_node_set),
eb04c213 1424 sizeof (Idx)));
f4efbdfb
PE
1425 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1426 < new_nodes_alloc))
01ed6ceb 1427 return -1;
02f3550c 1428
01ed6ceb 1429 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
f4efbdfb 1430 if (__glibc_unlikely (new_nodes == NULL))
0ecb606c 1431 return -1;
2d87db5b 1432 dfa->nodes = new_nodes;
eb04c213
AZ
1433 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1434 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
02f3550c
UD
1435 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1436 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
f4efbdfb
PE
1437 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1438 || new_edests == NULL || new_eclosures == NULL))
eb04c213
AZ
1439 {
1440 re_free (new_nexts);
1441 re_free (new_indices);
1442 re_free (new_edests);
1443 re_free (new_eclosures);
1444 return -1;
1445 }
02f3550c
UD
1446 dfa->nexts = new_nexts;
1447 dfa->org_indices = new_indices;
1448 dfa->edests = new_edests;
1449 dfa->eclosures = new_eclosures;
951d6408 1450 dfa->nodes_alloc = new_nodes_alloc;
3b0bdc72
UD
1451 }
1452 dfa->nodes[dfa->nodes_len] = token;
485d775d 1453 dfa->nodes[dfa->nodes_len].constraint = 0;
02f3550c
UD
1454#ifdef RE_ENABLE_I18N
1455 dfa->nodes[dfa->nodes_len].accept_mb =
eb04c213
AZ
1456 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1457 || token.type == COMPLEX_BRACKET);
02f3550c
UD
1458#endif
1459 dfa->nexts[dfa->nodes_len] = -1;
1460 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1461 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
3b0bdc72
UD
1462 return dfa->nodes_len++;
1463}
1464
eb04c213 1465static re_hashval_t
e2f55264 1466calc_state_hash (const re_node_set *nodes, unsigned int context)
3b0bdc72 1467{
eb04c213
AZ
1468 re_hashval_t hash = nodes->nelem + context;
1469 Idx i;
3b0bdc72
UD
1470 for (i = 0 ; i < nodes->nelem ; i++)
1471 hash += nodes->elems[i];
1472 return hash;
1473}
1474
1475/* Search for the state whose node_set is equivalent to NODES.
1476 Return the pointer to the state, if we found it in the DFA.
a9388965
UD
1477 Otherwise create the new one and return it. In case of an error
1478 return NULL and set the error code in ERR.
1479 Note: - We assume NULL as the invalid state, then it is possible that
1b2c2628
UD
1480 return value is NULL and ERR is REG_NOERROR.
1481 - We never return non-NULL value in case of any errors, it is for
1482 optimization. */
a9388965 1483
e2f55264 1484static re_dfastate_t *
b41bd5bc 1485__attribute_warn_unused_result__
e2f55264
UD
1486re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1487 const re_node_set *nodes)
3b0bdc72 1488{
eb04c213 1489 re_hashval_t hash;
a9388965 1490 re_dfastate_t *new_state;
3b0bdc72 1491 struct re_state_table_entry *spot;
eb04c213
AZ
1492 Idx i;
1493#if defined GCC_LINT || defined lint
1494 /* Suppress bogus uninitialized-variable warnings. */
1495 *err = REG_NOERROR;
1496#endif
f4efbdfb 1497 if (__glibc_unlikely (nodes->nelem == 0))
a9388965
UD
1498 {
1499 *err = REG_NOERROR;
1500 return NULL;
1501 }
3b0bdc72
UD
1502 hash = calc_state_hash (nodes, 0);
1503 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1504
bc15410e 1505 for (i = 0 ; i < spot->num ; i++)
3b0bdc72 1506 {
bc15410e
UD
1507 re_dfastate_t *state = spot->array[i];
1508 if (hash != state->hash)
1b2c2628 1509 continue;
bc15410e 1510 if (re_node_set_compare (&state->nodes, nodes))
1b2c2628 1511 return state;
3b0bdc72 1512 }
3b0bdc72
UD
1513
1514 /* There are no appropriate state in the dfa, create the new one. */
a9388965 1515 new_state = create_ci_newstate (dfa, nodes, hash);
f4efbdfb 1516 if (__glibc_unlikely (new_state == NULL))
2d87db5b
UD
1517 *err = REG_ESPACE;
1518
1519 return new_state;
3b0bdc72
UD
1520}
1521
1522/* Search for the state whose node_set is equivalent to NODES and
1523 whose context is equivalent to CONTEXT.
1524 Return the pointer to the state, if we found it in the DFA.
a9388965
UD
1525 Otherwise create the new one and return it. In case of an error
1526 return NULL and set the error code in ERR.
1527 Note: - We assume NULL as the invalid state, then it is possible that
1b2c2628
UD
1528 return value is NULL and ERR is REG_NOERROR.
1529 - We never return non-NULL value in case of any errors, it is for
1530 optimization. */
a9388965 1531
db26cb75 1532static re_dfastate_t *
b41bd5bc 1533__attribute_warn_unused_result__
e2f55264
UD
1534re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1535 const re_node_set *nodes, unsigned int context)
3b0bdc72 1536{
eb04c213 1537 re_hashval_t hash;
a9388965 1538 re_dfastate_t *new_state;
3b0bdc72 1539 struct re_state_table_entry *spot;
eb04c213
AZ
1540 Idx i;
1541#if defined GCC_LINT || defined lint
1542 /* Suppress bogus uninitialized-variable warnings. */
1543 *err = REG_NOERROR;
1544#endif
3b0bdc72 1545 if (nodes->nelem == 0)
a9388965
UD
1546 {
1547 *err = REG_NOERROR;
1548 return NULL;
1549 }
3b0bdc72
UD
1550 hash = calc_state_hash (nodes, context);
1551 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1552
bc15410e 1553 for (i = 0 ; i < spot->num ; i++)
3b0bdc72 1554 {
bc15410e 1555 re_dfastate_t *state = spot->array[i];
a0a8461c
UD
1556 if (state->hash == hash
1557 && state->context == context
1558 && re_node_set_compare (state->entrance_nodes, nodes))
1b2c2628 1559 return state;
3b0bdc72 1560 }
eb04c213 1561 /* There are no appropriate state in 'dfa', create the new one. */
a9388965 1562 new_state = create_cd_newstate (dfa, nodes, context, hash);
f4efbdfb 1563 if (__glibc_unlikely (new_state == NULL))
2d87db5b
UD
1564 *err = REG_ESPACE;
1565
1566 return new_state;
3b0bdc72
UD
1567}
1568
5cf1ec52
UD
1569/* Finish initialization of the new state NEWSTATE, and using its hash value
1570 HASH put in the appropriate bucket of DFA's state table. Return value
1571 indicates the error code if failed. */
a9388965 1572
5cf1ec52 1573static reg_errcode_t
2da42bc0 1574__attribute_warn_unused_result__
e2f55264 1575register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
eb04c213 1576 re_hashval_t hash)
3b0bdc72 1577{
5cf1ec52 1578 struct re_state_table_entry *spot;
1b2c2628 1579 reg_errcode_t err;
eb04c213 1580 Idx i;
5cf1ec52
UD
1581
1582 newstate->hash = hash;
1583 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
f4efbdfb 1584 if (__glibc_unlikely (err != REG_NOERROR))
5cf1ec52
UD
1585 return REG_ESPACE;
1586 for (i = 0; i < newstate->nodes.nelem; i++)
1b2c2628 1587 {
eb04c213 1588 Idx elem = newstate->nodes.elems[i];
5cf1ec52 1589 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
eb04c213 1590 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
2da42bc0 1591 return REG_ESPACE;
1b2c2628 1592 }
3b0bdc72 1593
3b0bdc72 1594 spot = dfa->state_table + (hash & dfa->state_hash_mask);
f4efbdfb 1595 if (__glibc_unlikely (spot->alloc <= spot->num))
3b0bdc72 1596 {
eb04c213 1597 Idx new_alloc = 2 * spot->num + 2;
951d6408
UD
1598 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1599 new_alloc);
f4efbdfb 1600 if (__glibc_unlikely (new_array == NULL))
1b2c2628
UD
1601 return REG_ESPACE;
1602 spot->array = new_array;
951d6408 1603 spot->alloc = new_alloc;
3b0bdc72 1604 }
bc15410e 1605 spot->array[spot->num++] = newstate;
a9388965 1606 return REG_NOERROR;
3b0bdc72
UD
1607}
1608
db26cb75
UD
1609static void
1610free_state (re_dfastate_t *state)
1611{
1612 re_node_set_free (&state->non_eps_nodes);
1613 re_node_set_free (&state->inveclosure);
1614 if (state->entrance_nodes != &state->nodes)
1615 {
1616 re_node_set_free (state->entrance_nodes);
1617 re_free (state->entrance_nodes);
1618 }
1619 re_node_set_free (&state->nodes);
1620 re_free (state->word_trtable);
1621 re_free (state->trtable);
1622 re_free (state);
1623}
1624
5069ff32 1625/* Create the new state which is independent of contexts.
a9388965
UD
1626 Return the new state if succeeded, otherwise return NULL. */
1627
3b0bdc72 1628static re_dfastate_t *
b41bd5bc 1629__attribute_warn_unused_result__
e2f55264 1630create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
eb04c213 1631 re_hashval_t hash)
3b0bdc72 1632{
eb04c213 1633 Idx i;
a9388965 1634 reg_errcode_t err;
3b0bdc72 1635 re_dfastate_t *newstate;
5cf1ec52
UD
1636
1637 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
f4efbdfb 1638 if (__glibc_unlikely (newstate == NULL))
a9388965 1639 return NULL;
5cf1ec52 1640 err = re_node_set_init_copy (&newstate->nodes, nodes);
f4efbdfb 1641 if (__glibc_unlikely (err != REG_NOERROR))
5cf1ec52
UD
1642 {
1643 re_free (newstate);
1644 return NULL;
1645 }
3b0bdc72 1646
5cf1ec52 1647 newstate->entrance_nodes = &newstate->nodes;
3b0bdc72
UD
1648 for (i = 0 ; i < nodes->nelem ; i++)
1649 {
1650 re_token_t *node = dfa->nodes + nodes->elems[i];
1651 re_token_type_t type = node->type;
485d775d 1652 if (type == CHARACTER && !node->constraint)
1b2c2628 1653 continue;
02f3550c
UD
1654#ifdef RE_ENABLE_I18N
1655 newstate->accept_mb |= node->accept_mb;
1656#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1657
1658 /* If the state has the halt node, the state is a halt state. */
02f3550c 1659 if (type == END_OF_RE)
1b2c2628 1660 newstate->halt = 1;
3b0bdc72 1661 else if (type == OP_BACK_REF)
1b2c2628 1662 newstate->has_backref = 1;
485d775d 1663 else if (type == ANCHOR || node->constraint)
1b2c2628 1664 newstate->has_constraint = 1;
3b0bdc72 1665 }
a9388965 1666 err = register_state (dfa, newstate, hash);
f4efbdfb 1667 if (__glibc_unlikely (err != REG_NOERROR))
1b2c2628
UD
1668 {
1669 free_state (newstate);
1670 newstate = NULL;
1671 }
1672 return newstate;
3b0bdc72
UD
1673}
1674
a9388965
UD
1675/* Create the new state which is depend on the context CONTEXT.
1676 Return the new state if succeeded, otherwise return NULL. */
1677
3b0bdc72 1678static re_dfastate_t *
b41bd5bc 1679__attribute_warn_unused_result__
e2f55264 1680create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
eb04c213 1681 unsigned int context, re_hashval_t hash)
3b0bdc72 1682{
eb04c213 1683 Idx i, nctx_nodes = 0;
a9388965 1684 reg_errcode_t err;
3b0bdc72
UD
1685 re_dfastate_t *newstate;
1686
5cf1ec52 1687 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
f4efbdfb 1688 if (__glibc_unlikely (newstate == NULL))
a9388965 1689 return NULL;
5cf1ec52 1690 err = re_node_set_init_copy (&newstate->nodes, nodes);
f4efbdfb 1691 if (__glibc_unlikely (err != REG_NOERROR))
5cf1ec52
UD
1692 {
1693 re_free (newstate);
1694 return NULL;
1695 }
1696
3b0bdc72
UD
1697 newstate->context = context;
1698 newstate->entrance_nodes = &newstate->nodes;
1699
1700 for (i = 0 ; i < nodes->nelem ; i++)
1701 {
3b0bdc72
UD
1702 re_token_t *node = dfa->nodes + nodes->elems[i];
1703 re_token_type_t type = node->type;
0caca71a 1704 unsigned int constraint = node->constraint;
3b0bdc72 1705
485d775d 1706 if (type == CHARACTER && !constraint)
1b2c2628 1707 continue;
a334319f 1708#ifdef RE_ENABLE_I18N
02f3550c 1709 newstate->accept_mb |= node->accept_mb;
a334319f 1710#endif /* RE_ENABLE_I18N */
02f3550c
UD
1711
1712 /* If the state has the halt node, the state is a halt state. */
1713 if (type == END_OF_RE)
1714 newstate->halt = 1;
3b0bdc72 1715 else if (type == OP_BACK_REF)
1b2c2628 1716 newstate->has_backref = 1;
3b0bdc72
UD
1717
1718 if (constraint)
1b2c2628
UD
1719 {
1720 if (newstate->entrance_nodes == &newstate->nodes)
1721 {
8a80ee5e
PE
1722 re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1723 if (__glibc_unlikely (entrance_nodes == NULL))
1b2c2628
UD
1724 {
1725 free_state (newstate);
1726 return NULL;
1727 }
8a80ee5e 1728 newstate->entrance_nodes = entrance_nodes;
2da42bc0
UD
1729 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1730 != REG_NOERROR)
8a80ee5e
PE
1731 {
1732 free_state (newstate);
1733 return NULL;
1734 }
1b2c2628
UD
1735 nctx_nodes = 0;
1736 newstate->has_constraint = 1;
1737 }
1738
1739 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1740 {
1741 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1742 ++nctx_nodes;
1743 }
1744 }
3b0bdc72 1745 }
a9388965 1746 err = register_state (dfa, newstate, hash);
f4efbdfb 1747 if (__glibc_unlikely (err != REG_NOERROR))
1b2c2628
UD
1748 {
1749 free_state (newstate);
1750 newstate = NULL;
1751 }
1752 return newstate;
1753}