]> git.ipfire.org Git - thirdparty/glibc.git/blame - posix/regex_internal.c
posix: remove some iso-8859-encoded characters
[thirdparty/glibc.git] / posix / regex_internal.c
CommitLineData
3b0bdc72 1/* Extended regular expression matching and search library.
2b778ceb 2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3b0bdc72
UD
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6 17 License along with the GNU C Library; if not, see
eb04c213 18 <https://www.gnu.org/licenses/>. */
3b0bdc72 19
eb04c213 20static void re_string_construct_common (const char *str, Idx len,
1b2c2628 21 re_string_t *pstr,
eb04c213 22 RE_TRANSLATE_TYPE trans, bool icase,
b41bd5bc 23 const re_dfa_t *dfa);
01ed6ceb 24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
1b2c2628 25 const re_node_set *nodes,
eb04c213 26 re_hashval_t hash);
01ed6ceb 27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
1b2c2628
UD
28 const re_node_set *nodes,
29 unsigned int context,
eb04c213
AZ
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33#ifdef RE_ENABLE_I18N
34static void build_wcs_buffer (re_string_t *pstr);
35static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36#endif /* RE_ENABLE_I18N */
37static void build_upper_buffer (re_string_t *pstr);
38static void re_string_translate_buffer (re_string_t *pstr);
39static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
3b0bdc72
UD
41\f
42/* Functions for string operation. */
43
612546c6
UD
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
3b0bdc72 47static reg_errcode_t
b41bd5bc 48__attribute_warn_unused_result__
eb04c213
AZ
49re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
3b0bdc72
UD
51{
52 reg_errcode_t ret;
eb04c213 53 Idx init_buf_len;
97fd3a30
UD
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
f0c7c524 59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
612546c6
UD
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
f4efbdfb 62 if (__glibc_unlikely (ret != REG_NOERROR))
612546c6
UD
63 return ret;
64
56b168be
UD
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
37369d1c
UD
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
612546c6
UD
70 return REG_NOERROR;
71}
72
73/* This function allocate the buffers, and initialize them. */
74
75static reg_errcode_t
b41bd5bc 76__attribute_warn_unused_result__
eb04c213
AZ
77re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
612546c6
UD
79{
80 reg_errcode_t ret;
56b168be 81 memset (pstr, '\0', sizeof (re_string_t));
f0c7c524 82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
612546c6
UD
83
84 if (len > 0)
3b0bdc72 85 {
612546c6 86 ret = re_string_realloc_buffers (pstr, len + 1);
f4efbdfb 87 if (__glibc_unlikely (ret != REG_NOERROR))
1b2c2628 88 return ret;
3b0bdc72 89 }
37369d1c 90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
612546c6
UD
91
92 if (icase)
93 {
94#ifdef RE_ENABLE_I18N
f0c7c524 95 if (dfa->mb_cur_max > 1)
37369d1c
UD
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
f4efbdfb 100 if (__glibc_unlikely (ret != REG_NOERROR))
37369d1c
UD
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
f4efbdfb 107 if (__glibc_unlikely (ret != REG_NOERROR))
37369d1c
UD
108 return ret;
109 }
110 }
612546c6 111 else
3b0bdc72 112#endif /* RE_ENABLE_I18N */
1b2c2628 113 build_upper_buffer (pstr);
612546c6
UD
114 }
115 else
3b0bdc72 116 {
612546c6 117#ifdef RE_ENABLE_I18N
f0c7c524 118 if (dfa->mb_cur_max > 1)
1b2c2628 119 build_wcs_buffer (pstr);
612546c6
UD
120 else
121#endif /* RE_ENABLE_I18N */
1b2c2628
UD
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
37369d1c
UD
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
1b2c2628 130 }
3b0bdc72 131 }
612546c6 132
3b0bdc72
UD
133 return REG_NOERROR;
134}
135
612546c6 136/* Helper functions for re_string_allocate, and re_string_construct. */
3b0bdc72
UD
137
138static reg_errcode_t
b41bd5bc 139__attribute_warn_unused_result__
eb04c213 140re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
3b0bdc72 141{
3b0bdc72 142#ifdef RE_ENABLE_I18N
3c0fb574 143 if (pstr->mb_cur_max > 1)
3b0bdc72 144 {
54dd0ab3
UD
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
eb04c213 148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
f4efbdfb
PE
149 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150 < new_buf_len))
54dd0ab3
UD
151 return REG_ESPACE;
152
153 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
f4efbdfb 154 if (__glibc_unlikely (new_wcs == NULL))
1b2c2628 155 return REG_ESPACE;
2d87db5b 156 pstr->wcs = new_wcs;
37369d1c
UD
157 if (pstr->offsets != NULL)
158 {
eb04c213 159 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
f4efbdfb 160 if (__glibc_unlikely (new_offsets == NULL))
37369d1c 161 return REG_ESPACE;
2d87db5b 162 pstr->offsets = new_offsets;
37369d1c 163 }
3b0bdc72 164 }
3b0bdc72 165#endif /* RE_ENABLE_I18N */
37369d1c 166 if (pstr->mbs_allocated)
3b0bdc72 167 {
2d87db5b
UD
168 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
169 new_buf_len);
f4efbdfb 170 if (__glibc_unlikely (new_mbs == NULL))
1b2c2628 171 return REG_ESPACE;
2d87db5b 172 pstr->mbs = new_mbs;
3b0bdc72 173 }
612546c6 174 pstr->bufs_len = new_buf_len;
3b0bdc72
UD
175 return REG_NOERROR;
176}
177
612546c6 178
3b0bdc72 179static void
eb04c213
AZ
180re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
181 RE_TRANSLATE_TYPE trans, bool icase,
e2f55264 182 const re_dfa_t *dfa)
3b0bdc72 183{
c202c2c5 184 pstr->raw_mbs = (const unsigned char *) str;
3b0bdc72 185 pstr->len = len;
37369d1c 186 pstr->raw_len = len;
997470b3 187 pstr->trans = trans;
eb04c213 188 pstr->icase = icase;
37369d1c 189 pstr->mbs_allocated = (trans != NULL || icase);
f0c7c524
UD
190 pstr->mb_cur_max = dfa->mb_cur_max;
191 pstr->is_utf8 = dfa->is_utf8;
192 pstr->map_notascii = dfa->map_notascii;
37369d1c
UD
193 pstr->stop = pstr->len;
194 pstr->raw_stop = pstr->stop;
3b0bdc72
UD
195}
196
197#ifdef RE_ENABLE_I18N
198
612546c6 199/* Build wide character buffer PSTR->WCS.
3b0bdc72
UD
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
612546c6 205 a first byte of a multibyte character.
3b0bdc72 206
612546c6
UD
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
209
210static void
e2f55264 211build_wcs_buffer (re_string_t *pstr)
3b0bdc72 212{
37369d1c 213#ifdef _LIBC
ec73fd87 214 unsigned char buf[MB_LEN_MAX];
2a0356e1 215 DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
37369d1c
UD
216#else
217 unsigned char buf[64];
218#endif
612546c6 219 mbstate_t prev_st;
eb04c213 220 Idx byte_idx, end_idx, remain_len;
88764ae2 221 size_t mbclen;
37369d1c 222
612546c6
UD
223 /* Build the buffers from pstr->valid_len to either pstr->len or
224 pstr->bufs_len. */
37369d1c 225 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
612546c6
UD
226 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227 {
228 wchar_t wc;
37369d1c
UD
229 const char *p;
230
612546c6
UD
231 remain_len = end_idx - byte_idx;
232 prev_st = pstr->cur_state;
37369d1c 233 /* Apply the translation if we need. */
f4efbdfb 234 if (__glibc_unlikely (pstr->trans != NULL))
37369d1c
UD
235 {
236 int i, ch;
237
238 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 {
240 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
d40eb37a 241 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
37369d1c
UD
242 }
243 p = (const char *) buf;
244 }
245 else
246 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
b3918c7d 247 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
f4efbdfb
PE
248 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249 || (mbclen == (size_t) -2
250 && pstr->bufs_len >= pstr->len)))
1b2c2628
UD
251 {
252 /* We treat these cases as a singlebyte character. */
253 mbclen = 1;
254 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
f4efbdfb 255 if (__glibc_unlikely (pstr->trans != NULL))
37369d1c 256 wc = pstr->trans[wc];
1b2c2628
UD
257 pstr->cur_state = prev_st;
258 }
f4efbdfb 259 else if (__glibc_unlikely (mbclen == (size_t) -2))
8887a920
UD
260 {
261 /* The buffer doesn't have enough space, finish to build. */
262 pstr->cur_state = prev_st;
263 break;
264 }
612546c6 265
3b0bdc72 266 /* Write wide character and padding. */
612546c6
UD
267 pstr->wcs[byte_idx++] = wc;
268 /* Write paddings. */
269 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1b2c2628 270 pstr->wcs[byte_idx++] = WEOF;
3b0bdc72 271 }
612546c6 272 pstr->valid_len = byte_idx;
37369d1c 273 pstr->valid_raw_len = byte_idx;
3b0bdc72
UD
274}
275
612546c6
UD
276/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
278
4f7e7f8e 279static reg_errcode_t
b41bd5bc 280__attribute_warn_unused_result__
e2f55264 281build_wcs_upper_buffer (re_string_t *pstr)
3b0bdc72 282{
612546c6 283 mbstate_t prev_st;
eb04c213 284 Idx src_idx, byte_idx, end_idx, remain_len;
88764ae2 285 size_t mbclen;
37369d1c 286#ifdef _LIBC
ec73fd87 287 char buf[MB_LEN_MAX];
2a0356e1 288 DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
37369d1c 289#else
1c99f950 290 char buf[64];
37369d1c
UD
291#endif
292
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
f0c7c524 295
e40a38b3
UD
296 /* The following optimization assumes that ASCII characters can be
297 mapped to wide characters with a simple cast. */
37369d1c
UD
298 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299 {
300 while (byte_idx < end_idx)
1b2c2628 301 {
f0c7c524 302 wchar_t wc;
c2a150d0 303 unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
37369d1c 304
c2a150d0 305 if (isascii (ch) && mbsinit (&pstr->cur_state))
37369d1c 306 {
37369d1c 307 /* The next step uses the assumption that wchar_t is encoded
e40a38b3 308 ASCII-safe: all ASCII values can be converted like this. */
c2a150d0
AZ
309 wchar_t wcu = __towupper (ch);
310 if (isascii (wcu))
311 {
312 pstr->mbs[byte_idx] = wcu;
313 pstr->wcs[byte_idx] = wcu;
314 byte_idx++;
315 continue;
316 }
37369d1c
UD
317 }
318
f0c7c524
UD
319 remain_len = end_idx - byte_idx;
320 prev_st = pstr->cur_state;
b3918c7d
UD
321 mbclen = __mbrtowc (&wc,
322 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 + byte_idx), remain_len, &pstr->cur_state);
f4efbdfb 324 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
1b2c2628 325 {
eb04c213
AZ
326 wchar_t wcu = __towupper (wc);
327 if (wcu != wc)
37369d1c 328 {
88764ae2 329 size_t mbcdlen;
37369d1c 330
a5f0adb3 331 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
f4efbdfb 332 if (__glibc_likely (mbclen == mbcdlen))
37369d1c
UD
333 memcpy (pstr->mbs + byte_idx, buf, mbclen);
334 else
335 {
336 src_idx = byte_idx;
337 goto offsets_needed;
338 }
339 }
f0c7c524
UD
340 else
341 memcpy (pstr->mbs + byte_idx,
342 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
37369d1c 343 pstr->wcs[byte_idx++] = wcu;
f0c7c524
UD
344 /* Write paddings. */
345 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 pstr->wcs[byte_idx++] = WEOF;
347 }
8887a920
UD
348 else if (mbclen == (size_t) -1 || mbclen == 0
349 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
f0c7c524 350 {
8887a920
UD
351 /* It is an invalid character, an incomplete character
352 at the end of the string, or '\0'. Just use the byte. */
02b50340
UD
353 pstr->mbs[byte_idx] = ch;
354 /* And also cast it to wide char. */
355 pstr->wcs[byte_idx++] = (wchar_t) ch;
f4efbdfb 356 if (__glibc_unlikely (mbclen == (size_t) -1))
f0c7c524 357 pstr->cur_state = prev_st;
1b2c2628 358 }
1b2c2628 359 else
f0c7c524
UD
360 {
361 /* The buffer doesn't have enough space, finish to build. */
362 pstr->cur_state = prev_st;
363 break;
364 }
1b2c2628 365 }
37369d1c
UD
366 pstr->valid_len = byte_idx;
367 pstr->valid_raw_len = byte_idx;
368 return REG_NOERROR;
369 }
f0c7c524 370 else
37369d1c 371 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
f0c7c524
UD
372 {
373 wchar_t wc;
37369d1c 374 const char *p;
e40a38b3 375 offsets_needed:
f0c7c524
UD
376 remain_len = end_idx - byte_idx;
377 prev_st = pstr->cur_state;
f4efbdfb 378 if (__glibc_unlikely (pstr->trans != NULL))
f0c7c524 379 {
37369d1c
UD
380 int i, ch;
381
382 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
f0c7c524 383 {
37369d1c
UD
384 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
385 buf[i] = pstr->trans[ch];
f0c7c524 386 }
37369d1c 387 p = (const char *) buf;
f0c7c524 388 }
37369d1c
UD
389 else
390 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
b3918c7d 391 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
f4efbdfb 392 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
f0c7c524 393 {
eb04c213
AZ
394 wchar_t wcu = __towupper (wc);
395 if (wcu != wc)
37369d1c 396 {
88764ae2 397 size_t mbcdlen;
37369d1c 398
a5f0adb3 399 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
f4efbdfb 400 if (__glibc_likely (mbclen == mbcdlen))
37369d1c 401 memcpy (pstr->mbs + byte_idx, buf, mbclen);
88764ae2 402 else if (mbcdlen != (size_t) -1)
37369d1c 403 {
88764ae2 404 size_t i;
37369d1c
UD
405
406 if (byte_idx + mbcdlen > pstr->bufs_len)
407 {
408 pstr->cur_state = prev_st;
409 break;
410 }
411
412 if (pstr->offsets == NULL)
413 {
eb04c213 414 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
37369d1c
UD
415
416 if (pstr->offsets == NULL)
417 return REG_ESPACE;
418 }
419 if (!pstr->offsets_needed)
420 {
88764ae2 421 for (i = 0; i < (size_t) byte_idx; ++i)
37369d1c
UD
422 pstr->offsets[i] = i;
423 pstr->offsets_needed = 1;
424 }
425
426 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427 pstr->wcs[byte_idx] = wcu;
428 pstr->offsets[byte_idx] = src_idx;
429 for (i = 1; i < mbcdlen; ++i)
430 {
431 pstr->offsets[byte_idx + i]
432 = src_idx + (i < mbclen ? i : mbclen - 1);
433 pstr->wcs[byte_idx + i] = WEOF;
434 }
435 pstr->len += mbcdlen - mbclen;
436 if (pstr->raw_stop > src_idx)
437 pstr->stop += mbcdlen - mbclen;
438 end_idx = (pstr->bufs_len > pstr->len)
439 ? pstr->len : pstr->bufs_len;
440 byte_idx += mbcdlen;
441 src_idx += mbclen;
442 continue;
443 }
2da42bc0
UD
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
37369d1c 446 }
f0c7c524 447 else
37369d1c
UD
448 memcpy (pstr->mbs + byte_idx, p, mbclen);
449
f4efbdfb 450 if (__glibc_unlikely (pstr->offsets_needed != 0))
37369d1c 451 {
88764ae2 452 size_t i;
37369d1c
UD
453 for (i = 0; i < mbclen; ++i)
454 pstr->offsets[byte_idx + i] = src_idx + i;
455 }
456 src_idx += mbclen;
457
458 pstr->wcs[byte_idx++] = wcu;
f0c7c524
UD
459 /* Write paddings. */
460 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
461 pstr->wcs[byte_idx++] = WEOF;
462 }
8887a920
UD
463 else if (mbclen == (size_t) -1 || mbclen == 0
464 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
02b50340 465 {
37369d1c
UD
466 /* It is an invalid character or '\0'. Just use the byte. */
467 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
468
f4efbdfb 469 if (__glibc_unlikely (pstr->trans != NULL))
37369d1c 470 ch = pstr->trans [ch];
02b50340 471 pstr->mbs[byte_idx] = ch;
37369d1c 472
f4efbdfb 473 if (__glibc_unlikely (pstr->offsets_needed != 0))
37369d1c
UD
474 pstr->offsets[byte_idx] = src_idx;
475 ++src_idx;
476
02b50340
UD
477 /* And also cast it to wide char. */
478 pstr->wcs[byte_idx++] = (wchar_t) ch;
f4efbdfb 479 if (__glibc_unlikely (mbclen == (size_t) -1))
02b50340
UD
480 pstr->cur_state = prev_st;
481 }
f0c7c524
UD
482 else
483 {
484 /* The buffer doesn't have enough space, finish to build. */
485 pstr->cur_state = prev_st;
486 break;
487 }
488 }
612546c6 489 pstr->valid_len = byte_idx;
37369d1c
UD
490 pstr->valid_raw_len = src_idx;
491 return REG_NOERROR;
612546c6
UD
492}
493
494/* Skip characters until the index becomes greater than NEW_RAW_IDX.
495 Return the index. */
496
eb04c213
AZ
497static Idx
498re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
612546c6
UD
499{
500 mbstate_t prev_st;
eb04c213 501 Idx rawbuf_idx;
88764ae2 502 size_t mbclen;
4f08104c 503 wint_t wc = WEOF;
612546c6
UD
504
505 /* Skip the characters which are not necessary to check. */
37369d1c 506 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
612546c6
UD
507 rawbuf_idx < new_raw_idx;)
508 {
4f08104c 509 wchar_t wc2;
eb04c213 510 Idx remain_len = pstr->raw_len - rawbuf_idx;
612546c6 511 prev_st = pstr->cur_state;
4f08104c 512 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
b3918c7d 513 remain_len, &pstr->cur_state);
f4efbdfb
PE
514 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
515 || mbclen == 0))
1b2c2628 516 {
33e63e79
UD
517 /* We treat these cases as a single byte character. */
518 if (mbclen == 0 || remain_len == 0)
519 wc = L'\0';
520 else
521 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
1b2c2628
UD
522 mbclen = 1;
523 pstr->cur_state = prev_st;
524 }
4f08104c 525 else
eb04c213 526 wc = wc2;
612546c6
UD
527 /* Then proceed the next character. */
528 rawbuf_idx += mbclen;
529 }
4f08104c 530 *last_wc = wc;
612546c6 531 return rawbuf_idx;
3b0bdc72
UD
532}
533#endif /* RE_ENABLE_I18N */
534
1b2c2628 535/* Build the buffer PSTR->MBS, and apply the translation if we need.
612546c6
UD
536 This function is used in case of REG_ICASE. */
537
538static void
e2f55264 539build_upper_buffer (re_string_t *pstr)
3b0bdc72 540{
eb04c213 541 Idx char_idx, end_idx;
612546c6 542 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
3b0bdc72 543
612546c6 544 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
3b0bdc72 545 {
612546c6 546 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
f4efbdfb 547 if (__glibc_unlikely (pstr->trans != NULL))
37369d1c 548 ch = pstr->trans[ch];
eb04c213 549 pstr->mbs[char_idx] = toupper (ch);
3b0bdc72 550 }
612546c6 551 pstr->valid_len = char_idx;
37369d1c 552 pstr->valid_raw_len = char_idx;
3b0bdc72
UD
553}
554
612546c6 555/* Apply TRANS to the buffer in PSTR. */
3b0bdc72 556
612546c6 557static void
e2f55264 558re_string_translate_buffer (re_string_t *pstr)
3b0bdc72 559{
eb04c213 560 Idx buf_idx, end_idx;
612546c6
UD
561 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562
563 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
3b0bdc72 564 {
612546c6 565 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
37369d1c 566 pstr->mbs[buf_idx] = pstr->trans[ch];
3b0bdc72 567 }
612546c6
UD
568
569 pstr->valid_len = buf_idx;
37369d1c 570 pstr->valid_raw_len = buf_idx;
612546c6
UD
571}
572
573/* This function re-construct the buffers.
3c0fb574 574 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
612546c6
UD
575 convert to upper case in case of REG_ICASE, apply translation. */
576
577static reg_errcode_t
b41bd5bc 578__attribute_warn_unused_result__
eb04c213 579re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
612546c6 580{
eb04c213
AZ
581 Idx offset;
582
f4efbdfb 583 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
eb04c213
AZ
584 offset = idx - pstr->raw_mbs_idx;
585 else
612546c6
UD
586 {
587 /* Reset buffer. */
434d3784 588#ifdef RE_ENABLE_I18N
3c0fb574 589 if (pstr->mb_cur_max > 1)
1b2c2628 590 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
434d3784 591#endif /* RE_ENABLE_I18N */
37369d1c
UD
592 pstr->len = pstr->raw_len;
593 pstr->stop = pstr->raw_stop;
594 pstr->valid_len = 0;
595 pstr->raw_mbs_idx = 0;
596 pstr->valid_raw_len = 0;
597 pstr->offsets_needed = 0;
612546c6 598 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1b2c2628 599 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
37369d1c 600 if (!pstr->mbs_allocated)
1b2c2628 601 pstr->mbs = (unsigned char *) pstr->raw_mbs;
612546c6
UD
602 offset = idx;
603 }
604
f4efbdfb 605 if (__glibc_likely (offset != 0))
612546c6 606 {
ba40cc15 607 /* Should the already checked characters be kept? */
f4efbdfb 608 if (__glibc_likely (offset < pstr->valid_raw_len))
a334319f
UD
609 {
610 /* Yes, move them to the front of the buffer. */
0ecb606c 611#ifdef RE_ENABLE_I18N
f4efbdfb 612 if (__glibc_unlikely (pstr->offsets_needed))
ba40cc15 613 {
eb04c213 614 Idx low = 0, high = pstr->valid_len, mid;
ba40cc15
UD
615 do
616 {
617 mid = (high + low) / 2;
618 if (pstr->offsets[mid] > offset)
619 high = mid;
620 else if (pstr->offsets[mid] < offset)
621 low = mid + 1;
622 else
623 break;
624 }
625 while (low < high);
626 if (pstr->offsets[mid] < offset)
627 ++mid;
628 pstr->tip_context = re_string_context_at (pstr, mid - 1,
629 eflags);
630 /* This can be quite complicated, so handle specially
631 only the common and easy case where the character with
632 different length representation of lower and upper
633 case is present at or after offset. */
634 if (pstr->valid_len > offset
635 && mid == offset && pstr->offsets[mid] == offset)
636 {
637 memmove (pstr->wcs, pstr->wcs + offset,
638 (pstr->valid_len - offset) * sizeof (wint_t));
639 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
640 pstr->valid_len -= offset;
641 pstr->valid_raw_len -= offset;
642 for (low = 0; low < pstr->valid_len; low++)
643 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
644 }
645 else
646 {
647 /* Otherwise, just find out how long the partial multibyte
648 character at offset is and fill it with WEOF/255. */
649 pstr->len = pstr->raw_len - idx + offset;
650 pstr->stop = pstr->raw_stop - idx + offset;
651 pstr->offsets_needed = 0;
652 while (mid > 0 && pstr->offsets[mid - 1] == offset)
653 --mid;
654 while (mid < pstr->valid_len)
655 if (pstr->wcs[mid] != WEOF)
656 break;
657 else
658 ++mid;
659 if (mid == pstr->valid_len)
660 pstr->valid_len = 0;
661 else
662 {
663 pstr->valid_len = pstr->offsets[mid] - offset;
664 if (pstr->valid_len)
665 {
666 for (low = 0; low < pstr->valid_len; ++low)
667 pstr->wcs[low] = WEOF;
668 memset (pstr->mbs, 255, pstr->valid_len);
669 }
670 }
671 pstr->valid_raw_len = pstr->valid_len;
672 }
673 }
674 else
675#endif
676 {
677 pstr->tip_context = re_string_context_at (pstr, offset - 1,
678 eflags);
679#ifdef RE_ENABLE_I18N
680 if (pstr->mb_cur_max > 1)
681 memmove (pstr->wcs, pstr->wcs + offset,
682 (pstr->valid_len - offset) * sizeof (wint_t));
612546c6 683#endif /* RE_ENABLE_I18N */
f4efbdfb 684 if (__glibc_unlikely (pstr->mbs_allocated))
ba40cc15
UD
685 memmove (pstr->mbs, pstr->mbs + offset,
686 pstr->valid_len - offset);
687 pstr->valid_len -= offset;
688 pstr->valid_raw_len -= offset;
2a0356e1 689 DEBUG_ASSERT (pstr->valid_len > 0);
ba40cc15 690 }
1b2c2628 691 }
612546c6 692 else
1b2c2628 693 {
ed8ae46b 694#ifdef RE_ENABLE_I18N
1b2c2628 695 /* No, skip all characters until IDX. */
eb04c213 696 Idx prev_valid_len = pstr->valid_len;
ba40cc15 697
f4efbdfb 698 if (__glibc_unlikely (pstr->offsets_needed))
37369d1c
UD
699 {
700 pstr->len = pstr->raw_len - idx + offset;
701 pstr->stop = pstr->raw_stop - idx + offset;
702 pstr->offsets_needed = 0;
703 }
704#endif
1b2c2628 705 pstr->valid_len = 0;
3b0bdc72 706#ifdef RE_ENABLE_I18N
3c0fb574 707 if (pstr->mb_cur_max > 1)
1b2c2628 708 {
eb04c213 709 Idx wcs_idx;
3c0fb574
UD
710 wint_t wc = WEOF;
711
712 if (pstr->is_utf8)
713 {
0dae5d4e 714 const unsigned char *raw, *p, *end;
3c0fb574
UD
715
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
97fd3a30 719 end = raw + (offset - pstr->mb_cur_max);
ba40cc15
UD
720 if (end < pstr->raw_mbs)
721 end = pstr->raw_mbs;
01ed6ceb
UD
722 p = raw + offset - 1;
723#ifdef _LIBC
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
f4efbdfb 726 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
01ed6ceb
UD
727 {
728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
ba40cc15 729 /* pstr->valid_len = 0; */
01ed6ceb
UD
730 wc = (wchar_t) *p;
731 }
732 else
733#endif
734 for (; p >= end; --p)
735 if ((*p & 0xc0) != 0x80)
736 {
737 mbstate_t cur_state;
738 wchar_t wc2;
eb04c213 739 Idx mlen = raw + pstr->len - p;
01ed6ceb
UD
740 unsigned char buf[6];
741 size_t mbclen;
742
5e2b63c6 743 const unsigned char *pp = p;
f4efbdfb 744 if (__glibc_unlikely (pstr->trans != NULL))
01ed6ceb
UD
745 {
746 int i = mlen < 6 ? mlen : 6;
747 while (--i >= 0)
748 buf[i] = pstr->trans[p[i]];
5e2b63c6 749 pp = buf;
01ed6ceb
UD
750 }
751 /* XXX Don't use mbrtowc, we know which conversion
752 to use (UTF-8 -> UCS4). */
753 memset (&cur_state, 0, sizeof (cur_state));
5e2b63c6 754 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
b3918c7d 755 &cur_state);
01ed6ceb
UD
756 if (raw + offset - p <= mbclen
757 && mbclen < (size_t) -2)
758 {
759 memset (&pstr->cur_state, '\0',
760 sizeof (mbstate_t));
761 pstr->valid_len = mbclen - (raw + offset - p);
762 wc = wc2;
763 }
764 break;
765 }
3c0fb574 766 }
e40a38b3 767
3c0fb574 768 if (wc == WEOF)
97fd3a30 769 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
33e63e79
UD
770 if (wc == WEOF)
771 pstr->tip_context
ba40cc15 772 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
33e63e79 773 else
f4efbdfb 774 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
33e63e79
UD
775 && IS_WIDE_WORD_CHAR (wc))
776 ? CONTEXT_WORD
777 : ((IS_WIDE_NEWLINE (wc)
778 && pstr->newline_anchor)
779 ? CONTEXT_NEWLINE : 0));
f4efbdfb 780 if (__glibc_unlikely (pstr->valid_len))
37369d1c
UD
781 {
782 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783 pstr->wcs[wcs_idx] = WEOF;
784 if (pstr->mbs_allocated)
785 memset (pstr->mbs, 255, pstr->valid_len);
786 }
787 pstr->valid_raw_len = pstr->valid_len;
1b2c2628 788 }
1843975c 789 else
612546c6 790#endif /* RE_ENABLE_I18N */
1843975c
RM
791 {
792 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
33e63e79 793 pstr->valid_raw_len = 0;
1843975c
RM
794 if (pstr->trans)
795 c = pstr->trans[c];
56b168be
UD
796 pstr->tip_context = (bitset_contain (pstr->word_char, c)
797 ? CONTEXT_WORD
798 : ((IS_NEWLINE (c) && pstr->newline_anchor)
1843975c
RM
799 ? CONTEXT_NEWLINE : 0));
800 }
1b2c2628 801 }
f4efbdfb 802 if (!__glibc_unlikely (pstr->mbs_allocated))
37369d1c 803 pstr->mbs += offset;
3b0bdc72 804 }
612546c6
UD
805 pstr->raw_mbs_idx = idx;
806 pstr->len -= offset;
ac3d553b 807 pstr->stop -= offset;
612546c6
UD
808
809 /* Then build the buffers. */
810#ifdef RE_ENABLE_I18N
3c0fb574 811 if (pstr->mb_cur_max > 1)
3b0bdc72 812 {
612546c6 813 if (pstr->icase)
37369d1c 814 {
4f7e7f8e 815 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
f4efbdfb 816 if (__glibc_unlikely (ret != REG_NOERROR))
37369d1c
UD
817 return ret;
818 }
612546c6 819 else
1b2c2628 820 build_wcs_buffer (pstr);
3b0bdc72
UD
821 }
822 else
612546c6 823#endif /* RE_ENABLE_I18N */
f4efbdfb 824 if (__glibc_unlikely (pstr->mbs_allocated))
01ed6ceb
UD
825 {
826 if (pstr->icase)
827 build_upper_buffer (pstr);
828 else if (pstr->trans != NULL)
829 re_string_translate_buffer (pstr);
830 }
831 else
832 pstr->valid_len = pstr->len;
612546c6 833
bb677c95 834 pstr->cur_idx = 0;
3b0bdc72
UD
835 return REG_NOERROR;
836}
837
37369d1c 838static unsigned char
eb04c213
AZ
839__attribute__ ((pure))
840re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
37369d1c 841{
eb04c213
AZ
842 int ch;
843 Idx off;
37369d1c
UD
844
845 /* Handle the common (easiest) cases first. */
f4efbdfb 846 if (__glibc_likely (!pstr->mbs_allocated))
37369d1c
UD
847 return re_string_peek_byte (pstr, idx);
848
849#ifdef RE_ENABLE_I18N
850 if (pstr->mb_cur_max > 1
851 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852 return re_string_peek_byte (pstr, idx);
853#endif
854
855 off = pstr->cur_idx + idx;
856#ifdef RE_ENABLE_I18N
857 if (pstr->offsets_needed)
858 off = pstr->offsets[off];
859#endif
860
861 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
37369d1c
UD
862
863#ifdef RE_ENABLE_I18N
864 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865 this function returns CAPITAL LETTER I instead of first byte of
866 DOTLESS SMALL LETTER I. The latter would confuse the parser,
867 since peek_byte_case doesn't advance cur_idx in any way. */
868 if (pstr->offsets_needed && !isascii (ch))
869 return re_string_peek_byte (pstr, idx);
870#endif
871
872 return ch;
873}
874
875static unsigned char
e2f55264 876re_string_fetch_byte_case (re_string_t *pstr)
37369d1c 877{
f4efbdfb 878 if (__glibc_likely (!pstr->mbs_allocated))
37369d1c
UD
879 return re_string_fetch_byte (pstr);
880
881#ifdef RE_ENABLE_I18N
882 if (pstr->offsets_needed)
883 {
eb04c213
AZ
884 Idx off;
885 int ch;
c0d5034e 886
37369d1c
UD
887 /* For tr_TR.UTF-8 [[:islower:]] there is
888 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
889 in that case the whole multi-byte character and return
890 the original letter. On the other side, with
891 [[: DOTLESS SMALL LETTER I return [[:I, as doing
892 anything else would complicate things too much. */
893
894 if (!re_string_first_byte (pstr, pstr->cur_idx))
895 return re_string_fetch_byte (pstr);
896
897 off = pstr->offsets[pstr->cur_idx];
898 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
37369d1c
UD
899
900 if (! isascii (ch))
901 return re_string_fetch_byte (pstr);
902
903 re_string_skip_bytes (pstr,
904 re_string_char_size_at (pstr, pstr->cur_idx));
905 return ch;
906 }
907#endif
908
f39eef7b 909 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
37369d1c
UD
910}
911
3b0bdc72 912static void
e2f55264 913re_string_destruct (re_string_t *pstr)
3b0bdc72
UD
914{
915#ifdef RE_ENABLE_I18N
916 re_free (pstr->wcs);
37369d1c 917 re_free (pstr->offsets);
3b0bdc72 918#endif /* RE_ENABLE_I18N */
37369d1c 919 if (pstr->mbs_allocated)
612546c6 920 re_free (pstr->mbs);
3b0bdc72
UD
921}
922
923/* Return the context at IDX in INPUT. */
612546c6 924
3b0bdc72 925static unsigned int
eb04c213 926re_string_context_at (const re_string_t *input, Idx idx, int eflags)
3b0bdc72
UD
927{
928 int c;
f4efbdfb 929 if (__glibc_unlikely (idx < 0))
bb677c95
UD
930 /* In this case, we use the value stored in input->tip_context,
931 since we can't know the character in input->mbs[-1] here. */
932 return input->tip_context;
f4efbdfb 933 if (__glibc_unlikely (idx == input->len))
bb677c95
UD
934 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
a0756231 936#ifdef RE_ENABLE_I18N
3c0fb574 937 if (input->mb_cur_max > 1)
1843975c
RM
938 {
939 wint_t wc;
eb04c213 940 Idx wc_idx = idx;
1843975c
RM
941 while(input->wcs[wc_idx] == WEOF)
942 {
2a0356e1 943 DEBUG_ASSERT (wc_idx >= 0);
1843975c
RM
944 --wc_idx;
945 if (wc_idx < 0)
946 return input->tip_context;
947 }
948 wc = input->wcs[wc_idx];
f4efbdfb
PE
949 if (__glibc_unlikely (input->word_ops_used != 0)
950 && IS_WIDE_WORD_CHAR (wc))
1843975c 951 return CONTEXT_WORD;
56b168be
UD
952 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 ? CONTEXT_NEWLINE : 0);
1843975c 954 }
a0756231
RM
955 else
956#endif
957 {
958 c = re_string_byte_at (input, idx);
56b168be 959 if (bitset_contain (input->word_char, c))
a0756231 960 return CONTEXT_WORD;
56b168be 961 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
a0756231 962 }
3b0bdc72
UD
963}
964\f
965/* Functions for set operation. */
966
967static reg_errcode_t
b41bd5bc 968__attribute_warn_unused_result__
eb04c213 969re_node_set_alloc (re_node_set *set, Idx size)
3b0bdc72
UD
970{
971 set->alloc = size;
972 set->nelem = 0;
eb04c213 973 set->elems = re_malloc (Idx, size);
f4efbdfb
PE
974 if (__glibc_unlikely (set->elems == NULL)
975 && (MALLOC_0_IS_NONNULL || size != 0))
3b0bdc72
UD
976 return REG_ESPACE;
977 return REG_NOERROR;
978}
979
980static reg_errcode_t
b41bd5bc 981__attribute_warn_unused_result__
eb04c213 982re_node_set_init_1 (re_node_set *set, Idx elem)
3b0bdc72
UD
983{
984 set->alloc = 1;
985 set->nelem = 1;
eb04c213 986 set->elems = re_malloc (Idx, 1);
f4efbdfb 987 if (__glibc_unlikely (set->elems == NULL))
6291ee3c
UD
988 {
989 set->alloc = set->nelem = 0;
990 return REG_ESPACE;
991 }
3b0bdc72
UD
992 set->elems[0] = elem;
993 return REG_NOERROR;
994}
995
996static reg_errcode_t
b41bd5bc 997__attribute_warn_unused_result__
eb04c213 998re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
3b0bdc72
UD
999{
1000 set->alloc = 2;
eb04c213 1001 set->elems = re_malloc (Idx, 2);
f4efbdfb 1002 if (__glibc_unlikely (set->elems == NULL))
3b0bdc72
UD
1003 return REG_ESPACE;
1004 if (elem1 == elem2)
1005 {
1006 set->nelem = 1;
1007 set->elems[0] = elem1;
1008 }
1009 else
1010 {
1011 set->nelem = 2;
1012 if (elem1 < elem2)
1b2c2628
UD
1013 {
1014 set->elems[0] = elem1;
1015 set->elems[1] = elem2;
1016 }
3b0bdc72 1017 else
1b2c2628
UD
1018 {
1019 set->elems[0] = elem2;
1020 set->elems[1] = elem1;
1021 }
3b0bdc72
UD
1022 }
1023 return REG_NOERROR;
1024}
1025
1026static reg_errcode_t
b41bd5bc 1027__attribute_warn_unused_result__
e2f55264 1028re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
3b0bdc72
UD
1029{
1030 dest->nelem = src->nelem;
1031 if (src->nelem > 0)
1032 {
1033 dest->alloc = dest->nelem;
eb04c213 1034 dest->elems = re_malloc (Idx, dest->alloc);
f4efbdfb 1035 if (__glibc_unlikely (dest->elems == NULL))
6291ee3c
UD
1036 {
1037 dest->alloc = dest->nelem = 0;
1038 return REG_ESPACE;
1039 }
eb04c213 1040 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
3b0bdc72
UD
1041 }
1042 else
1043 re_node_set_init_empty (dest);
1044 return REG_NOERROR;
1045}
1046
a9388965
UD
1047/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1050
3b0bdc72 1051static reg_errcode_t
b41bd5bc 1052__attribute_warn_unused_result__
e2f55264
UD
1053re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054 const re_node_set *src2)
3b0bdc72 1055{
eb04c213 1056 Idx i1, i2, is, id, delta, sbase;
59e7ebcc
UD
1057 if (src1->nelem == 0 || src2->nelem == 0)
1058 return REG_NOERROR;
1059
1060 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061 conservative estimate. */
1062 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
3b0bdc72 1063 {
eb04c213
AZ
1064 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
f4efbdfb 1066 if (__glibc_unlikely (new_elems == NULL))
2da42bc0 1067 return REG_ESPACE;
59e7ebcc
UD
1068 dest->elems = new_elems;
1069 dest->alloc = new_alloc;
3b0bdc72 1070 }
3b0bdc72 1071
59e7ebcc
UD
1072 /* Find the items in the intersection of SRC1 and SRC2, and copy
1073 into the top of DEST those that are not already in DEST itself. */
1074 sbase = dest->nelem + src1->nelem + src2->nelem;
1075 i1 = src1->nelem - 1;
1076 i2 = src2->nelem - 1;
1077 id = dest->nelem - 1;
1078 for (;;)
3b0bdc72 1079 {
59e7ebcc 1080 if (src1->elems[i1] == src2->elems[i2])
1b2c2628 1081 {
59e7ebcc
UD
1082 /* Try to find the item in DEST. Maybe we could binary search? */
1083 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1084 --id;
1085
2da42bc0 1086 if (id < 0 || dest->elems[id] != src1->elems[i1])
eb04c213 1087 dest->elems[--sbase] = src1->elems[i1];
59e7ebcc
UD
1088
1089 if (--i1 < 0 || --i2 < 0)
1090 break;
1b2c2628 1091 }
59e7ebcc
UD
1092
1093 /* Lower the highest of the two items. */
1094 else if (src1->elems[i1] < src2->elems[i2])
1b2c2628 1095 {
59e7ebcc
UD
1096 if (--i2 < 0)
1097 break;
1098 }
1099 else
1100 {
1101 if (--i1 < 0)
1102 break;
1b2c2628 1103 }
3b0bdc72 1104 }
59e7ebcc
UD
1105
1106 id = dest->nelem - 1;
1107 is = dest->nelem + src1->nelem + src2->nelem - 1;
1108 delta = is - sbase + 1;
1109
1110 /* Now copy. When DELTA becomes zero, the remaining
1111 DEST elements are already in place; this is more or
1112 less the same loop that is in re_node_set_merge. */
1113 dest->nelem += delta;
1114 if (delta > 0 && id >= 0)
1115 for (;;)
1116 {
2da42bc0
UD
1117 if (dest->elems[is] > dest->elems[id])
1118 {
1119 /* Copy from the top. */
1120 dest->elems[id + delta--] = dest->elems[is--];
1121 if (delta == 0)
1122 break;
1123 }
1124 else
1125 {
1126 /* Slide from the bottom. */
1127 dest->elems[id + delta] = dest->elems[id];
1128 if (--id < 0)
1129 break;
1130 }
59e7ebcc
UD
1131 }
1132
1133 /* Copy remaining SRC elements. */
eb04c213 1134 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
59e7ebcc 1135
3b0bdc72
UD
1136 return REG_NOERROR;
1137}
1138
a9388965
UD
1139/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1141
3b0bdc72 1142static reg_errcode_t
b41bd5bc 1143__attribute_warn_unused_result__
e2f55264
UD
1144re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145 const re_node_set *src2)
3b0bdc72 1146{
eb04c213 1147 Idx i1, i2, id;
3b0bdc72
UD
1148 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149 {
1150 dest->alloc = src1->nelem + src2->nelem;
eb04c213 1151 dest->elems = re_malloc (Idx, dest->alloc);
f4efbdfb 1152 if (__glibc_unlikely (dest->elems == NULL))
1b2c2628 1153 return REG_ESPACE;
3b0bdc72
UD
1154 }
1155 else
1156 {
1157 if (src1 != NULL && src1->nelem > 0)
1b2c2628 1158 return re_node_set_init_copy (dest, src1);
3b0bdc72 1159 else if (src2 != NULL && src2->nelem > 0)
1b2c2628 1160 return re_node_set_init_copy (dest, src2);
3b0bdc72 1161 else
1b2c2628 1162 re_node_set_init_empty (dest);
3b0bdc72
UD
1163 return REG_NOERROR;
1164 }
1165 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166 {
1167 if (src1->elems[i1] > src2->elems[i2])
1b2c2628
UD
1168 {
1169 dest->elems[id++] = src2->elems[i2++];
1170 continue;
1171 }
3b0bdc72 1172 if (src1->elems[i1] == src2->elems[i2])
1b2c2628 1173 ++i2;
3b0bdc72
UD
1174 dest->elems[id++] = src1->elems[i1++];
1175 }
1176 if (i1 < src1->nelem)
1177 {
1178 memcpy (dest->elems + id, src1->elems + i1,
eb04c213 1179 (src1->nelem - i1) * sizeof (Idx));
3b0bdc72
UD
1180 id += src1->nelem - i1;
1181 }
1182 else if (i2 < src2->nelem)
1183 {
1184 memcpy (dest->elems + id, src2->elems + i2,
eb04c213 1185 (src2->nelem - i2) * sizeof (Idx));
3b0bdc72
UD
1186 id += src2->nelem - i2;
1187 }
1188 dest->nelem = id;
1189 return REG_NOERROR;
1190}
1191
a9388965
UD
1192/* Calculate the union set of the sets DEST and SRC. And store it to
1193 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1194
3b0bdc72 1195static reg_errcode_t
b41bd5bc 1196__attribute_warn_unused_result__
e2f55264 1197re_node_set_merge (re_node_set *dest, const re_node_set *src)
3b0bdc72 1198{
eb04c213 1199 Idx is, id, sbase, delta;
3b0bdc72
UD
1200 if (src == NULL || src->nelem == 0)
1201 return REG_NOERROR;
59e7ebcc 1202 if (dest->alloc < 2 * src->nelem + dest->nelem)
3b0bdc72 1203 {
eb04c213
AZ
1204 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
f4efbdfb 1206 if (__glibc_unlikely (new_buffer == NULL))
1b2c2628
UD
1207 return REG_ESPACE;
1208 dest->elems = new_buffer;
951d6408 1209 dest->alloc = new_alloc;
3b0bdc72
UD
1210 }
1211
f4efbdfb 1212 if (__glibc_unlikely (dest->nelem == 0))
3b0bdc72 1213 {
59e7ebcc 1214 dest->nelem = src->nelem;
eb04c213 1215 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
59e7ebcc
UD
1216 return REG_NOERROR;
1217 }
3b0bdc72 1218
59e7ebcc
UD
1219 /* Copy into the top of DEST the items of SRC that are not
1220 found in DEST. Maybe we could binary search in DEST? */
1221 for (sbase = dest->nelem + 2 * src->nelem,
1222 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1223 {
1224 if (dest->elems[id] == src->elems[is])
2da42bc0 1225 is--, id--;
59e7ebcc 1226 else if (dest->elems[id] < src->elems[is])
2da42bc0 1227 dest->elems[--sbase] = src->elems[is--];
59e7ebcc 1228 else /* if (dest->elems[id] > src->elems[is]) */
2da42bc0 1229 --id;
59e7ebcc 1230 }
3b0bdc72 1231
59e7ebcc
UD
1232 if (is >= 0)
1233 {
1234 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1235 sbase -= is + 1;
eb04c213 1236 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
3b0bdc72
UD
1237 }
1238
59e7ebcc
UD
1239 id = dest->nelem - 1;
1240 is = dest->nelem + 2 * src->nelem - 1;
1241 delta = is - sbase + 1;
1242 if (delta == 0)
1243 return REG_NOERROR;
1244
1245 /* Now copy. When DELTA becomes zero, the remaining
1246 DEST elements are already in place. */
1247 dest->nelem += delta;
1248 for (;;)
3b0bdc72 1249 {
59e7ebcc 1250 if (dest->elems[is] > dest->elems[id])
2da42bc0 1251 {
59e7ebcc 1252 /* Copy from the top. */
2da42bc0 1253 dest->elems[id + delta--] = dest->elems[is--];
59e7ebcc
UD
1254 if (delta == 0)
1255 break;
1256 }
1257 else
2da42bc0
UD
1258 {
1259 /* Slide from the bottom. */
1260 dest->elems[id + delta] = dest->elems[id];
59e7ebcc
UD
1261 if (--id < 0)
1262 {
1263 /* Copy remaining SRC elements. */
1264 memcpy (dest->elems, dest->elems + sbase,
eb04c213 1265 delta * sizeof (Idx));
59e7ebcc
UD
1266 break;
1267 }
1268 }
3b0bdc72 1269 }
59e7ebcc 1270
3b0bdc72
UD
1271 return REG_NOERROR;
1272}
1273
1274/* Insert the new element ELEM to the re_node_set* SET.
8503c987 1275 SET should not already have ELEM.
eb04c213 1276 Return true if successful. */
3b0bdc72 1277
eb04c213 1278static bool
b41bd5bc 1279__attribute_warn_unused_result__
eb04c213 1280re_node_set_insert (re_node_set *set, Idx elem)
3b0bdc72 1281{
eb04c213 1282 Idx idx;
8503c987
UD
1283 /* In case the set is empty. */
1284 if (set->alloc == 0)
f4efbdfb 1285 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
3b0bdc72 1286
f4efbdfb 1287 if (__glibc_unlikely (set->nelem) == 0)
3b0bdc72 1288 {
8503c987
UD
1289 /* We already guaranteed above that set->alloc != 0. */
1290 set->elems[0] = elem;
1291 ++set->nelem;
eb04c213 1292 return true;
3b0bdc72
UD
1293 }
1294
1295 /* Realloc if we need. */
8503c987 1296 if (set->alloc == set->nelem)
3b0bdc72 1297 {
eb04c213 1298 Idx *new_elems;
3b0bdc72 1299 set->alloc = set->alloc * 2;
eb04c213 1300 new_elems = re_realloc (set->elems, Idx, set->alloc);
f4efbdfb 1301 if (__glibc_unlikely (new_elems == NULL))
eb04c213 1302 return false;
2d87db5b 1303 set->elems = new_elems;
3b0bdc72 1304 }
8503c987
UD
1305
1306 /* Move the elements which follows the new element. Test the
1307 first element separately to skip a check in the inner loop. */
1308 if (elem < set->elems[0])
1309 {
59e7ebcc 1310 for (idx = set->nelem; idx > 0; idx--)
2da42bc0 1311 set->elems[idx] = set->elems[idx - 1];
8503c987 1312 }
3b0bdc72
UD
1313 else
1314 {
8503c987 1315 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
2da42bc0 1316 set->elems[idx] = set->elems[idx - 1];
3b0bdc72 1317 }
8503c987 1318
3b0bdc72
UD
1319 /* Insert the new element. */
1320 set->elems[idx] = elem;
1321 ++set->nelem;
eb04c213 1322 return true;
3b0bdc72
UD
1323}
1324
20dc2f79
UD
1325/* Insert the new element ELEM to the re_node_set* SET.
1326 SET should not already have any element greater than or equal to ELEM.
eb04c213 1327 Return true if successful. */
20dc2f79 1328
eb04c213 1329static bool
b41bd5bc 1330__attribute_warn_unused_result__
eb04c213 1331re_node_set_insert_last (re_node_set *set, Idx elem)
20dc2f79
UD
1332{
1333 /* Realloc if we need. */
1334 if (set->alloc == set->nelem)
1335 {
eb04c213 1336 Idx *new_elems;
20dc2f79 1337 set->alloc = (set->alloc + 1) * 2;
eb04c213 1338 new_elems = re_realloc (set->elems, Idx, set->alloc);
f4efbdfb 1339 if (__glibc_unlikely (new_elems == NULL))
eb04c213 1340 return false;
2d87db5b 1341 set->elems = new_elems;
20dc2f79
UD
1342 }
1343
1344 /* Insert the new element. */
1345 set->elems[set->nelem++] = elem;
eb04c213 1346 return true;
20dc2f79
UD
1347}
1348
3b0bdc72 1349/* Compare two node sets SET1 and SET2.
eb04c213 1350 Return true if SET1 and SET2 are equivalent. */
3b0bdc72 1351
eb04c213
AZ
1352static bool
1353__attribute__ ((pure))
e2f55264 1354re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
3b0bdc72 1355{
eb04c213 1356 Idx i;
3b0bdc72 1357 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
eb04c213 1358 return false;
59e7ebcc 1359 for (i = set1->nelem ; --i >= 0 ; )
3b0bdc72 1360 if (set1->elems[i] != set2->elems[i])
eb04c213
AZ
1361 return false;
1362 return true;
3b0bdc72
UD
1363}
1364
0742e48e 1365/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
3b0bdc72 1366
eb04c213
AZ
1367static Idx
1368__attribute__ ((pure))
1369re_node_set_contains (const re_node_set *set, Idx elem)
3b0bdc72 1370{
eb04c213 1371 __re_size_t idx, right, mid;
3b0bdc72
UD
1372 if (set->nelem <= 0)
1373 return 0;
1374
1375 /* Binary search the element. */
1376 idx = 0;
1377 right = set->nelem - 1;
1378 while (idx < right)
1379 {
1380 mid = (idx + right) / 2;
1381 if (set->elems[mid] < elem)
1b2c2628 1382 idx = mid + 1;
3b0bdc72 1383 else
1b2c2628 1384 right = mid;
3b0bdc72 1385 }
0742e48e 1386 return set->elems[idx] == elem ? idx + 1 : 0;
3b0bdc72
UD
1387}
1388
1389static void
eb04c213 1390re_node_set_remove_at (re_node_set *set, Idx idx)
3b0bdc72
UD
1391{
1392 if (idx < 0 || idx >= set->nelem)
1393 return;
3b0bdc72 1394 --set->nelem;
59e7ebcc
UD
1395 for (; idx < set->nelem; idx++)
1396 set->elems[idx] = set->elems[idx + 1];
3b0bdc72
UD
1397}
1398\f
1399
1400/* Add the token TOKEN to dfa->nodes, and return the index of the token.
eb04c213 1401 Or return -1 if an error occurred. */
3b0bdc72 1402
eb04c213 1403static Idx
e2f55264 1404re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
3b0bdc72 1405{
f4efbdfb 1406 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
3b0bdc72 1407 {
01ed6ceb 1408 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
eb04c213 1409 Idx *new_nexts, *new_indices;
963d8d78 1410 re_node_set *new_edests, *new_eclosures;
01ed6ceb
UD
1411 re_token_t *new_nodes;
1412
22364644
UD
1413 /* Avoid overflows in realloc. */
1414 const size_t max_object_size = MAX (sizeof (re_token_t),
1415 MAX (sizeof (re_node_set),
eb04c213 1416 sizeof (Idx)));
f4efbdfb
PE
1417 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1418 < new_nodes_alloc))
01ed6ceb 1419 return -1;
02f3550c 1420
01ed6ceb 1421 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
f4efbdfb 1422 if (__glibc_unlikely (new_nodes == NULL))
0ecb606c 1423 return -1;
2d87db5b 1424 dfa->nodes = new_nodes;
eb04c213
AZ
1425 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1426 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
02f3550c
UD
1427 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1428 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
f4efbdfb
PE
1429 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1430 || new_edests == NULL || new_eclosures == NULL))
eb04c213
AZ
1431 {
1432 re_free (new_nexts);
1433 re_free (new_indices);
1434 re_free (new_edests);
1435 re_free (new_eclosures);
1436 return -1;
1437 }
02f3550c
UD
1438 dfa->nexts = new_nexts;
1439 dfa->org_indices = new_indices;
1440 dfa->edests = new_edests;
1441 dfa->eclosures = new_eclosures;
951d6408 1442 dfa->nodes_alloc = new_nodes_alloc;
3b0bdc72
UD
1443 }
1444 dfa->nodes[dfa->nodes_len] = token;
485d775d 1445 dfa->nodes[dfa->nodes_len].constraint = 0;
02f3550c
UD
1446#ifdef RE_ENABLE_I18N
1447 dfa->nodes[dfa->nodes_len].accept_mb =
eb04c213
AZ
1448 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1449 || token.type == COMPLEX_BRACKET);
02f3550c
UD
1450#endif
1451 dfa->nexts[dfa->nodes_len] = -1;
1452 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
3b0bdc72
UD
1454 return dfa->nodes_len++;
1455}
1456
eb04c213 1457static re_hashval_t
e2f55264 1458calc_state_hash (const re_node_set *nodes, unsigned int context)
3b0bdc72 1459{
eb04c213
AZ
1460 re_hashval_t hash = nodes->nelem + context;
1461 Idx i;
3b0bdc72
UD
1462 for (i = 0 ; i < nodes->nelem ; i++)
1463 hash += nodes->elems[i];
1464 return hash;
1465}
1466
1467/* Search for the state whose node_set is equivalent to NODES.
1468 Return the pointer to the state, if we found it in the DFA.
a9388965
UD
1469 Otherwise create the new one and return it. In case of an error
1470 return NULL and set the error code in ERR.
1471 Note: - We assume NULL as the invalid state, then it is possible that
1b2c2628
UD
1472 return value is NULL and ERR is REG_NOERROR.
1473 - We never return non-NULL value in case of any errors, it is for
1474 optimization. */
a9388965 1475
e2f55264 1476static re_dfastate_t *
b41bd5bc 1477__attribute_warn_unused_result__
e2f55264
UD
1478re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479 const re_node_set *nodes)
3b0bdc72 1480{
eb04c213 1481 re_hashval_t hash;
a9388965 1482 re_dfastate_t *new_state;
3b0bdc72 1483 struct re_state_table_entry *spot;
eb04c213
AZ
1484 Idx i;
1485#if defined GCC_LINT || defined lint
1486 /* Suppress bogus uninitialized-variable warnings. */
1487 *err = REG_NOERROR;
1488#endif
f4efbdfb 1489 if (__glibc_unlikely (nodes->nelem == 0))
a9388965
UD
1490 {
1491 *err = REG_NOERROR;
1492 return NULL;
1493 }
3b0bdc72
UD
1494 hash = calc_state_hash (nodes, 0);
1495 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496
bc15410e 1497 for (i = 0 ; i < spot->num ; i++)
3b0bdc72 1498 {
bc15410e
UD
1499 re_dfastate_t *state = spot->array[i];
1500 if (hash != state->hash)
1b2c2628 1501 continue;
bc15410e 1502 if (re_node_set_compare (&state->nodes, nodes))
1b2c2628 1503 return state;
3b0bdc72 1504 }
3b0bdc72
UD
1505
1506 /* There are no appropriate state in the dfa, create the new one. */
a9388965 1507 new_state = create_ci_newstate (dfa, nodes, hash);
f4efbdfb 1508 if (__glibc_unlikely (new_state == NULL))
2d87db5b
UD
1509 *err = REG_ESPACE;
1510
1511 return new_state;
3b0bdc72
UD
1512}
1513
1514/* Search for the state whose node_set is equivalent to NODES and
1515 whose context is equivalent to CONTEXT.
1516 Return the pointer to the state, if we found it in the DFA.
a9388965
UD
1517 Otherwise create the new one and return it. In case of an error
1518 return NULL and set the error code in ERR.
1519 Note: - We assume NULL as the invalid state, then it is possible that
1b2c2628
UD
1520 return value is NULL and ERR is REG_NOERROR.
1521 - We never return non-NULL value in case of any errors, it is for
1522 optimization. */
a9388965 1523
db26cb75 1524static re_dfastate_t *
b41bd5bc 1525__attribute_warn_unused_result__
e2f55264
UD
1526re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1527 const re_node_set *nodes, unsigned int context)
3b0bdc72 1528{
eb04c213 1529 re_hashval_t hash;
a9388965 1530 re_dfastate_t *new_state;
3b0bdc72 1531 struct re_state_table_entry *spot;
eb04c213
AZ
1532 Idx i;
1533#if defined GCC_LINT || defined lint
1534 /* Suppress bogus uninitialized-variable warnings. */
1535 *err = REG_NOERROR;
1536#endif
3b0bdc72 1537 if (nodes->nelem == 0)
a9388965
UD
1538 {
1539 *err = REG_NOERROR;
1540 return NULL;
1541 }
3b0bdc72
UD
1542 hash = calc_state_hash (nodes, context);
1543 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544
bc15410e 1545 for (i = 0 ; i < spot->num ; i++)
3b0bdc72 1546 {
bc15410e 1547 re_dfastate_t *state = spot->array[i];
a0a8461c
UD
1548 if (state->hash == hash
1549 && state->context == context
1550 && re_node_set_compare (state->entrance_nodes, nodes))
1b2c2628 1551 return state;
3b0bdc72 1552 }
eb04c213 1553 /* There are no appropriate state in 'dfa', create the new one. */
a9388965 1554 new_state = create_cd_newstate (dfa, nodes, context, hash);
f4efbdfb 1555 if (__glibc_unlikely (new_state == NULL))
2d87db5b
UD
1556 *err = REG_ESPACE;
1557
1558 return new_state;
3b0bdc72
UD
1559}
1560
5cf1ec52
UD
1561/* Finish initialization of the new state NEWSTATE, and using its hash value
1562 HASH put in the appropriate bucket of DFA's state table. Return value
1563 indicates the error code if failed. */
a9388965 1564
5cf1ec52 1565static reg_errcode_t
2da42bc0 1566__attribute_warn_unused_result__
e2f55264 1567register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
eb04c213 1568 re_hashval_t hash)
3b0bdc72 1569{
5cf1ec52 1570 struct re_state_table_entry *spot;
1b2c2628 1571 reg_errcode_t err;
eb04c213 1572 Idx i;
5cf1ec52
UD
1573
1574 newstate->hash = hash;
1575 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
f4efbdfb 1576 if (__glibc_unlikely (err != REG_NOERROR))
5cf1ec52
UD
1577 return REG_ESPACE;
1578 for (i = 0; i < newstate->nodes.nelem; i++)
1b2c2628 1579 {
eb04c213 1580 Idx elem = newstate->nodes.elems[i];
5cf1ec52 1581 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
eb04c213 1582 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
2da42bc0 1583 return REG_ESPACE;
1b2c2628 1584 }
3b0bdc72 1585
3b0bdc72 1586 spot = dfa->state_table + (hash & dfa->state_hash_mask);
f4efbdfb 1587 if (__glibc_unlikely (spot->alloc <= spot->num))
3b0bdc72 1588 {
eb04c213 1589 Idx new_alloc = 2 * spot->num + 2;
951d6408
UD
1590 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591 new_alloc);
f4efbdfb 1592 if (__glibc_unlikely (new_array == NULL))
1b2c2628
UD
1593 return REG_ESPACE;
1594 spot->array = new_array;
951d6408 1595 spot->alloc = new_alloc;
3b0bdc72 1596 }
bc15410e 1597 spot->array[spot->num++] = newstate;
a9388965 1598 return REG_NOERROR;
3b0bdc72
UD
1599}
1600
db26cb75
UD
1601static void
1602free_state (re_dfastate_t *state)
1603{
1604 re_node_set_free (&state->non_eps_nodes);
1605 re_node_set_free (&state->inveclosure);
1606 if (state->entrance_nodes != &state->nodes)
1607 {
1608 re_node_set_free (state->entrance_nodes);
1609 re_free (state->entrance_nodes);
1610 }
1611 re_node_set_free (&state->nodes);
1612 re_free (state->word_trtable);
1613 re_free (state->trtable);
1614 re_free (state);
1615}
1616
5069ff32 1617/* Create the new state which is independent of contexts.
a9388965
UD
1618 Return the new state if succeeded, otherwise return NULL. */
1619
3b0bdc72 1620static re_dfastate_t *
b41bd5bc 1621__attribute_warn_unused_result__
e2f55264 1622create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
eb04c213 1623 re_hashval_t hash)
3b0bdc72 1624{
eb04c213 1625 Idx i;
a9388965 1626 reg_errcode_t err;
3b0bdc72 1627 re_dfastate_t *newstate;
5cf1ec52
UD
1628
1629 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
f4efbdfb 1630 if (__glibc_unlikely (newstate == NULL))
a9388965 1631 return NULL;
5cf1ec52 1632 err = re_node_set_init_copy (&newstate->nodes, nodes);
f4efbdfb 1633 if (__glibc_unlikely (err != REG_NOERROR))
5cf1ec52
UD
1634 {
1635 re_free (newstate);
1636 return NULL;
1637 }
3b0bdc72 1638
5cf1ec52 1639 newstate->entrance_nodes = &newstate->nodes;
3b0bdc72
UD
1640 for (i = 0 ; i < nodes->nelem ; i++)
1641 {
1642 re_token_t *node = dfa->nodes + nodes->elems[i];
1643 re_token_type_t type = node->type;
485d775d 1644 if (type == CHARACTER && !node->constraint)
1b2c2628 1645 continue;
02f3550c
UD
1646#ifdef RE_ENABLE_I18N
1647 newstate->accept_mb |= node->accept_mb;
1648#endif /* RE_ENABLE_I18N */
3b0bdc72
UD
1649
1650 /* If the state has the halt node, the state is a halt state. */
02f3550c 1651 if (type == END_OF_RE)
1b2c2628 1652 newstate->halt = 1;
3b0bdc72 1653 else if (type == OP_BACK_REF)
1b2c2628 1654 newstate->has_backref = 1;
485d775d 1655 else if (type == ANCHOR || node->constraint)
1b2c2628 1656 newstate->has_constraint = 1;
3b0bdc72 1657 }
a9388965 1658 err = register_state (dfa, newstate, hash);
f4efbdfb 1659 if (__glibc_unlikely (err != REG_NOERROR))
1b2c2628
UD
1660 {
1661 free_state (newstate);
1662 newstate = NULL;
1663 }
1664 return newstate;
3b0bdc72
UD
1665}
1666
a9388965
UD
1667/* Create the new state which is depend on the context CONTEXT.
1668 Return the new state if succeeded, otherwise return NULL. */
1669
3b0bdc72 1670static re_dfastate_t *
b41bd5bc 1671__attribute_warn_unused_result__
e2f55264 1672create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
eb04c213 1673 unsigned int context, re_hashval_t hash)
3b0bdc72 1674{
eb04c213 1675 Idx i, nctx_nodes = 0;
a9388965 1676 reg_errcode_t err;
3b0bdc72
UD
1677 re_dfastate_t *newstate;
1678
5cf1ec52 1679 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
f4efbdfb 1680 if (__glibc_unlikely (newstate == NULL))
a9388965 1681 return NULL;
5cf1ec52 1682 err = re_node_set_init_copy (&newstate->nodes, nodes);
f4efbdfb 1683 if (__glibc_unlikely (err != REG_NOERROR))
5cf1ec52
UD
1684 {
1685 re_free (newstate);
1686 return NULL;
1687 }
1688
3b0bdc72
UD
1689 newstate->context = context;
1690 newstate->entrance_nodes = &newstate->nodes;
1691
1692 for (i = 0 ; i < nodes->nelem ; i++)
1693 {
3b0bdc72
UD
1694 re_token_t *node = dfa->nodes + nodes->elems[i];
1695 re_token_type_t type = node->type;
0caca71a 1696 unsigned int constraint = node->constraint;
3b0bdc72 1697
485d775d 1698 if (type == CHARACTER && !constraint)
1b2c2628 1699 continue;
a334319f 1700#ifdef RE_ENABLE_I18N
02f3550c 1701 newstate->accept_mb |= node->accept_mb;
a334319f 1702#endif /* RE_ENABLE_I18N */
02f3550c
UD
1703
1704 /* If the state has the halt node, the state is a halt state. */
1705 if (type == END_OF_RE)
1706 newstate->halt = 1;
3b0bdc72 1707 else if (type == OP_BACK_REF)
1b2c2628 1708 newstate->has_backref = 1;
3b0bdc72
UD
1709
1710 if (constraint)
1b2c2628
UD
1711 {
1712 if (newstate->entrance_nodes == &newstate->nodes)
1713 {
8a80ee5e
PE
1714 re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1715 if (__glibc_unlikely (entrance_nodes == NULL))
1b2c2628
UD
1716 {
1717 free_state (newstate);
1718 return NULL;
1719 }
8a80ee5e 1720 newstate->entrance_nodes = entrance_nodes;
2da42bc0
UD
1721 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1722 != REG_NOERROR)
8a80ee5e
PE
1723 {
1724 free_state (newstate);
1725 return NULL;
1726 }
1b2c2628
UD
1727 nctx_nodes = 0;
1728 newstate->has_constraint = 1;
1729 }
1730
1731 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1732 {
1733 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1734 ++nctx_nodes;
1735 }
1736 }
3b0bdc72 1737 }
a9388965 1738 err = register_state (dfa, newstate, hash);
f4efbdfb 1739 if (__glibc_unlikely (err != REG_NOERROR))
1b2c2628
UD
1740 {
1741 free_state (newstate);
1742 newstate = NULL;
1743 }
1744 return newstate;
1745}