posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          int node, int root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static int fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         int *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       int *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        int non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 const char __re_error_msgid[] attribute_hidden =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 const size_t __re_error_msgid_idx[] attribute_hidden =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 {
 219   reg_errcode_t ret;
 220
 221   /* And GNU code determines whether or not to get register information
 222      by passing null for the REGS argument to re_match, etc., not by
 223      setting no_sub, unless RE_NO_SUB is set.  */
 224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 225
 226   /* Match anchors at newline.  */
 227   bufp->newline_anchor = 1;
 228
 229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 230
 231   if (!ret)
 232     return NULL;
 233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 234 }
 235 #ifdef _LIBC
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237 #endif
 238
 239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 240    also be assigned to arbitrarily: each pattern buffer stores its own
 241    syntax, so it can be changed between regex compilations.  */
 242 /* This has no initializer because initialized variables in Emacs
 243    become read-only after dumping.  */
 244 reg_syntax_t re_syntax_options;
 245
 246
 247 /* Specify the precise syntax of regexps for compilation.  This provides
 248    for compatibility for various utilities which historically have
 249    different, incompatible syntaxes.
 250
 251    The argument SYNTAX is a bit mask comprised of the various bits
 252    defined in regex.h.  We return the old syntax.  */
 253
 254 reg_syntax_t
 255 re_set_syntax (syntax)
 256     reg_syntax_t syntax;
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (bufp)
 269     struct re_pattern_buffer *bufp;
 270 {
 271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 272   char *fastmap = bufp->fastmap;
 273
 274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 276   if (dfa->init_state != dfa->init_state_word)
 277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 278   if (dfa->init_state != dfa->init_state_nl)
 279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 280   if (dfa->init_state != dfa->init_state_begbuf)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 282   bufp->fastmap_accurate = 1;
 283   return 0;
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 287 #endif
 288
 289 static inline void
 290 __attribute ((always_inline))
 291 re_set_fastmap (char *fastmap, int icase, int ch)
 292 {
 293   fastmap[ch] = 1;
 294   if (icase)
 295     fastmap[tolower (ch)] = 1;
 296 }
 297
 298 /* Helper function for re_compile_fastmap.
 299    Compile fastmap for the initial_state INIT_STATE.  */
 300
 301 static void
 302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 303                          char *fastmap)
 304 {
 305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 306   int node_cnt;
 307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 309     {
 310       int node = init_state->nodes.elems[node_cnt];
 311       re_token_type_t type = dfa->nodes[node].type;
 312
 313       if (type == CHARACTER)
 314         {
 315           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 316 #ifdef RE_ENABLE_I18N
 317           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 318             {
 319               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 331                              &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, 0, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 354           int i;
 355
 356 # ifdef _LIBC
 357           /* See if we have to try all bytes which start multiple collation
 358              elements.
 359              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 360                   collation element, and don't catch 'b' since 'b' is
 361                   the only collation element which starts from 'b' (and
 362                   it is caught by SIMPLE_BRACKET).  */
 363               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 364                   && (cset->ncoll_syms || cset->nranges))
 365                 {
 366                   const int32_t *table = (const int32_t *)
 367                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 368                   for (i = 0; i < SBC_MAX; ++i)
 369                     if (table[i] < 0)
 370                       re_set_fastmap (fastmap, icase, i);
 371                 }
 372 # endif /* _LIBC */
 373
 374           /* See if we have to start the match at all multibyte characters,
 375              i.e. where we would not find an invalid sequence.  This only
 376              applies to multibyte character sets; for single byte character
 377              sets, the SIMPLE_BRACKET again suffices.  */
 378           if (dfa->mb_cur_max > 1
 379               && (cset->nchar_classes || cset->non_match || cset->nranges
 380 # ifdef _LIBC
 381                   || cset->nequiv_classes
 382 # endif /* _LIBC */
 383                  ))
 384             {
 385               unsigned char c = 0;
 386               do
 387                 {
 388                   mbstate_t mbs;
 389                   memset (&mbs, 0, sizeof (mbs));
 390                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 391                     re_set_fastmap (fastmap, false, (int) c);
 392                 }
 393               while (++c != 0);
 394             }
 395
 396           else
 397             {
 398               /* ... Else catch all bytes which can start the mbchars.  */
 399               for (i = 0; i < cset->nmbchars; ++i)
 400                 {
 401                   char buf[256];
 402                   mbstate_t state;
 403                   memset (&state, '\0', sizeof (state));
 404                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 405                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 406                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 407                     {
 408                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 409                           != (size_t) -1)
 410                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 411                     }
 412                 }
 413             }
 414         }
 415 #endif /* RE_ENABLE_I18N */
 416       else if (type == OP_PERIOD
 417 #ifdef RE_ENABLE_I18N
 418                || type == OP_UTF8_PERIOD
 419 #endif /* RE_ENABLE_I18N */
 420                || type == END_OF_RE)
 421         {
 422           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 423           if (type == END_OF_RE)
 424             bufp->can_be_null = 1;
 425           return;
 426         }
 427     }
 428 }
 429 \f
 430 /* Entry point for POSIX code.  */
 431 /* regcomp takes a regular expression as a string and compiles it.
 432
 433    PREG is a regex_t *.  We do not expect any fields to be initialized,
 434    since POSIX says we shouldn't.  Thus, we set
 435
 436      `buffer' to the compiled pattern;
 437      `used' to the length of the compiled pattern;
 438      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 440        RE_SYNTAX_POSIX_BASIC;
 441      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 442      `fastmap' to an allocated space for the fastmap;
 443      `fastmap_accurate' to zero;
 444      `re_nsub' to the number of subexpressions in PATTERN.
 445
 446    PATTERN is the address of the pattern string.
 447
 448    CFLAGS is a series of bits which affect compilation.
 449
 450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 451      use POSIX basic syntax.
 452
 453      If REG_NEWLINE is set, then . and [^...] don't match newline.
 454      Also, regexec will try a match beginning after every newline.
 455
 456      If REG_ICASE is set, then we considers upper- and lowercase
 457      versions of letters to be equivalent when matching.
 458
 459      If REG_NOSUB is set, then when PREG is passed to regexec, that
 460      routine will report only success or failure, and nothing about the
 461      registers.
 462
 463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 464    the return codes and their meanings.)  */
 465
 466 int
 467 regcomp (preg, pattern, cflags)
 468     regex_t *__restrict preg;
 469     const char *__restrict pattern;
 470     int cflags;
 471 {
 472   reg_errcode_t ret;
 473   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 474                          : RE_SYNTAX_POSIX_BASIC);
 475
 476   preg->buffer = NULL;
 477   preg->allocated = 0;
 478   preg->used = 0;
 479
 480   /* Try to allocate space for the fastmap.  */
 481   preg->fastmap = re_malloc (char, SBC_MAX);
 482   if (BE (preg->fastmap == NULL, 0))
 483     return REG_ESPACE;
 484
 485   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 486
 487   /* If REG_NEWLINE is set, newlines are treated differently.  */
 488   if (cflags & REG_NEWLINE)
 489     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 490       syntax &= ~RE_DOT_NEWLINE;
 491       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 492       /* It also changes the matching behavior.  */
 493       preg->newline_anchor = 1;
 494     }
 495   else
 496     preg->newline_anchor = 0;
 497   preg->no_sub = !!(cflags & REG_NOSUB);
 498   preg->translate = NULL;
 499
 500   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 501
 502   /* POSIX doesn't distinguish between an unmatched open-group and an
 503      unmatched close-group: both are REG_EPAREN.  */
 504   if (ret == REG_ERPAREN)
 505     ret = REG_EPAREN;
 506
 507   /* We have already checked preg->fastmap != NULL.  */
 508   if (BE (ret == REG_NOERROR, 1))
 509     /* Compute the fastmap now, since regexec cannot modify the pattern
 510        buffer.  This function never fails in this implementation.  */
 511     (void) re_compile_fastmap (preg);
 512   else
 513     {
 514       /* Some error occurred while compiling the expression.  */
 515       re_free (preg->fastmap);
 516       preg->fastmap = NULL;
 517     }
 518
 519   return (int) ret;
 520 }
 521 #ifdef _LIBC
 522 weak_alias (__regcomp, regcomp)
 523 #endif
 524
 525 /* Returns a message corresponding to an error code, ERRCODE, returned
 526    from either regcomp or regexec.   We don't use PREG here.  */
 527
 528 size_t
 529 regerror (errcode, preg, errbuf, errbuf_size)
 530     int errcode;
 531     const regex_t *__restrict preg;
 532     char *__restrict errbuf;
 533     size_t errbuf_size;
 534 {
 535   const char *msg;
 536   size_t msg_size;
 537
 538   if (BE (errcode < 0
 539           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 540                                / sizeof (__re_error_msgid_idx[0])), 0))
 541     /* Only error codes returned by the rest of the code should be passed
 542        to this routine.  If we are given anything else, or if other regex
 543        code generates an invalid error code, then the program has a bug.
 544        Dump core so we can fix it.  */
 545     abort ();
 546
 547   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 548
 549   msg_size = strlen (msg) + 1; /* Includes the null.  */
 550
 551   if (BE (errbuf_size != 0, 1))
 552     {
 553       if (BE (msg_size > errbuf_size, 0))
 554         {
 555 #if defined HAVE_MEMPCPY || defined _LIBC
 556           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 557 #else
 558           memcpy (errbuf, msg, errbuf_size - 1);
 559           errbuf[errbuf_size - 1] = 0;
 560 #endif
 561         }
 562       else
 563         memcpy (errbuf, msg, msg_size);
 564     }
 565
 566   return msg_size;
 567 }
 568 #ifdef _LIBC
 569 weak_alias (__regerror, regerror)
 570 #endif
 571
 572
 573 #ifdef RE_ENABLE_I18N
 574 /* This static array is used for the map to single-byte characters when
 575    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 576    it the same all the time.  UTF-8 is the preferred encoding so this is
 577    a worthwhile optimization.  */
 578 static const bitset_t utf8_sb_map =
 579 {
 580   /* Set the first 128 bits.  */
 581   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 582 };
 583 #endif
 584
 585
 586 static void
 587 free_dfa_content (re_dfa_t *dfa)
 588 {
 589   int i, j;
 590
 591   if (dfa->nodes)
 592     for (i = 0; i < dfa->nodes_len; ++i)
 593       free_token (dfa->nodes + i);
 594   re_free (dfa->nexts);
 595   for (i = 0; i < dfa->nodes_len; ++i)
 596     {
 597       if (dfa->eclosures != NULL)
 598         re_node_set_free (dfa->eclosures + i);
 599       if (dfa->inveclosures != NULL)
 600         re_node_set_free (dfa->inveclosures + i);
 601       if (dfa->edests != NULL)
 602         re_node_set_free (dfa->edests + i);
 603     }
 604   re_free (dfa->edests);
 605   re_free (dfa->eclosures);
 606   re_free (dfa->inveclosures);
 607   re_free (dfa->nodes);
 608
 609   if (dfa->state_table)
 610     for (i = 0; i <= dfa->state_hash_mask; ++i)
 611       {
 612         struct re_state_table_entry *entry = dfa->state_table + i;
 613         for (j = 0; j < entry->num; ++j)
 614           {
 615             re_dfastate_t *state = entry->array[j];
 616             free_state (state);
 617           }
 618         re_free (entry->array);
 619       }
 620   re_free (dfa->state_table);
 621 #ifdef RE_ENABLE_I18N
 622   if (dfa->sb_char != utf8_sb_map)
 623     re_free (dfa->sb_char);
 624 #endif
 625   re_free (dfa->subexp_map);
 626 #ifdef DEBUG
 627   re_free (dfa->re_str);
 628 #endif
 629
 630   re_free (dfa);
 631 }
 632
 633
 634 /* Free dynamically allocated space used by PREG.  */
 635
 636 void
 637 regfree (preg)
 638     regex_t *preg;
 639 {
 640   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 641   if (BE (dfa != NULL, 1))
 642     free_dfa_content (dfa);
 643   preg->buffer = NULL;
 644   preg->allocated = 0;
 645
 646   re_free (preg->fastmap);
 647   preg->fastmap = NULL;
 648
 649   re_free (preg->translate);
 650   preg->translate = NULL;
 651 }
 652 #ifdef _LIBC
 653 weak_alias (__regfree, regfree)
 654 #endif
 655 \f
 656 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 657    them unless specifically requested.  */
 658
 659 #if defined _REGEX_RE_COMP || defined _LIBC
 660
 661 /* BSD has one and only one pattern buffer.  */
 662 static struct re_pattern_buffer re_comp_buf;
 663
 664 char *
 665 # ifdef _LIBC
 666 /* Make these definitions weak in libc, so POSIX programs can redefine
 667    these names if they don't use our functions, and still use
 668    regcomp/regexec above without link errors.  */
 669 weak_function
 670 # endif
 671 re_comp (s)
 672      const char *s;
 673 {
 674   reg_errcode_t ret;
 675   char *fastmap;
 676
 677   if (!s)
 678     {
 679       if (!re_comp_buf.buffer)
 680         return gettext ("No previous regular expression");
 681       return 0;
 682     }
 683
 684   if (re_comp_buf.buffer)
 685     {
 686       fastmap = re_comp_buf.fastmap;
 687       re_comp_buf.fastmap = NULL;
 688       __regfree (&re_comp_buf);
 689       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 690       re_comp_buf.fastmap = fastmap;
 691     }
 692
 693   if (re_comp_buf.fastmap == NULL)
 694     {
 695       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 696       if (re_comp_buf.fastmap == NULL)
 697         return (char *) gettext (__re_error_msgid
 698                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 699     }
 700
 701   /* Since `re_exec' always passes NULL for the `regs' argument, we
 702      don't need to initialize the pattern buffer fields which affect it.  */
 703
 704   /* Match anchors at newlines.  */
 705   re_comp_buf.newline_anchor = 1;
 706
 707   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 708
 709   if (!ret)
 710     return NULL;
 711
 712   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 713   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 714 }
 715
 716 #ifdef _LIBC
 717 libc_freeres_fn (free_mem)
 718 {
 719   __regfree (&re_comp_buf);
 720 }
 721 #endif
 722
 723 #endif /* _REGEX_RE_COMP */
 724 \f
 725 /* Internal entry point.
 726    Compile the regular expression PATTERN, whose length is LENGTH.
 727    SYNTAX indicate regular expression's syntax.  */
 728
 729 static reg_errcode_t
 730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 731                      reg_syntax_t syntax)
 732 {
 733   reg_errcode_t err = REG_NOERROR;
 734   re_dfa_t *dfa;
 735   re_string_t regexp;
 736
 737   /* Initialize the pattern buffer.  */
 738   preg->fastmap_accurate = 0;
 739   preg->syntax = syntax;
 740   preg->not_bol = preg->not_eol = 0;
 741   preg->used = 0;
 742   preg->re_nsub = 0;
 743   preg->can_be_null = 0;
 744   preg->regs_allocated = REGS_UNALLOCATED;
 745
 746   /* Initialize the dfa.  */
 747   dfa = (re_dfa_t *) preg->buffer;
 748   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 749     {
 750       /* If zero allocated, but buffer is non-null, try to realloc
 751          enough space.  This loses if buffer's address is bogus, but
 752          that is the user's responsibility.  If ->buffer is NULL this
 753          is a simple allocation.  */
 754       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 755       if (dfa == NULL)
 756         return REG_ESPACE;
 757       preg->allocated = sizeof (re_dfa_t);
 758       preg->buffer = (unsigned char *) dfa;
 759     }
 760   preg->used = sizeof (re_dfa_t);
 761
 762   err = init_dfa (dfa, length);
 763   if (BE (err != REG_NOERROR, 0))
 764     {
 765       free_dfa_content (dfa);
 766       preg->buffer = NULL;
 767       preg->allocated = 0;
 768       return err;
 769     }
 770 #ifdef DEBUG
 771   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 772   dfa->re_str = re_malloc (char, length + 1);
 773   strncpy (dfa->re_str, pattern, length + 1);
 774 #endif
 775
 776   __libc_lock_init (dfa->lock);
 777
 778   err = re_string_construct (&regexp, pattern, length, preg->translate,
 779                              syntax & RE_ICASE, dfa);
 780   if (BE (err != REG_NOERROR, 0))
 781     {
 782     re_compile_internal_free_return:
 783       free_workarea_compile (preg);
 784       re_string_destruct (&regexp);
 785       free_dfa_content (dfa);
 786       preg->buffer = NULL;
 787       preg->allocated = 0;
 788       return err;
 789     }
 790
 791   /* Parse the regular expression, and build a structure tree.  */
 792   preg->re_nsub = 0;
 793   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 794   if (BE (dfa->str_tree == NULL, 0))
 795     goto re_compile_internal_free_return;
 796
 797   /* Analyze the tree and create the nfa.  */
 798   err = analyze (preg);
 799   if (BE (err != REG_NOERROR, 0))
 800     goto re_compile_internal_free_return;
 801
 802 #ifdef RE_ENABLE_I18N
 803   /* If possible, do searching in single byte encoding to speed things up.  */
 804   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 805     optimize_utf8 (dfa);
 806 #endif
 807
 808   /* Then create the initial state of the dfa.  */
 809   err = create_initial_state (dfa);
 810
 811   /* Release work areas.  */
 812   free_workarea_compile (preg);
 813   re_string_destruct (&regexp);
 814
 815   if (BE (err != REG_NOERROR, 0))
 816     {
 817       free_dfa_content (dfa);
 818       preg->buffer = NULL;
 819       preg->allocated = 0;
 820     }
 821
 822   return err;
 823 }
 824
 825 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 826    as the initial length of some arrays.  */
 827
 828 static reg_errcode_t
 829 init_dfa (re_dfa_t *dfa, size_t pat_len)
 830 {
 831   unsigned int table_size;
 832 #ifndef _LIBC
 833   char *codeset_name;
 834 #endif
 835
 836   memset (dfa, '\0', sizeof (re_dfa_t));
 837
 838   /* Force allocation of str_tree_storage the first time.  */
 839   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 840
 841   /* Avoid overflows.  */
 842   if (pat_len == SIZE_MAX)
 843     return REG_ESPACE;
 844
 845   dfa->nodes_alloc = pat_len + 1;
 846   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 847
 848   /*  table_size = 2 ^ ceil(log pat_len) */
 849   for (table_size = 1; ; table_size <<= 1)
 850     if (table_size > pat_len)
 851       break;
 852
 853   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 854   dfa->state_hash_mask = table_size - 1;
 855
 856   dfa->mb_cur_max = MB_CUR_MAX;
 857 #ifdef _LIBC
 858   if (dfa->mb_cur_max == 6
 859       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 860     dfa->is_utf8 = 1;
 861   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 862                        != 0);
 863 #else
 864 # ifdef HAVE_LANGINFO_CODESET
 865   codeset_name = nl_langinfo (CODESET);
 866 # else
 867   codeset_name = getenv ("LC_ALL");
 868   if (codeset_name == NULL || codeset_name[0] == '\0')
 869     codeset_name = getenv ("LC_CTYPE");
 870   if (codeset_name == NULL || codeset_name[0] == '\0')
 871     codeset_name = getenv ("LANG");
 872   if (codeset_name == NULL)
 873     codeset_name = "";
 874   else if (strchr (codeset_name, '.') !=  NULL)
 875     codeset_name = strchr (codeset_name, '.') + 1;
 876 # endif
 877
 878   if (strcasecmp (codeset_name, "UTF-8") == 0
 879       || strcasecmp (codeset_name, "UTF8") == 0)
 880     dfa->is_utf8 = 1;
 881
 882   /* We check exhaustively in the loop below if this charset is a
 883      superset of ASCII.  */
 884   dfa->map_notascii = 0;
 885 #endif
 886
 887 #ifdef RE_ENABLE_I18N
 888   if (dfa->mb_cur_max > 1)
 889     {
 890       if (dfa->is_utf8)
 891         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 892       else
 893         {
 894           int i, j, ch;
 895
 896           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 897           if (BE (dfa->sb_char == NULL, 0))
 898             return REG_ESPACE;
 899
 900           /* Set the bits corresponding to single byte chars.  */
 901           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 902             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 903               {
 904                 wint_t wch = __btowc (ch);
 905                 if (wch != WEOF)
 906                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 907 # ifndef _LIBC
 908                 if (isascii (ch) && wch != ch)
 909                   dfa->map_notascii = 1;
 910 # endif
 911               }
 912         }
 913     }
 914 #endif
 915
 916   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 917     return REG_ESPACE;
 918   return REG_NOERROR;
 919 }
 920
 921 /* Initialize WORD_CHAR table, which indicate which character is
 922    "word".  In this case "word" means that it is the word construction
 923    character used by some operators like "\<", "\>", etc.  */
 924
 925 static void
 926 internal_function
 927 init_word_char (re_dfa_t *dfa)
 928 {
 929   int i, j, ch;
 930   dfa->word_ops_used = 1;
 931   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 932     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 933       if (isalnum (ch) || ch == '_')
 934         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 935 }
 936
 937 /* Free the work area which are only used while compiling.  */
 938
 939 static void
 940 free_workarea_compile (regex_t *preg)
 941 {
 942   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 943   bin_tree_storage_t *storage, *next;
 944   for (storage = dfa->str_tree_storage; storage; storage = next)
 945     {
 946       next = storage->next;
 947       re_free (storage);
 948     }
 949   dfa->str_tree_storage = NULL;
 950   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 951   dfa->str_tree = NULL;
 952   re_free (dfa->org_indices);
 953   dfa->org_indices = NULL;
 954 }
 955
 956 /* Create initial states for all contexts.  */
 957
 958 static reg_errcode_t
 959 create_initial_state (re_dfa_t *dfa)
 960 {
 961   int first, i;
 962   reg_errcode_t err;
 963   re_node_set init_nodes;
 964
 965   /* Initial states have the epsilon closure of the node which is
 966      the first node of the regular expression.  */
 967   first = dfa->str_tree->first->node_idx;
 968   dfa->init_node = first;
 969   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 970   if (BE (err != REG_NOERROR, 0))
 971     return err;
 972
 973   /* The back-references which are in initial states can epsilon transit,
 974      since in this case all of the subexpressions can be null.
 975      Then we add epsilon closures of the nodes which are the next nodes of
 976      the back-references.  */
 977   if (dfa->nbackref > 0)
 978     for (i = 0; i < init_nodes.nelem; ++i)
 979       {
 980         int node_idx = init_nodes.elems[i];
 981         re_token_type_t type = dfa->nodes[node_idx].type;
 982
 983         int clexp_idx;
 984         if (type != OP_BACK_REF)
 985           continue;
 986         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 987           {
 988             re_token_t *clexp_node;
 989             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 990             if (clexp_node->type == OP_CLOSE_SUBEXP
 991                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 992               break;
 993           }
 994         if (clexp_idx == init_nodes.nelem)
 995           continue;
 996
 997         if (type == OP_BACK_REF)
 998           {
 999             int dest_idx = dfa->edests[node_idx].elems[0];
1000             if (!re_node_set_contains (&init_nodes, dest_idx))
1001               {
1002                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1003                 i = 0;
1004               }
1005           }
1006       }
1007
1008   /* It must be the first time to invoke acquire_state.  */
1009   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1010   /* We don't check ERR here, since the initial state must not be NULL.  */
1011   if (BE (dfa->init_state == NULL, 0))
1012     return err;
1013   if (dfa->init_state->has_constraint)
1014     {
1015       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1016                                                        CONTEXT_WORD);
1017       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1018                                                      CONTEXT_NEWLINE);
1019       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1020                                                          &init_nodes,
1021                                                          CONTEXT_NEWLINE
1022                                                          | CONTEXT_BEGBUF);
1023       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1024               || dfa->init_state_begbuf == NULL, 0))
1025         return err;
1026     }
1027   else
1028     dfa->init_state_word = dfa->init_state_nl
1029       = dfa->init_state_begbuf = dfa->init_state;
1030
1031   re_node_set_free (&init_nodes);
1032   return REG_NOERROR;
1033 }
1034 \f
1035 #ifdef RE_ENABLE_I18N
1036 /* If it is possible to do searching in single byte encoding instead of UTF-8
1037    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1038    DFA nodes where needed.  */
1039
1040 static void
1041 optimize_utf8 (re_dfa_t *dfa)
1042 {
1043   int node, i, mb_chars = 0, has_period = 0;
1044
1045   for (node = 0; node < dfa->nodes_len; ++node)
1046     switch (dfa->nodes[node].type)
1047       {
1048       case CHARACTER:
1049         if (dfa->nodes[node].opr.c >= 0x80)
1050           mb_chars = 1;
1051         break;
1052       case ANCHOR:
1053         switch (dfa->nodes[node].opr.ctx_type)
1054           {
1055           case LINE_FIRST:
1056           case LINE_LAST:
1057           case BUF_FIRST:
1058           case BUF_LAST:
1059             break;
1060           default:
1061             /* Word anchors etc. cannot be handled.  It's okay to test
1062                opr.ctx_type since constraints (for all DFA nodes) are
1063                created by ORing one or more opr.ctx_type values.  */
1064             return;
1065           }
1066         break;
1067       case OP_PERIOD:
1068         has_period = 1;
1069         break;
1070       case OP_BACK_REF:
1071       case OP_ALT:
1072       case END_OF_RE:
1073       case OP_DUP_ASTERISK:
1074       case OP_OPEN_SUBEXP:
1075       case OP_CLOSE_SUBEXP:
1076         break;
1077       case COMPLEX_BRACKET:
1078         return;
1079       case SIMPLE_BRACKET:
1080         /* Just double check.  The non-ASCII range starts at 0x80.  */
1081         assert (0x80 % BITSET_WORD_BITS == 0);
1082         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1083           if (dfa->nodes[node].opr.sbcset[i])
1084             return;
1085         break;
1086       default:
1087         abort ();
1088       }
1089
1090   if (mb_chars || has_period)
1091     for (node = 0; node < dfa->nodes_len; ++node)
1092       {
1093         if (dfa->nodes[node].type == CHARACTER
1094             && dfa->nodes[node].opr.c >= 0x80)
1095           dfa->nodes[node].mb_partial = 0;
1096         else if (dfa->nodes[node].type == OP_PERIOD)
1097           dfa->nodes[node].type = OP_UTF8_PERIOD;
1098       }
1099
1100   /* The search can be in single byte locale.  */
1101   dfa->mb_cur_max = 1;
1102   dfa->is_utf8 = 0;
1103   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1104 }
1105 #endif
1106 \f
1107 /* Analyze the structure tree, and calculate "first", "next", "edest",
1108    "eclosure", and "inveclosure".  */
1109
1110 static reg_errcode_t
1111 analyze (regex_t *preg)
1112 {
1113   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1114   reg_errcode_t ret;
1115
1116   /* Allocate arrays.  */
1117   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1118   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1119   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1120   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1121   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1122           || dfa->eclosures == NULL, 0))
1123     return REG_ESPACE;
1124
1125   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1126   if (dfa->subexp_map != NULL)
1127     {
1128       int i;
1129       for (i = 0; i < preg->re_nsub; i++)
1130         dfa->subexp_map[i] = i;
1131       preorder (dfa->str_tree, optimize_subexps, dfa);
1132       for (i = 0; i < preg->re_nsub; i++)
1133         if (dfa->subexp_map[i] != i)
1134           break;
1135       if (i == preg->re_nsub)
1136         {
1137           free (dfa->subexp_map);
1138           dfa->subexp_map = NULL;
1139         }
1140     }
1141
1142   ret = postorder (dfa->str_tree, lower_subexps, preg);
1143   if (BE (ret != REG_NOERROR, 0))
1144     return ret;
1145   ret = postorder (dfa->str_tree, calc_first, dfa);
1146   if (BE (ret != REG_NOERROR, 0))
1147     return ret;
1148   preorder (dfa->str_tree, calc_next, dfa);
1149   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1150   if (BE (ret != REG_NOERROR, 0))
1151     return ret;
1152   ret = calc_eclosure (dfa);
1153   if (BE (ret != REG_NOERROR, 0))
1154     return ret;
1155
1156   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1157      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1158   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1159       || dfa->nbackref)
1160     {
1161       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1162       if (BE (dfa->inveclosures == NULL, 0))
1163         return REG_ESPACE;
1164       ret = calc_inveclosure (dfa);
1165     }
1166
1167   return ret;
1168 }
1169
1170 /* Our parse trees are very unbalanced, so we cannot use a stack to
1171    implement parse tree visits.  Instead, we use parent pointers and
1172    some hairy code in these two functions.  */
1173 static reg_errcode_t
1174 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1175            void *extra)
1176 {
1177   bin_tree_t *node, *prev;
1178
1179   for (node = root; ; )
1180     {
1181       /* Descend down the tree, preferably to the left (or to the right
1182          if that's the only child).  */
1183       while (node->left || node->right)
1184         if (node->left)
1185           node = node->left;
1186         else
1187           node = node->right;
1188
1189       do
1190         {
1191           reg_errcode_t err = fn (extra, node);
1192           if (BE (err != REG_NOERROR, 0))
1193             return err;
1194           if (node->parent == NULL)
1195             return REG_NOERROR;
1196           prev = node;
1197           node = node->parent;
1198         }
1199       /* Go up while we have a node that is reached from the right.  */
1200       while (node->right == prev || node->right == NULL);
1201       node = node->right;
1202     }
1203 }
1204
1205 static reg_errcode_t
1206 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1207           void *extra)
1208 {
1209   bin_tree_t *node;
1210
1211   for (node = root; ; )
1212     {
1213       reg_errcode_t err = fn (extra, node);
1214       if (BE (err != REG_NOERROR, 0))
1215         return err;
1216
1217       /* Go to the left node, or up and to the right.  */
1218       if (node->left)
1219         node = node->left;
1220       else
1221         {
1222           bin_tree_t *prev = NULL;
1223           while (node->right == prev || node->right == NULL)
1224             {
1225               prev = node;
1226               node = node->parent;
1227               if (!node)
1228                 return REG_NOERROR;
1229             }
1230           node = node->right;
1231         }
1232     }
1233 }
1234
1235 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1236    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1237    backreferences as well.  Requires a preorder visit.  */
1238 static reg_errcode_t
1239 optimize_subexps (void *extra, bin_tree_t *node)
1240 {
1241   re_dfa_t *dfa = (re_dfa_t *) extra;
1242
1243   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1244     {
1245       int idx = node->token.opr.idx;
1246       node->token.opr.idx = dfa->subexp_map[idx];
1247       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1248     }
1249
1250   else if (node->token.type == SUBEXP
1251            && node->left && node->left->token.type == SUBEXP)
1252     {
1253       int other_idx = node->left->token.opr.idx;
1254
1255       node->left = node->left->left;
1256       if (node->left)
1257         node->left->parent = node;
1258
1259       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1260       if (other_idx < BITSET_WORD_BITS)
1261           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1262     }
1263
1264   return REG_NOERROR;
1265 }
1266
1267 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1268    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1269 static reg_errcode_t
1270 lower_subexps (void *extra, bin_tree_t *node)
1271 {
1272   regex_t *preg = (regex_t *) extra;
1273   reg_errcode_t err = REG_NOERROR;
1274
1275   if (node->left && node->left->token.type == SUBEXP)
1276     {
1277       node->left = lower_subexp (&err, preg, node->left);
1278       if (node->left)
1279         node->left->parent = node;
1280     }
1281   if (node->right && node->right->token.type == SUBEXP)
1282     {
1283       node->right = lower_subexp (&err, preg, node->right);
1284       if (node->right)
1285         node->right->parent = node;
1286     }
1287
1288   return err;
1289 }
1290
1291 static bin_tree_t *
1292 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1293 {
1294   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1295   bin_tree_t *body = node->left;
1296   bin_tree_t *op, *cls, *tree1, *tree;
1297
1298   if (preg->no_sub
1299       /* We do not optimize empty subexpressions, because otherwise we may
1300          have bad CONCAT nodes with NULL children.  This is obviously not
1301          very common, so we do not lose much.  An example that triggers
1302          this case is the sed "script" /\(\)/x.  */
1303       && node->left != NULL
1304       && (node->token.opr.idx >= BITSET_WORD_BITS
1305           || !(dfa->used_bkref_map
1306                & ((bitset_word_t) 1 << node->token.opr.idx))))
1307     return node->left;
1308
1309   /* Convert the SUBEXP node to the concatenation of an
1310      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1311   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1312   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1313   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1314   tree = create_tree (dfa, op, tree1, CONCAT);
1315   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1316     {
1317       *err = REG_ESPACE;
1318       return NULL;
1319     }
1320
1321   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1322   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1323   return tree;
1324 }
1325
1326 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1327    nodes.  Requires a postorder visit.  */
1328 static reg_errcode_t
1329 calc_first (void *extra, bin_tree_t *node)
1330 {
1331   re_dfa_t *dfa = (re_dfa_t *) extra;
1332   if (node->token.type == CONCAT)
1333     {
1334       node->first = node->left->first;
1335       node->node_idx = node->left->node_idx;
1336     }
1337   else
1338     {
1339       node->first = node;
1340       node->node_idx = re_dfa_add_node (dfa, node->token);
1341       if (BE (node->node_idx == -1, 0))
1342         return REG_ESPACE;
1343       if (node->token.type == ANCHOR)
1344         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1345     }
1346   return REG_NOERROR;
1347 }
1348
1349 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1350 static reg_errcode_t
1351 calc_next (void *extra, bin_tree_t *node)
1352 {
1353   switch (node->token.type)
1354     {
1355     case OP_DUP_ASTERISK:
1356       node->left->next = node;
1357       break;
1358     case CONCAT:
1359       node->left->next = node->right->first;
1360       node->right->next = node->next;
1361       break;
1362     default:
1363       if (node->left)
1364         node->left->next = node->next;
1365       if (node->right)
1366         node->right->next = node->next;
1367       break;
1368     }
1369   return REG_NOERROR;
1370 }
1371
1372 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1373 static reg_errcode_t
1374 link_nfa_nodes (void *extra, bin_tree_t *node)
1375 {
1376   re_dfa_t *dfa = (re_dfa_t *) extra;
1377   int idx = node->node_idx;
1378   reg_errcode_t err = REG_NOERROR;
1379
1380   switch (node->token.type)
1381     {
1382     case CONCAT:
1383       break;
1384
1385     case END_OF_RE:
1386       assert (node->next == NULL);
1387       break;
1388
1389     case OP_DUP_ASTERISK:
1390     case OP_ALT:
1391       {
1392         int left, right;
1393         dfa->has_plural_match = 1;
1394         if (node->left != NULL)
1395           left = node->left->first->node_idx;
1396         else
1397           left = node->next->node_idx;
1398         if (node->right != NULL)
1399           right = node->right->first->node_idx;
1400         else
1401           right = node->next->node_idx;
1402         assert (left > -1);
1403         assert (right > -1);
1404         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1405       }
1406       break;
1407
1408     case ANCHOR:
1409     case OP_OPEN_SUBEXP:
1410     case OP_CLOSE_SUBEXP:
1411       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1412       break;
1413
1414     case OP_BACK_REF:
1415       dfa->nexts[idx] = node->next->node_idx;
1416       if (node->token.type == OP_BACK_REF)
1417         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1418       break;
1419
1420     default:
1421       assert (!IS_EPSILON_NODE (node->token.type));
1422       dfa->nexts[idx] = node->next->node_idx;
1423       break;
1424     }
1425
1426   return err;
1427 }
1428
1429 /* Duplicate the epsilon closure of the node ROOT_NODE.
1430    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1431    to their own constraint.  */
1432
1433 static reg_errcode_t
1434 internal_function
1435 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1436                         int root_node, unsigned int init_constraint)
1437 {
1438   int org_node, clone_node, ret;
1439   unsigned int constraint = init_constraint;
1440   for (org_node = top_org_node, clone_node = top_clone_node;;)
1441     {
1442       int org_dest, clone_dest;
1443       if (dfa->nodes[org_node].type == OP_BACK_REF)
1444         {
1445           /* If the back reference epsilon-transit, its destination must
1446              also have the constraint.  Then duplicate the epsilon closure
1447              of the destination of the back reference, and store it in
1448              edests of the back reference.  */
1449           org_dest = dfa->nexts[org_node];
1450           re_node_set_empty (dfa->edests + clone_node);
1451           clone_dest = duplicate_node (dfa, org_dest, constraint);
1452           if (BE (clone_dest == -1, 0))
1453             return REG_ESPACE;
1454           dfa->nexts[clone_node] = dfa->nexts[org_node];
1455           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1456           if (BE (ret < 0, 0))
1457             return REG_ESPACE;
1458         }
1459       else if (dfa->edests[org_node].nelem == 0)
1460         {
1461           /* In case of the node can't epsilon-transit, don't duplicate the
1462              destination and store the original destination as the
1463              destination of the node.  */
1464           dfa->nexts[clone_node] = dfa->nexts[org_node];
1465           break;
1466         }
1467       else if (dfa->edests[org_node].nelem == 1)
1468         {
1469           /* In case of the node can epsilon-transit, and it has only one
1470              destination.  */
1471           org_dest = dfa->edests[org_node].elems[0];
1472           re_node_set_empty (dfa->edests + clone_node);
1473           /* If the node is root_node itself, it means the epsilon clsoure
1474              has a loop.   Then tie it to the destination of the root_node.  */
1475           if (org_node == root_node && clone_node != org_node)
1476             {
1477               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1478               if (BE (ret < 0, 0))
1479                 return REG_ESPACE;
1480               break;
1481             }
1482           /* In case of the node has another constraint, add it.  */
1483           constraint |= dfa->nodes[org_node].constraint;
1484           clone_dest = duplicate_node (dfa, org_dest, constraint);
1485           if (BE (clone_dest == -1, 0))
1486             return REG_ESPACE;
1487           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488           if (BE (ret < 0, 0))
1489             return REG_ESPACE;
1490         }
1491       else /* dfa->edests[org_node].nelem == 2 */
1492         {
1493           /* In case of the node can epsilon-transit, and it has two
1494              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1495           org_dest = dfa->edests[org_node].elems[0];
1496           re_node_set_empty (dfa->edests + clone_node);
1497           /* Search for a duplicated node which satisfies the constraint.  */
1498           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1499           if (clone_dest == -1)
1500             {
1501               /* There is no such duplicated node, create a new one.  */
1502               reg_errcode_t err;
1503               clone_dest = duplicate_node (dfa, org_dest, constraint);
1504               if (BE (clone_dest == -1, 0))
1505                 return REG_ESPACE;
1506               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1507               if (BE (ret < 0, 0))
1508                 return REG_ESPACE;
1509               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1510                                             root_node, constraint);
1511               if (BE (err != REG_NOERROR, 0))
1512                 return err;
1513             }
1514           else
1515             {
1516               /* There is a duplicated node which satisfies the constraint,
1517                  use it to avoid infinite loop.  */
1518               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519               if (BE (ret < 0, 0))
1520                 return REG_ESPACE;
1521             }
1522
1523           org_dest = dfa->edests[org_node].elems[1];
1524           clone_dest = duplicate_node (dfa, org_dest, constraint);
1525           if (BE (clone_dest == -1, 0))
1526             return REG_ESPACE;
1527           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528           if (BE (ret < 0, 0))
1529             return REG_ESPACE;
1530         }
1531       org_node = org_dest;
1532       clone_node = clone_dest;
1533     }
1534   return REG_NOERROR;
1535 }
1536
1537 /* Search for a node which is duplicated from the node ORG_NODE, and
1538    satisfies the constraint CONSTRAINT.  */
1539
1540 static int
1541 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1542                         unsigned int constraint)
1543 {
1544   int idx;
1545   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1546     {
1547       if (org_node == dfa->org_indices[idx]
1548           && constraint == dfa->nodes[idx].constraint)
1549         return idx; /* Found.  */
1550     }
1551   return -1; /* Not found.  */
1552 }
1553
1554 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1555    Return the index of the new node, or -1 if insufficient storage is
1556    available.  */
1557
1558 static int
1559 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1560 {
1561   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1562   if (BE (dup_idx != -1, 1))
1563     {
1564       dfa->nodes[dup_idx].constraint = constraint;
1565       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1566       dfa->nodes[dup_idx].duplicated = 1;
1567
1568       /* Store the index of the original node.  */
1569       dfa->org_indices[dup_idx] = org_idx;
1570     }
1571   return dup_idx;
1572 }
1573
1574 static reg_errcode_t
1575 calc_inveclosure (re_dfa_t *dfa)
1576 {
1577   int src, idx, ret;
1578   for (idx = 0; idx < dfa->nodes_len; ++idx)
1579     re_node_set_init_empty (dfa->inveclosures + idx);
1580
1581   for (src = 0; src < dfa->nodes_len; ++src)
1582     {
1583       int *elems = dfa->eclosures[src].elems;
1584       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1585         {
1586           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1587           if (BE (ret == -1, 0))
1588             return REG_ESPACE;
1589         }
1590     }
1591
1592   return REG_NOERROR;
1593 }
1594
1595 /* Calculate "eclosure" for all the node in DFA.  */
1596
1597 static reg_errcode_t
1598 calc_eclosure (re_dfa_t *dfa)
1599 {
1600   int node_idx, incomplete;
1601 #ifdef DEBUG
1602   assert (dfa->nodes_len > 0);
1603 #endif
1604   incomplete = 0;
1605   /* For each nodes, calculate epsilon closure.  */
1606   for (node_idx = 0; ; ++node_idx)
1607     {
1608       reg_errcode_t err;
1609       re_node_set eclosure_elem;
1610       if (node_idx == dfa->nodes_len)
1611         {
1612           if (!incomplete)
1613             break;
1614           incomplete = 0;
1615           node_idx = 0;
1616         }
1617
1618 #ifdef DEBUG
1619       assert (dfa->eclosures[node_idx].nelem != -1);
1620 #endif
1621
1622       /* If we have already calculated, skip it.  */
1623       if (dfa->eclosures[node_idx].nelem != 0)
1624         continue;
1625       /* Calculate epsilon closure of `node_idx'.  */
1626       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1627       if (BE (err != REG_NOERROR, 0))
1628         return err;
1629
1630       if (dfa->eclosures[node_idx].nelem == 0)
1631         {
1632           incomplete = 1;
1633           re_node_set_free (&eclosure_elem);
1634         }
1635     }
1636   return REG_NOERROR;
1637 }
1638
1639 /* Calculate epsilon closure of NODE.  */
1640
1641 static reg_errcode_t
1642 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1643 {
1644   reg_errcode_t err;
1645   int i;
1646   re_node_set eclosure;
1647   int ret;
1648   int incomplete = 0;
1649   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1650   if (BE (err != REG_NOERROR, 0))
1651     return err;
1652
1653   /* This indicates that we are calculating this node now.
1654      We reference this value to avoid infinite loop.  */
1655   dfa->eclosures[node].nelem = -1;
1656
1657   /* If the current node has constraints, duplicate all nodes
1658      since they must inherit the constraints.  */
1659   if (dfa->nodes[node].constraint
1660       && dfa->edests[node].nelem
1661       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1662     {
1663       err = duplicate_node_closure (dfa, node, node, node,
1664                                     dfa->nodes[node].constraint);
1665       if (BE (err != REG_NOERROR, 0))
1666         return err;
1667     }
1668
1669   /* Expand each epsilon destination nodes.  */
1670   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1671     for (i = 0; i < dfa->edests[node].nelem; ++i)
1672       {
1673         re_node_set eclosure_elem;
1674         int edest = dfa->edests[node].elems[i];
1675         /* If calculating the epsilon closure of `edest' is in progress,
1676            return intermediate result.  */
1677         if (dfa->eclosures[edest].nelem == -1)
1678           {
1679             incomplete = 1;
1680             continue;
1681           }
1682         /* If we haven't calculated the epsilon closure of `edest' yet,
1683            calculate now. Otherwise use calculated epsilon closure.  */
1684         if (dfa->eclosures[edest].nelem == 0)
1685           {
1686             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1687             if (BE (err != REG_NOERROR, 0))
1688               return err;
1689           }
1690         else
1691           eclosure_elem = dfa->eclosures[edest];
1692         /* Merge the epsilon closure of `edest'.  */
1693         re_node_set_merge (&eclosure, &eclosure_elem);
1694         /* If the epsilon closure of `edest' is incomplete,
1695            the epsilon closure of this node is also incomplete.  */
1696         if (dfa->eclosures[edest].nelem == 0)
1697           {
1698             incomplete = 1;
1699             re_node_set_free (&eclosure_elem);
1700           }
1701       }
1702
1703   /* An epsilon closure includes itself.  */
1704   ret = re_node_set_insert (&eclosure, node);
1705   if (BE (ret < 0, 0))
1706     return REG_ESPACE;
1707   if (incomplete && !root)
1708     dfa->eclosures[node].nelem = 0;
1709   else
1710     dfa->eclosures[node] = eclosure;
1711   *new_set = eclosure;
1712   return REG_NOERROR;
1713 }
1714 \f
1715 /* Functions for token which are used in the parser.  */
1716
1717 /* Fetch a token from INPUT.
1718    We must not use this function inside bracket expressions.  */
1719
1720 static void
1721 internal_function
1722 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1723 {
1724   re_string_skip_bytes (input, peek_token (result, input, syntax));
1725 }
1726
1727 /* Peek a token from INPUT, and return the length of the token.
1728    We must not use this function inside bracket expressions.  */
1729
1730 static int
1731 internal_function
1732 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1733 {
1734   unsigned char c;
1735
1736   if (re_string_eoi (input))
1737     {
1738       token->type = END_OF_RE;
1739       return 0;
1740     }
1741
1742   c = re_string_peek_byte (input, 0);
1743   token->opr.c = c;
1744
1745   token->word_char = 0;
1746 #ifdef RE_ENABLE_I18N
1747   token->mb_partial = 0;
1748   if (input->mb_cur_max > 1 &&
1749       !re_string_first_byte (input, re_string_cur_idx (input)))
1750     {
1751       token->type = CHARACTER;
1752       token->mb_partial = 1;
1753       return 1;
1754     }
1755 #endif
1756   if (c == '\\')
1757     {
1758       unsigned char c2;
1759       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1760         {
1761           token->type = BACK_SLASH;
1762           return 1;
1763         }
1764
1765       c2 = re_string_peek_byte_case (input, 1);
1766       token->opr.c = c2;
1767       token->type = CHARACTER;
1768 #ifdef RE_ENABLE_I18N
1769       if (input->mb_cur_max > 1)
1770         {
1771           wint_t wc = re_string_wchar_at (input,
1772                                           re_string_cur_idx (input) + 1);
1773           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1774         }
1775       else
1776 #endif
1777         token->word_char = IS_WORD_CHAR (c2) != 0;
1778
1779       switch (c2)
1780         {
1781         case '|':
1782           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1783             token->type = OP_ALT;
1784           break;
1785         case '1': case '2': case '3': case '4': case '5':
1786         case '6': case '7': case '8': case '9':
1787           if (!(syntax & RE_NO_BK_REFS))
1788             {
1789               token->type = OP_BACK_REF;
1790               token->opr.idx = c2 - '1';
1791             }
1792           break;
1793         case '<':
1794           if (!(syntax & RE_NO_GNU_OPS))
1795             {
1796               token->type = ANCHOR;
1797               token->opr.ctx_type = WORD_FIRST;
1798             }
1799           break;
1800         case '>':
1801           if (!(syntax & RE_NO_GNU_OPS))
1802             {
1803               token->type = ANCHOR;
1804               token->opr.ctx_type = WORD_LAST;
1805             }
1806           break;
1807         case 'b':
1808           if (!(syntax & RE_NO_GNU_OPS))
1809             {
1810               token->type = ANCHOR;
1811               token->opr.ctx_type = WORD_DELIM;
1812             }
1813           break;
1814         case 'B':
1815           if (!(syntax & RE_NO_GNU_OPS))
1816             {
1817               token->type = ANCHOR;
1818               token->opr.ctx_type = NOT_WORD_DELIM;
1819             }
1820           break;
1821         case 'w':
1822           if (!(syntax & RE_NO_GNU_OPS))
1823             token->type = OP_WORD;
1824           break;
1825         case 'W':
1826           if (!(syntax & RE_NO_GNU_OPS))
1827             token->type = OP_NOTWORD;
1828           break;
1829         case 's':
1830           if (!(syntax & RE_NO_GNU_OPS))
1831             token->type = OP_SPACE;
1832           break;
1833         case 'S':
1834           if (!(syntax & RE_NO_GNU_OPS))
1835             token->type = OP_NOTSPACE;
1836           break;
1837         case '`':
1838           if (!(syntax & RE_NO_GNU_OPS))
1839             {
1840               token->type = ANCHOR;
1841               token->opr.ctx_type = BUF_FIRST;
1842             }
1843           break;
1844         case '\'':
1845           if (!(syntax & RE_NO_GNU_OPS))
1846             {
1847               token->type = ANCHOR;
1848               token->opr.ctx_type = BUF_LAST;
1849             }
1850           break;
1851         case '(':
1852           if (!(syntax & RE_NO_BK_PARENS))
1853             token->type = OP_OPEN_SUBEXP;
1854           break;
1855         case ')':
1856           if (!(syntax & RE_NO_BK_PARENS))
1857             token->type = OP_CLOSE_SUBEXP;
1858           break;
1859         case '+':
1860           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1861             token->type = OP_DUP_PLUS;
1862           break;
1863         case '?':
1864           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1865             token->type = OP_DUP_QUESTION;
1866           break;
1867         case '{':
1868           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1869             token->type = OP_OPEN_DUP_NUM;
1870           break;
1871         case '}':
1872           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1873             token->type = OP_CLOSE_DUP_NUM;
1874           break;
1875         default:
1876           break;
1877         }
1878       return 2;
1879     }
1880
1881   token->type = CHARACTER;
1882 #ifdef RE_ENABLE_I18N
1883   if (input->mb_cur_max > 1)
1884     {
1885       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1886       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1887     }
1888   else
1889 #endif
1890     token->word_char = IS_WORD_CHAR (token->opr.c);
1891
1892   switch (c)
1893     {
1894     case '\n':
1895       if (syntax & RE_NEWLINE_ALT)
1896         token->type = OP_ALT;
1897       break;
1898     case '|':
1899       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1900         token->type = OP_ALT;
1901       break;
1902     case '*':
1903       token->type = OP_DUP_ASTERISK;
1904       break;
1905     case '+':
1906       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1907         token->type = OP_DUP_PLUS;
1908       break;
1909     case '?':
1910       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1911         token->type = OP_DUP_QUESTION;
1912       break;
1913     case '{':
1914       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1915         token->type = OP_OPEN_DUP_NUM;
1916       break;
1917     case '}':
1918       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1919         token->type = OP_CLOSE_DUP_NUM;
1920       break;
1921     case '(':
1922       if (syntax & RE_NO_BK_PARENS)
1923         token->type = OP_OPEN_SUBEXP;
1924       break;
1925     case ')':
1926       if (syntax & RE_NO_BK_PARENS)
1927         token->type = OP_CLOSE_SUBEXP;
1928       break;
1929     case '[':
1930       token->type = OP_OPEN_BRACKET;
1931       break;
1932     case '.':
1933       token->type = OP_PERIOD;
1934       break;
1935     case '^':
1936       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1937           re_string_cur_idx (input) != 0)
1938         {
1939           char prev = re_string_peek_byte (input, -1);
1940           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1941             break;
1942         }
1943       token->type = ANCHOR;
1944       token->opr.ctx_type = LINE_FIRST;
1945       break;
1946     case '$':
1947       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1948           re_string_cur_idx (input) + 1 != re_string_length (input))
1949         {
1950           re_token_t next;
1951           re_string_skip_bytes (input, 1);
1952           peek_token (&next, input, syntax);
1953           re_string_skip_bytes (input, -1);
1954           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1955             break;
1956         }
1957       token->type = ANCHOR;
1958       token->opr.ctx_type = LINE_LAST;
1959       break;
1960     default:
1961       break;
1962     }
1963   return 1;
1964 }
1965
1966 /* Peek a token from INPUT, and return the length of the token.
1967    We must not use this function out of bracket expressions.  */
1968
1969 static int
1970 internal_function
1971 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1972 {
1973   unsigned char c;
1974   if (re_string_eoi (input))
1975     {
1976       token->type = END_OF_RE;
1977       return 0;
1978     }
1979   c = re_string_peek_byte (input, 0);
1980   token->opr.c = c;
1981
1982 #ifdef RE_ENABLE_I18N
1983   if (input->mb_cur_max > 1 &&
1984       !re_string_first_byte (input, re_string_cur_idx (input)))
1985     {
1986       token->type = CHARACTER;
1987       return 1;
1988     }
1989 #endif /* RE_ENABLE_I18N */
1990
1991   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1992       && re_string_cur_idx (input) + 1 < re_string_length (input))
1993     {
1994       /* In this case, '\' escape a character.  */
1995       unsigned char c2;
1996       re_string_skip_bytes (input, 1);
1997       c2 = re_string_peek_byte (input, 0);
1998       token->opr.c = c2;
1999       token->type = CHARACTER;
2000       return 1;
2001     }
2002   if (c == '[') /* '[' is a special char in a bracket exps.  */
2003     {
2004       unsigned char c2;
2005       int token_len;
2006       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2007         c2 = re_string_peek_byte (input, 1);
2008       else
2009         c2 = 0;
2010       token->opr.c = c2;
2011       token_len = 2;
2012       switch (c2)
2013         {
2014         case '.':
2015           token->type = OP_OPEN_COLL_ELEM;
2016           break;
2017         case '=':
2018           token->type = OP_OPEN_EQUIV_CLASS;
2019           break;
2020         case ':':
2021           if (syntax & RE_CHAR_CLASSES)
2022             {
2023               token->type = OP_OPEN_CHAR_CLASS;
2024               break;
2025             }
2026           /* else fall through.  */
2027         default:
2028           token->type = CHARACTER;
2029           token->opr.c = c;
2030           token_len = 1;
2031           break;
2032         }
2033       return token_len;
2034     }
2035   switch (c)
2036     {
2037     case '-':
2038       token->type = OP_CHARSET_RANGE;
2039       break;
2040     case ']':
2041       token->type = OP_CLOSE_BRACKET;
2042       break;
2043     case '^':
2044       token->type = OP_NON_MATCH_LIST;
2045       break;
2046     default:
2047       token->type = CHARACTER;
2048     }
2049   return 1;
2050 }
2051 \f
2052 /* Functions for parser.  */
2053
2054 /* Entry point of the parser.
2055    Parse the regular expression REGEXP and return the structure tree.
2056    If an error is occured, ERR is set by error code, and return NULL.
2057    This function build the following tree, from regular expression <reg_exp>:
2058            CAT
2059            / \
2060           /   \
2061    <reg_exp>  EOR
2062
2063    CAT means concatenation.
2064    EOR means end of regular expression.  */
2065
2066 static bin_tree_t *
2067 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2068        reg_errcode_t *err)
2069 {
2070   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2071   bin_tree_t *tree, *eor, *root;
2072   re_token_t current_token;
2073   dfa->syntax = syntax;
2074   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2075   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2076   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2077     return NULL;
2078   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2079   if (tree != NULL)
2080     root = create_tree (dfa, tree, eor, CONCAT);
2081   else
2082     root = eor;
2083   if (BE (eor == NULL || root == NULL, 0))
2084     {
2085       *err = REG_ESPACE;
2086       return NULL;
2087     }
2088   return root;
2089 }
2090
2091 /* This function build the following tree, from regular expression
2092    <branch1>|<branch2>:
2093            ALT
2094            / \
2095           /   \
2096    <branch1> <branch2>
2097
2098    ALT means alternative, which represents the operator `|'.  */
2099
2100 static bin_tree_t *
2101 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2102                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2103 {
2104   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2105   bin_tree_t *tree, *branch = NULL;
2106   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2107   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2108     return NULL;
2109
2110   while (token->type == OP_ALT)
2111     {
2112       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2113       if (token->type != OP_ALT && token->type != END_OF_RE
2114           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2115         {
2116           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2117           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2118             return NULL;
2119         }
2120       else
2121         branch = NULL;
2122       tree = create_tree (dfa, tree, branch, OP_ALT);
2123       if (BE (tree == NULL, 0))
2124         {
2125           *err = REG_ESPACE;
2126           return NULL;
2127         }
2128     }
2129   return tree;
2130 }
2131
2132 /* This function build the following tree, from regular expression
2133    <exp1><exp2>:
2134         CAT
2135         / \
2136        /   \
2137    <exp1> <exp2>
2138
2139    CAT means concatenation.  */
2140
2141 static bin_tree_t *
2142 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2144 {
2145   bin_tree_t *tree, *exp;
2146   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2148   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149     return NULL;
2150
2151   while (token->type != OP_ALT && token->type != END_OF_RE
2152          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2153     {
2154       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2155       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2156         {
2157           return NULL;
2158         }
2159       if (tree != NULL && exp != NULL)
2160         {
2161           tree = create_tree (dfa, tree, exp, CONCAT);
2162           if (tree == NULL)
2163             {
2164               *err = REG_ESPACE;
2165               return NULL;
2166             }
2167         }
2168       else if (tree == NULL)
2169         tree = exp;
2170       /* Otherwise exp == NULL, we don't need to create new tree.  */
2171     }
2172   return tree;
2173 }
2174
2175 /* This function build the following tree, from regular expression a*:
2176          *
2177          |
2178          a
2179 */
2180
2181 static bin_tree_t *
2182 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2183                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2184 {
2185   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2186   bin_tree_t *tree;
2187   switch (token->type)
2188     {
2189     case CHARACTER:
2190       tree = create_token_tree (dfa, NULL, NULL, token);
2191       if (BE (tree == NULL, 0))
2192         {
2193           *err = REG_ESPACE;
2194           return NULL;
2195         }
2196 #ifdef RE_ENABLE_I18N
2197       if (dfa->mb_cur_max > 1)
2198         {
2199           while (!re_string_eoi (regexp)
2200                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2201             {
2202               bin_tree_t *mbc_remain;
2203               fetch_token (token, regexp, syntax);
2204               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2205               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2206               if (BE (mbc_remain == NULL || tree == NULL, 0))
2207                 {
2208                   *err = REG_ESPACE;
2209                   return NULL;
2210                 }
2211             }
2212         }
2213 #endif
2214       break;
2215     case OP_OPEN_SUBEXP:
2216       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2217       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2218         return NULL;
2219       break;
2220     case OP_OPEN_BRACKET:
2221       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2222       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223         return NULL;
2224       break;
2225     case OP_BACK_REF:
2226       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2227         {
2228           *err = REG_ESUBREG;
2229           return NULL;
2230         }
2231       dfa->used_bkref_map |= 1 << token->opr.idx;
2232       tree = create_token_tree (dfa, NULL, NULL, token);
2233       if (BE (tree == NULL, 0))
2234         {
2235           *err = REG_ESPACE;
2236           return NULL;
2237         }
2238       ++dfa->nbackref;
2239       dfa->has_mb_node = 1;
2240       break;
2241     case OP_OPEN_DUP_NUM:
2242       if (syntax & RE_CONTEXT_INVALID_DUP)
2243         {
2244           *err = REG_BADRPT;
2245           return NULL;
2246         }
2247       /* FALLTHROUGH */
2248     case OP_DUP_ASTERISK:
2249     case OP_DUP_PLUS:
2250     case OP_DUP_QUESTION:
2251       if (syntax & RE_CONTEXT_INVALID_OPS)
2252         {
2253           *err = REG_BADRPT;
2254           return NULL;
2255         }
2256       else if (syntax & RE_CONTEXT_INDEP_OPS)
2257         {
2258           fetch_token (token, regexp, syntax);
2259           return parse_expression (regexp, preg, token, syntax, nest, err);
2260         }
2261       /* else fall through  */
2262     case OP_CLOSE_SUBEXP:
2263       if ((token->type == OP_CLOSE_SUBEXP) &&
2264           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2265         {
2266           *err = REG_ERPAREN;
2267           return NULL;
2268         }
2269       /* else fall through  */
2270     case OP_CLOSE_DUP_NUM:
2271       /* We treat it as a normal character.  */
2272
2273       /* Then we can these characters as normal characters.  */
2274       token->type = CHARACTER;
2275       /* mb_partial and word_char bits should be initialized already
2276          by peek_token.  */
2277       tree = create_token_tree (dfa, NULL, NULL, token);
2278       if (BE (tree == NULL, 0))
2279         {
2280           *err = REG_ESPACE;
2281           return NULL;
2282         }
2283       break;
2284     case ANCHOR:
2285       if ((token->opr.ctx_type
2286            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2287           && dfa->word_ops_used == 0)
2288         init_word_char (dfa);
2289       if (token->opr.ctx_type == WORD_DELIM
2290           || token->opr.ctx_type == NOT_WORD_DELIM)
2291         {
2292           bin_tree_t *tree_first, *tree_last;
2293           if (token->opr.ctx_type == WORD_DELIM)
2294             {
2295               token->opr.ctx_type = WORD_FIRST;
2296               tree_first = create_token_tree (dfa, NULL, NULL, token);
2297               token->opr.ctx_type = WORD_LAST;
2298             }
2299           else
2300             {
2301               token->opr.ctx_type = INSIDE_WORD;
2302               tree_first = create_token_tree (dfa, NULL, NULL, token);
2303               token->opr.ctx_type = INSIDE_NOTWORD;
2304             }
2305           tree_last = create_token_tree (dfa, NULL, NULL, token);
2306           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2307           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2308             {
2309               *err = REG_ESPACE;
2310               return NULL;
2311             }
2312         }
2313       else
2314         {
2315           tree = create_token_tree (dfa, NULL, NULL, token);
2316           if (BE (tree == NULL, 0))
2317             {
2318               *err = REG_ESPACE;
2319               return NULL;
2320             }
2321         }
2322       /* We must return here, since ANCHORs can't be followed
2323          by repetition operators.
2324          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2325              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2326       fetch_token (token, regexp, syntax);
2327       return tree;
2328     case OP_PERIOD:
2329       tree = create_token_tree (dfa, NULL, NULL, token);
2330       if (BE (tree == NULL, 0))
2331         {
2332           *err = REG_ESPACE;
2333           return NULL;
2334         }
2335       if (dfa->mb_cur_max > 1)
2336         dfa->has_mb_node = 1;
2337       break;
2338     case OP_WORD:
2339     case OP_NOTWORD:
2340       tree = build_charclass_op (dfa, regexp->trans,
2341                                  (const unsigned char *) "alnum",
2342                                  (const unsigned char *) "_",
2343                                  token->type == OP_NOTWORD, err);
2344       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2345         return NULL;
2346       break;
2347     case OP_SPACE:
2348     case OP_NOTSPACE:
2349       tree = build_charclass_op (dfa, regexp->trans,
2350                                  (const unsigned char *) "space",
2351                                  (const unsigned char *) "",
2352                                  token->type == OP_NOTSPACE, err);
2353       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2354         return NULL;
2355       break;
2356     case OP_ALT:
2357     case END_OF_RE:
2358       return NULL;
2359     case BACK_SLASH:
2360       *err = REG_EESCAPE;
2361       return NULL;
2362     default:
2363       /* Must not happen?  */
2364 #ifdef DEBUG
2365       assert (0);
2366 #endif
2367       return NULL;
2368     }
2369   fetch_token (token, regexp, syntax);
2370
2371   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2372          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2373     {
2374       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2375       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2376         return NULL;
2377       /* In BRE consecutive duplications are not allowed.  */
2378       if ((syntax & RE_CONTEXT_INVALID_DUP)
2379           && (token->type == OP_DUP_ASTERISK
2380               || token->type == OP_OPEN_DUP_NUM))
2381         {
2382           *err = REG_BADRPT;
2383           return NULL;
2384         }
2385     }
2386
2387   return tree;
2388 }
2389
2390 /* This function build the following tree, from regular expression
2391    (<reg_exp>):
2392          SUBEXP
2393             |
2394         <reg_exp>
2395 */
2396
2397 static bin_tree_t *
2398 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2399                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2400 {
2401   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2402   bin_tree_t *tree;
2403   size_t cur_nsub;
2404   cur_nsub = preg->re_nsub++;
2405
2406   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2407
2408   /* The subexpression may be a null string.  */
2409   if (token->type == OP_CLOSE_SUBEXP)
2410     tree = NULL;
2411   else
2412     {
2413       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2414       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2415         *err = REG_EPAREN;
2416       if (BE (*err != REG_NOERROR, 0))
2417         return NULL;
2418     }
2419
2420   if (cur_nsub <= '9' - '1')
2421     dfa->completed_bkref_map |= 1 << cur_nsub;
2422
2423   tree = create_tree (dfa, tree, NULL, SUBEXP);
2424   if (BE (tree == NULL, 0))
2425     {
2426       *err = REG_ESPACE;
2427       return NULL;
2428     }
2429   tree->token.opr.idx = cur_nsub;
2430   return tree;
2431 }
2432
2433 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2434
2435 static bin_tree_t *
2436 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2437               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2438 {
2439   bin_tree_t *tree = NULL, *old_tree = NULL;
2440   int i, start, end, start_idx = re_string_cur_idx (regexp);
2441   re_token_t start_token = *token;
2442
2443   if (token->type == OP_OPEN_DUP_NUM)
2444     {
2445       end = 0;
2446       start = fetch_number (regexp, token, syntax);
2447       if (start == -1)
2448         {
2449           if (token->type == CHARACTER && token->opr.c == ',')
2450             start = 0; /* We treat "{,m}" as "{0,m}".  */
2451           else
2452             {
2453               *err = REG_BADBR; /* <re>{} is invalid.  */
2454               return NULL;
2455             }
2456         }
2457       if (BE (start != -2, 1))
2458         {
2459           /* We treat "{n}" as "{n,n}".  */
2460           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2461                  : ((token->type == CHARACTER && token->opr.c == ',')
2462                     ? fetch_number (regexp, token, syntax) : -2));
2463         }
2464       if (BE (start == -2 || end == -2, 0))
2465         {
2466           /* Invalid sequence.  */
2467           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2468             {
2469               if (token->type == END_OF_RE)
2470                 *err = REG_EBRACE;
2471               else
2472                 *err = REG_BADBR;
2473
2474               return NULL;
2475             }
2476
2477           /* If the syntax bit is set, rollback.  */
2478           re_string_set_index (regexp, start_idx);
2479           *token = start_token;
2480           token->type = CHARACTER;
2481           /* mb_partial and word_char bits should be already initialized by
2482              peek_token.  */
2483           return elem;
2484         }
2485
2486       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2487         {
2488           /* First number greater than second.  */
2489           *err = REG_BADBR;
2490           return NULL;
2491         }
2492     }
2493   else
2494     {
2495       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2496       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2497     }
2498
2499   fetch_token (token, regexp, syntax);
2500
2501   if (BE (elem == NULL, 0))
2502     return NULL;
2503   if (BE (start == 0 && end == 0, 0))
2504     {
2505       postorder (elem, free_tree, NULL);
2506       return NULL;
2507     }
2508
2509   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2510   if (BE (start > 0, 0))
2511     {
2512       tree = elem;
2513       for (i = 2; i <= start; ++i)
2514         {
2515           elem = duplicate_tree (elem, dfa);
2516           tree = create_tree (dfa, tree, elem, CONCAT);
2517           if (BE (elem == NULL || tree == NULL, 0))
2518             goto parse_dup_op_espace;
2519         }
2520
2521       if (start == end)
2522         return tree;
2523
2524       /* Duplicate ELEM before it is marked optional.  */
2525       elem = duplicate_tree (elem, dfa);
2526       old_tree = tree;
2527     }
2528   else
2529     old_tree = NULL;
2530
2531   if (elem->token.type == SUBEXP)
2532     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2533
2534   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2535   if (BE (tree == NULL, 0))
2536     goto parse_dup_op_espace;
2537
2538   /* This loop is actually executed only when end != -1,
2539      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2540      already created the start+1-th copy.  */
2541   for (i = start + 2; i <= end; ++i)
2542     {
2543       elem = duplicate_tree (elem, dfa);
2544       tree = create_tree (dfa, tree, elem, CONCAT);
2545       if (BE (elem == NULL || tree == NULL, 0))
2546         goto parse_dup_op_espace;
2547
2548       tree = create_tree (dfa, tree, NULL, OP_ALT);
2549       if (BE (tree == NULL, 0))
2550         goto parse_dup_op_espace;
2551     }
2552
2553   if (old_tree)
2554     tree = create_tree (dfa, old_tree, tree, CONCAT);
2555
2556   return tree;
2557
2558  parse_dup_op_espace:
2559   *err = REG_ESPACE;
2560   return NULL;
2561 }
2562
2563 /* Size of the names for collating symbol/equivalence_class/character_class.
2564    I'm not sure, but maybe enough.  */
2565 #define BRACKET_NAME_BUF_SIZE 32
2566
2567 #ifndef _LIBC
2568   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2569      Build the range expression which starts from START_ELEM, and ends
2570      at END_ELEM.  The result are written to MBCSET and SBCSET.
2571      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2572      mbcset->range_ends, is a pointer argument sinse we may
2573      update it.  */
2574
2575 static reg_errcode_t
2576 internal_function
2577 # ifdef RE_ENABLE_I18N
2578 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2579                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2580 # else /* not RE_ENABLE_I18N */
2581 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2582                  bracket_elem_t *end_elem)
2583 # endif /* not RE_ENABLE_I18N */
2584 {
2585   unsigned int start_ch, end_ch;
2586   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2587   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2588           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2589           0))
2590     return REG_ERANGE;
2591
2592   /* We can handle no multi character collating elements without libc
2593      support.  */
2594   if (BE ((start_elem->type == COLL_SYM
2595            && strlen ((char *) start_elem->opr.name) > 1)
2596           || (end_elem->type == COLL_SYM
2597               && strlen ((char *) end_elem->opr.name) > 1), 0))
2598     return REG_ECOLLATE;
2599
2600 # ifdef RE_ENABLE_I18N
2601   {
2602     wchar_t wc;
2603     wint_t start_wc;
2604     wint_t end_wc;
2605     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2606
2607     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2608                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2609                    : 0));
2610     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2611               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2612                  : 0));
2613     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2614                 ? __btowc (start_ch) : start_elem->opr.wch);
2615     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2616               ? __btowc (end_ch) : end_elem->opr.wch);
2617     if (start_wc == WEOF || end_wc == WEOF)
2618       return REG_ECOLLATE;
2619     cmp_buf[0] = start_wc;
2620     cmp_buf[4] = end_wc;
2621     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2622       return REG_ERANGE;
2623
2624     /* Got valid collation sequence values, add them as a new entry.
2625        However, for !_LIBC we have no collation elements: if the
2626        character set is single byte, the single byte character set
2627        that we build below suffices.  parse_bracket_exp passes
2628        no MBCSET if dfa->mb_cur_max == 1.  */
2629     if (mbcset)
2630       {
2631         /* Check the space of the arrays.  */
2632         if (BE (*range_alloc == mbcset->nranges, 0))
2633           {
2634             /* There is not enough space, need realloc.  */
2635             wchar_t *new_array_start, *new_array_end;
2636             int new_nranges;
2637
2638             /* +1 in case of mbcset->nranges is 0.  */
2639             new_nranges = 2 * mbcset->nranges + 1;
2640             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2641                are NULL if *range_alloc == 0.  */
2642             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2643                                           new_nranges);
2644             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2645                                         new_nranges);
2646
2647             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2648               return REG_ESPACE;
2649
2650             mbcset->range_starts = new_array_start;
2651             mbcset->range_ends = new_array_end;
2652             *range_alloc = new_nranges;
2653           }
2654
2655         mbcset->range_starts[mbcset->nranges] = start_wc;
2656         mbcset->range_ends[mbcset->nranges++] = end_wc;
2657       }
2658
2659     /* Build the table for single byte characters.  */
2660     for (wc = 0; wc < SBC_MAX; ++wc)
2661       {
2662         cmp_buf[2] = wc;
2663         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2664             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2665           bitset_set (sbcset, wc);
2666       }
2667   }
2668 # else /* not RE_ENABLE_I18N */
2669   {
2670     unsigned int ch;
2671     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2672                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2673                    : 0));
2674     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2675               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2676                  : 0));
2677     if (start_ch > end_ch)
2678       return REG_ERANGE;
2679     /* Build the table for single byte characters.  */
2680     for (ch = 0; ch < SBC_MAX; ++ch)
2681       if (start_ch <= ch  && ch <= end_ch)
2682         bitset_set (sbcset, ch);
2683   }
2684 # endif /* not RE_ENABLE_I18N */
2685   return REG_NOERROR;
2686 }
2687 #endif /* not _LIBC */
2688
2689 #ifndef _LIBC
2690 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2691    Build the collating element which is represented by NAME.
2692    The result are written to MBCSET and SBCSET.
2693    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2694    pointer argument since we may update it.  */
2695
2696 static reg_errcode_t
2697 internal_function
2698 # ifdef RE_ENABLE_I18N
2699 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2700                         int *coll_sym_alloc, const unsigned char *name)
2701 # else /* not RE_ENABLE_I18N */
2702 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2703 # endif /* not RE_ENABLE_I18N */
2704 {
2705   size_t name_len = strlen ((const char *) name);
2706   if (BE (name_len != 1, 0))
2707     return REG_ECOLLATE;
2708   else
2709     {
2710       bitset_set (sbcset, name[0]);
2711       return REG_NOERROR;
2712     }
2713 }
2714 #endif /* not _LIBC */
2715
2716 /* This function parse bracket expression like "[abc]", "[a-c]",
2717    "[[.a-a.]]" etc.  */
2718
2719 static bin_tree_t *
2720 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2721                    reg_syntax_t syntax, reg_errcode_t *err)
2722 {
2723 #ifdef _LIBC
2724   const unsigned char *collseqmb;
2725   const char *collseqwc;
2726   uint32_t nrules;
2727   int32_t table_size;
2728   const int32_t *symb_table;
2729   const unsigned char *extra;
2730
2731   /* Local function for parse_bracket_exp used in _LIBC environement.
2732      Seek the collating symbol entry correspondings to NAME.
2733      Return the index of the symbol in the SYMB_TABLE.  */
2734
2735   auto inline int32_t
2736   __attribute ((always_inline))
2737   seek_collating_symbol_entry (name, name_len)
2738          const unsigned char *name;
2739          size_t name_len;
2740     {
2741       int32_t hash = elem_hash ((const char *) name, name_len);
2742       int32_t elem = hash % table_size;
2743       if (symb_table[2 * elem] != 0)
2744         {
2745           int32_t second = hash % (table_size - 2) + 1;
2746
2747           do
2748             {
2749               /* First compare the hashing value.  */
2750               if (symb_table[2 * elem] == hash
2751                   /* Compare the length of the name.  */
2752                   && name_len == extra[symb_table[2 * elem + 1]]
2753                   /* Compare the name.  */
2754                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2755                              name_len) == 0)
2756                 {
2757                   /* Yep, this is the entry.  */
2758                   break;
2759                 }
2760
2761               /* Next entry.  */
2762               elem += second;
2763             }
2764           while (symb_table[2 * elem] != 0);
2765         }
2766       return elem;
2767     }
2768
2769   /* Local function for parse_bracket_exp used in _LIBC environment.
2770      Look up the collation sequence value of BR_ELEM.
2771      Return the value if succeeded, UINT_MAX otherwise.  */
2772
2773   auto inline unsigned int
2774   __attribute ((always_inline))
2775   lookup_collation_sequence_value (br_elem)
2776          bracket_elem_t *br_elem;
2777     {
2778       if (br_elem->type == SB_CHAR)
2779         {
2780           /*
2781           if (MB_CUR_MAX == 1)
2782           */
2783           if (nrules == 0)
2784             return collseqmb[br_elem->opr.ch];
2785           else
2786             {
2787               wint_t wc = __btowc (br_elem->opr.ch);
2788               return __collseq_table_lookup (collseqwc, wc);
2789             }
2790         }
2791       else if (br_elem->type == MB_CHAR)
2792         {
2793           if (nrules != 0)
2794             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2795         }
2796       else if (br_elem->type == COLL_SYM)
2797         {
2798           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2799           if (nrules != 0)
2800             {
2801               int32_t elem, idx;
2802               elem = seek_collating_symbol_entry (br_elem->opr.name,
2803                                                   sym_name_len);
2804               if (symb_table[2 * elem] != 0)
2805                 {
2806                   /* We found the entry.  */
2807                   idx = symb_table[2 * elem + 1];
2808                   /* Skip the name of collating element name.  */
2809                   idx += 1 + extra[idx];
2810                   /* Skip the byte sequence of the collating element.  */
2811                   idx += 1 + extra[idx];
2812                   /* Adjust for the alignment.  */
2813                   idx = (idx + 3) & ~3;
2814                   /* Skip the multibyte collation sequence value.  */
2815                   idx += sizeof (unsigned int);
2816                   /* Skip the wide char sequence of the collating element.  */
2817                   idx += sizeof (unsigned int) *
2818                     (1 + *(unsigned int *) (extra + idx));
2819                   /* Return the collation sequence value.  */
2820                   return *(unsigned int *) (extra + idx);
2821                 }
2822               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2823                 {
2824                   /* No valid character.  Match it as a single byte
2825                      character.  */
2826                   return collseqmb[br_elem->opr.name[0]];
2827                 }
2828             }
2829           else if (sym_name_len == 1)
2830             return collseqmb[br_elem->opr.name[0]];
2831         }
2832       return UINT_MAX;
2833     }
2834
2835   /* Local function for parse_bracket_exp used in _LIBC environement.
2836      Build the range expression which starts from START_ELEM, and ends
2837      at END_ELEM.  The result are written to MBCSET and SBCSET.
2838      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2839      mbcset->range_ends, is a pointer argument sinse we may
2840      update it.  */
2841
2842   auto inline reg_errcode_t
2843   __attribute ((always_inline))
2844   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2845          re_charset_t *mbcset;
2846          int *range_alloc;
2847          bitset_t sbcset;
2848          bracket_elem_t *start_elem, *end_elem;
2849     {
2850       unsigned int ch;
2851       uint32_t start_collseq;
2852       uint32_t end_collseq;
2853
2854       /* Equivalence Classes and Character Classes can't be a range
2855          start/end.  */
2856       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2857               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2858               0))
2859         return REG_ERANGE;
2860
2861       start_collseq = lookup_collation_sequence_value (start_elem);
2862       end_collseq = lookup_collation_sequence_value (end_elem);
2863       /* Check start/end collation sequence values.  */
2864       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2865         return REG_ECOLLATE;
2866       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2867         return REG_ERANGE;
2868
2869       /* Got valid collation sequence values, add them as a new entry.
2870          However, if we have no collation elements, and the character set
2871          is single byte, the single byte character set that we
2872          build below suffices. */
2873       if (nrules > 0 || dfa->mb_cur_max > 1)
2874         {
2875           /* Check the space of the arrays.  */
2876           if (BE (*range_alloc == mbcset->nranges, 0))
2877             {
2878               /* There is not enough space, need realloc.  */
2879               uint32_t *new_array_start;
2880               uint32_t *new_array_end;
2881               int new_nranges;
2882
2883               /* +1 in case of mbcset->nranges is 0.  */
2884               new_nranges = 2 * mbcset->nranges + 1;
2885               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2886                                             new_nranges);
2887               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2888                                           new_nranges);
2889
2890               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2891                 return REG_ESPACE;
2892
2893               mbcset->range_starts = new_array_start;
2894               mbcset->range_ends = new_array_end;
2895               *range_alloc = new_nranges;
2896             }
2897
2898           mbcset->range_starts[mbcset->nranges] = start_collseq;
2899           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2900         }
2901
2902       /* Build the table for single byte characters.  */
2903       for (ch = 0; ch < SBC_MAX; ch++)
2904         {
2905           uint32_t ch_collseq;
2906           /*
2907           if (MB_CUR_MAX == 1)
2908           */
2909           if (nrules == 0)
2910             ch_collseq = collseqmb[ch];
2911           else
2912             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2913           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2914             bitset_set (sbcset, ch);
2915         }
2916       return REG_NOERROR;
2917     }
2918
2919   /* Local function for parse_bracket_exp used in _LIBC environement.
2920      Build the collating element which is represented by NAME.
2921      The result are written to MBCSET and SBCSET.
2922      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2923      pointer argument sinse we may update it.  */
2924
2925   auto inline reg_errcode_t
2926   __attribute ((always_inline))
2927   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2928          re_charset_t *mbcset;
2929          int *coll_sym_alloc;
2930          bitset_t sbcset;
2931          const unsigned char *name;
2932     {
2933       int32_t elem, idx;
2934       size_t name_len = strlen ((const char *) name);
2935       if (nrules != 0)
2936         {
2937           elem = seek_collating_symbol_entry (name, name_len);
2938           if (symb_table[2 * elem] != 0)
2939             {
2940               /* We found the entry.  */
2941               idx = symb_table[2 * elem + 1];
2942               /* Skip the name of collating element name.  */
2943               idx += 1 + extra[idx];
2944             }
2945           else if (symb_table[2 * elem] == 0 && name_len == 1)
2946             {
2947               /* No valid character, treat it as a normal
2948                  character.  */
2949               bitset_set (sbcset, name[0]);
2950               return REG_NOERROR;
2951             }
2952           else
2953             return REG_ECOLLATE;
2954
2955           /* Got valid collation sequence, add it as a new entry.  */
2956           /* Check the space of the arrays.  */
2957           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2958             {
2959               /* Not enough, realloc it.  */
2960               /* +1 in case of mbcset->ncoll_syms is 0.  */
2961               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2962               /* Use realloc since mbcset->coll_syms is NULL
2963                  if *alloc == 0.  */
2964               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2965                                                    new_coll_sym_alloc);
2966               if (BE (new_coll_syms == NULL, 0))
2967                 return REG_ESPACE;
2968               mbcset->coll_syms = new_coll_syms;
2969               *coll_sym_alloc = new_coll_sym_alloc;
2970             }
2971           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2972           return REG_NOERROR;
2973         }
2974       else
2975         {
2976           if (BE (name_len != 1, 0))
2977             return REG_ECOLLATE;
2978           else
2979             {
2980               bitset_set (sbcset, name[0]);
2981               return REG_NOERROR;
2982             }
2983         }
2984     }
2985 #endif
2986
2987   re_token_t br_token;
2988   re_bitset_ptr_t sbcset;
2989 #ifdef RE_ENABLE_I18N
2990   re_charset_t *mbcset;
2991   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2992   int equiv_class_alloc = 0, char_class_alloc = 0;
2993 #endif /* not RE_ENABLE_I18N */
2994   int non_match = 0;
2995   bin_tree_t *work_tree;
2996   int token_len;
2997   int first_round = 1;
2998 #ifdef _LIBC
2999   collseqmb = (const unsigned char *)
3000     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3001   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3002   if (nrules)
3003     {
3004       /*
3005       if (MB_CUR_MAX > 1)
3006       */
3007       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3008       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3009       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3010                                                   _NL_COLLATE_SYMB_TABLEMB);
3011       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3012                                                    _NL_COLLATE_SYMB_EXTRAMB);
3013     }
3014 #endif
3015   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3016 #ifdef RE_ENABLE_I18N
3017   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3018 #endif /* RE_ENABLE_I18N */
3019 #ifdef RE_ENABLE_I18N
3020   if (BE (sbcset == NULL || mbcset == NULL, 0))
3021 #else
3022   if (BE (sbcset == NULL, 0))
3023 #endif /* RE_ENABLE_I18N */
3024     {
3025       *err = REG_ESPACE;
3026       return NULL;
3027     }
3028
3029   token_len = peek_token_bracket (token, regexp, syntax);
3030   if (BE (token->type == END_OF_RE, 0))
3031     {
3032       *err = REG_BADPAT;
3033       goto parse_bracket_exp_free_return;
3034     }
3035   if (token->type == OP_NON_MATCH_LIST)
3036     {
3037 #ifdef RE_ENABLE_I18N
3038       mbcset->non_match = 1;
3039 #endif /* not RE_ENABLE_I18N */
3040       non_match = 1;
3041       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3042         bitset_set (sbcset, '\n');
3043       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3044       token_len = peek_token_bracket (token, regexp, syntax);
3045       if (BE (token->type == END_OF_RE, 0))
3046         {
3047           *err = REG_BADPAT;
3048           goto parse_bracket_exp_free_return;
3049         }
3050     }
3051
3052   /* We treat the first ']' as a normal character.  */
3053   if (token->type == OP_CLOSE_BRACKET)
3054     token->type = CHARACTER;
3055
3056   while (1)
3057     {
3058       bracket_elem_t start_elem, end_elem;
3059       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3060       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3061       reg_errcode_t ret;
3062       int token_len2 = 0, is_range_exp = 0;
3063       re_token_t token2;
3064
3065       start_elem.opr.name = start_name_buf;
3066       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3067                                    syntax, first_round);
3068       if (BE (ret != REG_NOERROR, 0))
3069         {
3070           *err = ret;
3071           goto parse_bracket_exp_free_return;
3072         }
3073       first_round = 0;
3074
3075       /* Get information about the next token.  We need it in any case.  */
3076       token_len = peek_token_bracket (token, regexp, syntax);
3077
3078       /* Do not check for ranges if we know they are not allowed.  */
3079       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3080         {
3081           if (BE (token->type == END_OF_RE, 0))
3082             {
3083               *err = REG_EBRACK;
3084               goto parse_bracket_exp_free_return;
3085             }
3086           if (token->type == OP_CHARSET_RANGE)
3087             {
3088               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3089               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3090               if (BE (token2.type == END_OF_RE, 0))
3091                 {
3092                   *err = REG_EBRACK;
3093                   goto parse_bracket_exp_free_return;
3094                 }
3095               if (token2.type == OP_CLOSE_BRACKET)
3096                 {
3097                   /* We treat the last '-' as a normal character.  */
3098                   re_string_skip_bytes (regexp, -token_len);
3099                   token->type = CHARACTER;
3100                 }
3101               else
3102                 is_range_exp = 1;
3103             }
3104         }
3105
3106       if (is_range_exp == 1)
3107         {
3108           end_elem.opr.name = end_name_buf;
3109           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3110                                        dfa, syntax, 1);
3111           if (BE (ret != REG_NOERROR, 0))
3112             {
3113               *err = ret;
3114               goto parse_bracket_exp_free_return;
3115             }
3116
3117           token_len = peek_token_bracket (token, regexp, syntax);
3118
3119 #ifdef _LIBC
3120           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3121                                   &start_elem, &end_elem);
3122 #else
3123 # ifdef RE_ENABLE_I18N
3124           *err = build_range_exp (sbcset,
3125                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3126                                   &range_alloc, &start_elem, &end_elem);
3127 # else
3128           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3129 # endif
3130 #endif /* RE_ENABLE_I18N */
3131           if (BE (*err != REG_NOERROR, 0))
3132             goto parse_bracket_exp_free_return;
3133         }
3134       else
3135         {
3136           switch (start_elem.type)
3137             {
3138             case SB_CHAR:
3139               bitset_set (sbcset, start_elem.opr.ch);
3140               break;
3141 #ifdef RE_ENABLE_I18N
3142             case MB_CHAR:
3143               /* Check whether the array has enough space.  */
3144               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3145                 {
3146                   wchar_t *new_mbchars;
3147                   /* Not enough, realloc it.  */
3148                   /* +1 in case of mbcset->nmbchars is 0.  */
3149                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3150                   /* Use realloc since array is NULL if *alloc == 0.  */
3151                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3152                                             mbchar_alloc);
3153                   if (BE (new_mbchars == NULL, 0))
3154                     goto parse_bracket_exp_espace;
3155                   mbcset->mbchars = new_mbchars;
3156                 }
3157               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3158               break;
3159 #endif /* RE_ENABLE_I18N */
3160             case EQUIV_CLASS:
3161               *err = build_equiv_class (sbcset,
3162 #ifdef RE_ENABLE_I18N
3163                                         mbcset, &equiv_class_alloc,
3164 #endif /* RE_ENABLE_I18N */
3165                                         start_elem.opr.name);
3166               if (BE (*err != REG_NOERROR, 0))
3167                 goto parse_bracket_exp_free_return;
3168               break;
3169             case COLL_SYM:
3170               *err = build_collating_symbol (sbcset,
3171 #ifdef RE_ENABLE_I18N
3172                                              mbcset, &coll_sym_alloc,
3173 #endif /* RE_ENABLE_I18N */
3174                                              start_elem.opr.name);
3175               if (BE (*err != REG_NOERROR, 0))
3176                 goto parse_bracket_exp_free_return;
3177               break;
3178             case CHAR_CLASS:
3179               *err = build_charclass (regexp->trans, sbcset,
3180 #ifdef RE_ENABLE_I18N
3181                                       mbcset, &char_class_alloc,
3182 #endif /* RE_ENABLE_I18N */
3183                                       start_elem.opr.name, syntax);
3184               if (BE (*err != REG_NOERROR, 0))
3185                goto parse_bracket_exp_free_return;
3186               break;
3187             default:
3188               assert (0);
3189               break;
3190             }
3191         }
3192       if (BE (token->type == END_OF_RE, 0))
3193         {
3194           *err = REG_EBRACK;
3195           goto parse_bracket_exp_free_return;
3196         }
3197       if (token->type == OP_CLOSE_BRACKET)
3198         break;
3199     }
3200
3201   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3202
3203   /* If it is non-matching list.  */
3204   if (non_match)
3205     bitset_not (sbcset);
3206
3207 #ifdef RE_ENABLE_I18N
3208   /* Ensure only single byte characters are set.  */
3209   if (dfa->mb_cur_max > 1)
3210     bitset_mask (sbcset, dfa->sb_char);
3211
3212   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3213       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3214                                                      || mbcset->non_match)))
3215     {
3216       bin_tree_t *mbc_tree;
3217       int sbc_idx;
3218       /* Build a tree for complex bracket.  */
3219       dfa->has_mb_node = 1;
3220       br_token.type = COMPLEX_BRACKET;
3221       br_token.opr.mbcset = mbcset;
3222       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3223       if (BE (mbc_tree == NULL, 0))
3224         goto parse_bracket_exp_espace;
3225       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3226         if (sbcset[sbc_idx])
3227           break;
3228       /* If there are no bits set in sbcset, there is no point
3229          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3230       if (sbc_idx < BITSET_WORDS)
3231         {
3232           /* Build a tree for simple bracket.  */
3233           br_token.type = SIMPLE_BRACKET;
3234           br_token.opr.sbcset = sbcset;
3235           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3236           if (BE (work_tree == NULL, 0))
3237             goto parse_bracket_exp_espace;
3238
3239           /* Then join them by ALT node.  */
3240           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3241           if (BE (work_tree == NULL, 0))
3242             goto parse_bracket_exp_espace;
3243         }
3244       else
3245         {
3246           re_free (sbcset);
3247           work_tree = mbc_tree;
3248         }
3249     }
3250   else
3251 #endif /* not RE_ENABLE_I18N */
3252     {
3253 #ifdef RE_ENABLE_I18N
3254       free_charset (mbcset);
3255 #endif
3256       /* Build a tree for simple bracket.  */
3257       br_token.type = SIMPLE_BRACKET;
3258       br_token.opr.sbcset = sbcset;
3259       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3260       if (BE (work_tree == NULL, 0))
3261         goto parse_bracket_exp_espace;
3262     }
3263   return work_tree;
3264
3265  parse_bracket_exp_espace:
3266   *err = REG_ESPACE;
3267  parse_bracket_exp_free_return:
3268   re_free (sbcset);
3269 #ifdef RE_ENABLE_I18N
3270   free_charset (mbcset);
3271 #endif /* RE_ENABLE_I18N */
3272   return NULL;
3273 }
3274
3275 /* Parse an element in the bracket expression.  */
3276
3277 static reg_errcode_t
3278 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3279                        re_token_t *token, int token_len, re_dfa_t *dfa,
3280                        reg_syntax_t syntax, int accept_hyphen)
3281 {
3282 #ifdef RE_ENABLE_I18N
3283   int cur_char_size;
3284   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3285   if (cur_char_size > 1)
3286     {
3287       elem->type = MB_CHAR;
3288       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3289       re_string_skip_bytes (regexp, cur_char_size);
3290       return REG_NOERROR;
3291     }
3292 #endif /* RE_ENABLE_I18N */
3293   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3294   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3295       || token->type == OP_OPEN_EQUIV_CLASS)
3296     return parse_bracket_symbol (elem, regexp, token);
3297   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3298     {
3299       /* A '-' must only appear as anything but a range indicator before
3300          the closing bracket.  Everything else is an error.  */
3301       re_token_t token2;
3302       (void) peek_token_bracket (&token2, regexp, syntax);
3303       if (token2.type != OP_CLOSE_BRACKET)
3304         /* The actual error value is not standardized since this whole
3305            case is undefined.  But ERANGE makes good sense.  */
3306         return REG_ERANGE;
3307     }
3308   elem->type = SB_CHAR;
3309   elem->opr.ch = token->opr.c;
3310   return REG_NOERROR;
3311 }
3312
3313 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3314    such as [:<character_class>:], [.<collating_element>.], and
3315    [=<equivalent_class>=].  */
3316
3317 static reg_errcode_t
3318 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3319                       re_token_t *token)
3320 {
3321   unsigned char ch, delim = token->opr.c;
3322   int i = 0;
3323   if (re_string_eoi(regexp))
3324     return REG_EBRACK;
3325   for (;; ++i)
3326     {
3327       if (i >= BRACKET_NAME_BUF_SIZE)
3328         return REG_EBRACK;
3329       if (token->type == OP_OPEN_CHAR_CLASS)
3330         ch = re_string_fetch_byte_case (regexp);
3331       else
3332         ch = re_string_fetch_byte (regexp);
3333       if (re_string_eoi(regexp))
3334         return REG_EBRACK;
3335       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3336         break;
3337       elem->opr.name[i] = ch;
3338     }
3339   re_string_skip_bytes (regexp, 1);
3340   elem->opr.name[i] = '\0';
3341   switch (token->type)
3342     {
3343     case OP_OPEN_COLL_ELEM:
3344       elem->type = COLL_SYM;
3345       break;
3346     case OP_OPEN_EQUIV_CLASS:
3347       elem->type = EQUIV_CLASS;
3348       break;
3349     case OP_OPEN_CHAR_CLASS:
3350       elem->type = CHAR_CLASS;
3351       break;
3352     default:
3353       break;
3354     }
3355   return REG_NOERROR;
3356 }
3357
3358   /* Helper function for parse_bracket_exp.
3359      Build the equivalence class which is represented by NAME.
3360      The result are written to MBCSET and SBCSET.
3361      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3362      is a pointer argument sinse we may update it.  */
3363
3364 static reg_errcode_t
3365 #ifdef RE_ENABLE_I18N
3366 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3367                    int *equiv_class_alloc, const unsigned char *name)
3368 #else /* not RE_ENABLE_I18N */
3369 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3370 #endif /* not RE_ENABLE_I18N */
3371 {
3372 #ifdef _LIBC
3373   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3374   if (nrules != 0)
3375     {
3376       const int32_t *table, *indirect;
3377       const unsigned char *weights, *extra, *cp;
3378       unsigned char char_buf[2];
3379       int32_t idx1, idx2;
3380       unsigned int ch;
3381       size_t len;
3382       /* This #include defines a local function!  */
3383 # include <locale/weight.h>
3384       /* Calculate the index for equivalence class.  */
3385       cp = name;
3386       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3387       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3388                                                _NL_COLLATE_WEIGHTMB);
3389       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3390                                                    _NL_COLLATE_EXTRAMB);
3391       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3392                                                 _NL_COLLATE_INDIRECTMB);
3393       idx1 = findidx (&cp);
3394       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3395         /* This isn't a valid character.  */
3396         return REG_ECOLLATE;
3397
3398       /* Build single byte matcing table for this equivalence class.  */
3399       char_buf[1] = (unsigned char) '\0';
3400       len = weights[idx1 & 0xffffff];
3401       for (ch = 0; ch < SBC_MAX; ++ch)
3402         {
3403           char_buf[0] = ch;
3404           cp = char_buf;
3405           idx2 = findidx (&cp);
3406 /*
3407           idx2 = table[ch];
3408 */
3409           if (idx2 == 0)
3410             /* This isn't a valid character.  */
3411             continue;
3412           /* Compare only if the length matches and the collation rule
3413              index is the same.  */
3414           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3415             {
3416               int cnt = 0;
3417
3418               while (cnt <= len &&
3419                      weights[(idx1 & 0xffffff) + 1 + cnt]
3420                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3421                 ++cnt;
3422
3423               if (cnt > len)
3424                 bitset_set (sbcset, ch);
3425             }
3426         }
3427       /* Check whether the array has enough space.  */
3428       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3429         {
3430           /* Not enough, realloc it.  */
3431           /* +1 in case of mbcset->nequiv_classes is 0.  */
3432           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3433           /* Use realloc since the array is NULL if *alloc == 0.  */
3434           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3435                                                    int32_t,
3436                                                    new_equiv_class_alloc);
3437           if (BE (new_equiv_classes == NULL, 0))
3438             return REG_ESPACE;
3439           mbcset->equiv_classes = new_equiv_classes;
3440           *equiv_class_alloc = new_equiv_class_alloc;
3441         }
3442       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3443     }
3444   else
3445 #endif /* _LIBC */
3446     {
3447       if (BE (strlen ((const char *) name) != 1, 0))
3448         return REG_ECOLLATE;
3449       bitset_set (sbcset, *name);
3450     }
3451   return REG_NOERROR;
3452 }
3453
3454   /* Helper function for parse_bracket_exp.
3455      Build the character class which is represented by NAME.
3456      The result are written to MBCSET and SBCSET.
3457      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3458      is a pointer argument sinse we may update it.  */
3459
3460 static reg_errcode_t
3461 #ifdef RE_ENABLE_I18N
3462 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3463                  re_charset_t *mbcset, int *char_class_alloc,
3464                  const unsigned char *class_name, reg_syntax_t syntax)
3465 #else /* not RE_ENABLE_I18N */
3466 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3467                  const unsigned char *class_name, reg_syntax_t syntax)
3468 #endif /* not RE_ENABLE_I18N */
3469 {
3470   int i;
3471   const char *name = (const char *) class_name;
3472
3473   /* In case of REG_ICASE "upper" and "lower" match the both of
3474      upper and lower cases.  */
3475   if ((syntax & RE_ICASE)
3476       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3477     name = "alpha";
3478
3479 #ifdef RE_ENABLE_I18N
3480   /* Check the space of the arrays.  */
3481   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3482     {
3483       /* Not enough, realloc it.  */
3484       /* +1 in case of mbcset->nchar_classes is 0.  */
3485       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3486       /* Use realloc since array is NULL if *alloc == 0.  */
3487       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3488                                                new_char_class_alloc);
3489       if (BE (new_char_classes == NULL, 0))
3490         return REG_ESPACE;
3491       mbcset->char_classes = new_char_classes;
3492       *char_class_alloc = new_char_class_alloc;
3493     }
3494   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3495 #endif /* RE_ENABLE_I18N */
3496
3497 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3498   do {                                          \
3499     if (BE (trans != NULL, 0))                  \
3500       {                                         \
3501         for (i = 0; i < SBC_MAX; ++i)           \
3502           if (ctype_func (i))                   \
3503             bitset_set (sbcset, trans[i]);      \
3504       }                                         \
3505     else                                        \
3506       {                                         \
3507         for (i = 0; i < SBC_MAX; ++i)           \
3508           if (ctype_func (i))                   \
3509             bitset_set (sbcset, i);             \
3510       }                                         \
3511   } while (0)
3512
3513   if (strcmp (name, "alnum") == 0)
3514     BUILD_CHARCLASS_LOOP (isalnum);
3515   else if (strcmp (name, "cntrl") == 0)
3516     BUILD_CHARCLASS_LOOP (iscntrl);
3517   else if (strcmp (name, "lower") == 0)
3518     BUILD_CHARCLASS_LOOP (islower);
3519   else if (strcmp (name, "space") == 0)
3520     BUILD_CHARCLASS_LOOP (isspace);
3521   else if (strcmp (name, "alpha") == 0)
3522     BUILD_CHARCLASS_LOOP (isalpha);
3523   else if (strcmp (name, "digit") == 0)
3524     BUILD_CHARCLASS_LOOP (isdigit);
3525   else if (strcmp (name, "print") == 0)
3526     BUILD_CHARCLASS_LOOP (isprint);
3527   else if (strcmp (name, "upper") == 0)
3528     BUILD_CHARCLASS_LOOP (isupper);
3529   else if (strcmp (name, "blank") == 0)
3530     BUILD_CHARCLASS_LOOP (isblank);
3531   else if (strcmp (name, "graph") == 0)
3532     BUILD_CHARCLASS_LOOP (isgraph);
3533   else if (strcmp (name, "punct") == 0)
3534     BUILD_CHARCLASS_LOOP (ispunct);
3535   else if (strcmp (name, "xdigit") == 0)
3536     BUILD_CHARCLASS_LOOP (isxdigit);
3537   else
3538     return REG_ECTYPE;
3539
3540   return REG_NOERROR;
3541 }
3542
3543 static bin_tree_t *
3544 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3545                     const unsigned char *class_name,
3546                     const unsigned char *extra, int non_match,
3547                     reg_errcode_t *err)
3548 {
3549   re_bitset_ptr_t sbcset;
3550 #ifdef RE_ENABLE_I18N
3551   re_charset_t *mbcset;
3552   int alloc = 0;
3553 #endif /* not RE_ENABLE_I18N */
3554   reg_errcode_t ret;
3555   re_token_t br_token;
3556   bin_tree_t *tree;
3557
3558   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3559 #ifdef RE_ENABLE_I18N
3560   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3561 #endif /* RE_ENABLE_I18N */
3562
3563 #ifdef RE_ENABLE_I18N
3564   if (BE (sbcset == NULL || mbcset == NULL, 0))
3565 #else /* not RE_ENABLE_I18N */
3566   if (BE (sbcset == NULL, 0))
3567 #endif /* not RE_ENABLE_I18N */
3568     {
3569       *err = REG_ESPACE;
3570       return NULL;
3571     }
3572
3573   if (non_match)
3574     {
3575 #ifdef RE_ENABLE_I18N
3576       mbcset->non_match = 1;
3577 #endif /* not RE_ENABLE_I18N */
3578     }
3579
3580   /* We don't care the syntax in this case.  */
3581   ret = build_charclass (trans, sbcset,
3582 #ifdef RE_ENABLE_I18N
3583                          mbcset, &alloc,
3584 #endif /* RE_ENABLE_I18N */
3585                          class_name, 0);
3586
3587   if (BE (ret != REG_NOERROR, 0))
3588     {
3589       re_free (sbcset);
3590 #ifdef RE_ENABLE_I18N
3591       free_charset (mbcset);
3592 #endif /* RE_ENABLE_I18N */
3593       *err = ret;
3594       return NULL;
3595     }
3596   /* \w match '_' also.  */
3597   for (; *extra; extra++)
3598     bitset_set (sbcset, *extra);
3599
3600   /* If it is non-matching list.  */
3601   if (non_match)
3602     bitset_not (sbcset);
3603
3604 #ifdef RE_ENABLE_I18N
3605   /* Ensure only single byte characters are set.  */
3606   if (dfa->mb_cur_max > 1)
3607     bitset_mask (sbcset, dfa->sb_char);
3608 #endif
3609
3610   /* Build a tree for simple bracket.  */
3611   br_token.type = SIMPLE_BRACKET;
3612   br_token.opr.sbcset = sbcset;
3613   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3614   if (BE (tree == NULL, 0))
3615     goto build_word_op_espace;
3616
3617 #ifdef RE_ENABLE_I18N
3618   if (dfa->mb_cur_max > 1)
3619     {
3620       bin_tree_t *mbc_tree;
3621       /* Build a tree for complex bracket.  */
3622       br_token.type = COMPLEX_BRACKET;
3623       br_token.opr.mbcset = mbcset;
3624       dfa->has_mb_node = 1;
3625       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3626       if (BE (mbc_tree == NULL, 0))
3627         goto build_word_op_espace;
3628       /* Then join them by ALT node.  */
3629       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3630       if (BE (mbc_tree != NULL, 1))
3631         return tree;
3632     }
3633   else
3634     {
3635       free_charset (mbcset);
3636       return tree;
3637     }
3638 #else /* not RE_ENABLE_I18N */
3639   return tree;
3640 #endif /* not RE_ENABLE_I18N */
3641
3642  build_word_op_espace:
3643   re_free (sbcset);
3644 #ifdef RE_ENABLE_I18N
3645   free_charset (mbcset);
3646 #endif /* RE_ENABLE_I18N */
3647   *err = REG_ESPACE;
3648   return NULL;
3649 }
3650
3651 /* This is intended for the expressions like "a{1,3}".
3652    Fetch a number from `input', and return the number.
3653    Return -1, if the number field is empty like "{,1}".
3654    Return -2, If an error is occured.  */
3655
3656 static int
3657 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3658 {
3659   int num = -1;
3660   unsigned char c;
3661   while (1)
3662     {
3663       fetch_token (token, input, syntax);
3664       c = token->opr.c;
3665       if (BE (token->type == END_OF_RE, 0))
3666         return -2;
3667       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3668         break;
3669       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3670              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3671       num = (num > RE_DUP_MAX) ? -2 : num;
3672     }
3673   return num;
3674 }
3675 \f
3676 #ifdef RE_ENABLE_I18N
3677 static void
3678 free_charset (re_charset_t *cset)
3679 {
3680   re_free (cset->mbchars);
3681 # ifdef _LIBC
3682   re_free (cset->coll_syms);
3683   re_free (cset->equiv_classes);
3684   re_free (cset->range_starts);
3685   re_free (cset->range_ends);
3686 # endif
3687   re_free (cset->char_classes);
3688   re_free (cset);
3689 }
3690 #endif /* RE_ENABLE_I18N */
3691 \f
3692 /* Functions for binary tree operation.  */
3693
3694 /* Create a tree node.  */
3695
3696 static bin_tree_t *
3697 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3698              re_token_type_t type)
3699 {
3700   re_token_t t;
3701   t.type = type;
3702   return create_token_tree (dfa, left, right, &t);
3703 }
3704
3705 static bin_tree_t *
3706 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3707                    const re_token_t *token)
3708 {
3709   bin_tree_t *tree;
3710   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3711     {
3712       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3713
3714       if (storage == NULL)
3715         return NULL;
3716       storage->next = dfa->str_tree_storage;
3717       dfa->str_tree_storage = storage;
3718       dfa->str_tree_storage_idx = 0;
3719     }
3720   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3721
3722   tree->parent = NULL;
3723   tree->left = left;
3724   tree->right = right;
3725   tree->token = *token;
3726   tree->token.duplicated = 0;
3727   tree->token.opt_subexp = 0;
3728   tree->first = NULL;
3729   tree->next = NULL;
3730   tree->node_idx = -1;
3731
3732   if (left != NULL)
3733     left->parent = tree;
3734   if (right != NULL)
3735     right->parent = tree;
3736   return tree;
3737 }
3738
3739 /* Mark the tree SRC as an optional subexpression.
3740    To be called from preorder or postorder.  */
3741
3742 static reg_errcode_t
3743 mark_opt_subexp (void *extra, bin_tree_t *node)
3744 {
3745   int idx = (int) (long) extra;
3746   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3747     node->token.opt_subexp = 1;
3748
3749   return REG_NOERROR;
3750 }
3751
3752 /* Free the allocated memory inside NODE. */
3753
3754 static void
3755 free_token (re_token_t *node)
3756 {
3757 #ifdef RE_ENABLE_I18N
3758   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3759     free_charset (node->opr.mbcset);
3760   else
3761 #endif /* RE_ENABLE_I18N */
3762     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3763       re_free (node->opr.sbcset);
3764 }
3765
3766 /* Worker function for tree walking.  Free the allocated memory inside NODE
3767    and its children. */
3768
3769 static reg_errcode_t
3770 free_tree (void *extra, bin_tree_t *node)
3771 {
3772   free_token (&node->token);
3773   return REG_NOERROR;
3774 }
3775
3776
3777 /* Duplicate the node SRC, and return new node.  This is a preorder
3778    visit similar to the one implemented by the generic visitor, but
3779    we need more infrastructure to maintain two parallel trees --- so,
3780    it's easier to duplicate.  */
3781
3782 static bin_tree_t *
3783 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3784 {
3785   const bin_tree_t *node;
3786   bin_tree_t *dup_root;
3787   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3788
3789   for (node = root; ; )
3790     {
3791       /* Create a new tree and link it back to the current parent.  */
3792       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3793       if (*p_new == NULL)
3794         return NULL;
3795       (*p_new)->parent = dup_node;
3796       (*p_new)->token.duplicated = 1;
3797       dup_node = *p_new;
3798
3799       /* Go to the left node, or up and to the right.  */
3800       if (node->left)
3801         {
3802           node = node->left;
3803           p_new = &dup_node->left;
3804         }
3805       else
3806         {
3807           const bin_tree_t *prev = NULL;
3808           while (node->right == prev || node->right == NULL)
3809             {
3810               prev = node;
3811               node = node->parent;
3812               dup_node = dup_node->parent;
3813               if (!node)
3814                 return dup_root;
3815             }
3816           node = node->right;
3817           p_new = &dup_node->right;
3818         }
3819     }
3820 }