posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          int node, int root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static int fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         int *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       int *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        int non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 const char __re_error_msgid[] attribute_hidden =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 const size_t __re_error_msgid_idx[] attribute_hidden =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 {
 219   reg_errcode_t ret;
 220
 221   /* And GNU code determines whether or not to get register information
 222      by passing null for the REGS argument to re_match, etc., not by
 223      setting no_sub, unless RE_NO_SUB is set.  */
 224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 225
 226   /* Match anchors at newline.  */
 227   bufp->newline_anchor = 1;
 228
 229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 230
 231   if (!ret)
 232     return NULL;
 233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 234 }
 235 #ifdef _LIBC
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237 #endif
 238
 239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 240    also be assigned to arbitrarily: each pattern buffer stores its own
 241    syntax, so it can be changed between regex compilations.  */
 242 /* This has no initializer because initialized variables in Emacs
 243    become read-only after dumping.  */
 244 reg_syntax_t re_syntax_options;
 245
 246
 247 /* Specify the precise syntax of regexps for compilation.  This provides
 248    for compatibility for various utilities which historically have
 249    different, incompatible syntaxes.
 250
 251    The argument SYNTAX is a bit mask comprised of the various bits
 252    defined in regex.h.  We return the old syntax.  */
 253
 254 reg_syntax_t
 255 re_set_syntax (syntax)
 256     reg_syntax_t syntax;
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (bufp)
 269     struct re_pattern_buffer *bufp;
 270 {
 271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 272   char *fastmap = bufp->fastmap;
 273
 274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 276   if (dfa->init_state != dfa->init_state_word)
 277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 278   if (dfa->init_state != dfa->init_state_nl)
 279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 280   if (dfa->init_state != dfa->init_state_begbuf)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 282   bufp->fastmap_accurate = 1;
 283   return 0;
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 287 #endif
 288
 289 static inline void
 290 __attribute ((always_inline))
 291 re_set_fastmap (char *fastmap, int icase, int ch)
 292 {
 293   fastmap[ch] = 1;
 294   if (icase)
 295     fastmap[tolower (ch)] = 1;
 296 }
 297
 298 /* Helper function for re_compile_fastmap.
 299    Compile fastmap for the initial_state INIT_STATE.  */
 300
 301 static void
 302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 303                          char *fastmap)
 304 {
 305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 306   int node_cnt;
 307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 309     {
 310       int node = init_state->nodes.elems[node_cnt];
 311       re_token_type_t type = dfa->nodes[node].type;
 312
 313       if (type == CHARACTER)
 314         {
 315           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 316 #ifdef RE_ENABLE_I18N
 317           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 318             {
 319               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 331                              &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, 0, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 354           int i;
 355
 356 # ifdef _LIBC
 357           /* See if we have to try all bytes which start multiple collation
 358              elements.
 359              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 360                   collation element, and don't catch 'b' since 'b' is
 361                   the only collation element which starts from 'b' (and
 362                   it is caught by SIMPLE_BRACKET).  */
 363               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 364                   && (cset->ncoll_syms || cset->nranges))
 365                 {
 366                   const int32_t *table = (const int32_t *)
 367                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 368                   for (i = 0; i < SBC_MAX; ++i)
 369                     if (table[i] < 0)
 370                       re_set_fastmap (fastmap, icase, i);
 371                 }
 372 # endif /* _LIBC */
 373
 374           /* See if we have to start the match at all multibyte characters,
 375              i.e. where we would not find an invalid sequence.  This only
 376              applies to multibyte character sets; for single byte character
 377              sets, the SIMPLE_BRACKET again suffices.  */
 378           if (dfa->mb_cur_max > 1
 379               && (cset->nchar_classes || cset->non_match || cset->nranges
 380 # ifdef _LIBC
 381                   || cset->nequiv_classes
 382 # endif /* _LIBC */
 383                  ))
 384             {
 385               unsigned char c = 0;
 386               do
 387                 {
 388                   mbstate_t mbs;
 389                   memset (&mbs, 0, sizeof (mbs));
 390                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 391                     re_set_fastmap (fastmap, false, (int) c);
 392                 }
 393               while (++c != 0);
 394             }
 395
 396           else
 397             {
 398               /* ... Else catch all bytes which can start the mbchars.  */
 399               for (i = 0; i < cset->nmbchars; ++i)
 400                 {
 401                   char buf[256];
 402                   mbstate_t state;
 403                   memset (&state, '\0', sizeof (state));
 404                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 405                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 406                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 407                     {
 408                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 409                           != (size_t) -1)
 410                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 411                     }
 412                 }
 413             }
 414         }
 415 #endif /* RE_ENABLE_I18N */
 416       else if (type == OP_PERIOD
 417 #ifdef RE_ENABLE_I18N
 418                || type == OP_UTF8_PERIOD
 419 #endif /* RE_ENABLE_I18N */
 420                || type == END_OF_RE)
 421         {
 422           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 423           if (type == END_OF_RE)
 424             bufp->can_be_null = 1;
 425           return;
 426         }
 427     }
 428 }
 429 \f
 430 /* Entry point for POSIX code.  */
 431 /* regcomp takes a regular expression as a string and compiles it.
 432
 433    PREG is a regex_t *.  We do not expect any fields to be initialized,
 434    since POSIX says we shouldn't.  Thus, we set
 435
 436      `buffer' to the compiled pattern;
 437      `used' to the length of the compiled pattern;
 438      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 440        RE_SYNTAX_POSIX_BASIC;
 441      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 442      `fastmap' to an allocated space for the fastmap;
 443      `fastmap_accurate' to zero;
 444      `re_nsub' to the number of subexpressions in PATTERN.
 445
 446    PATTERN is the address of the pattern string.
 447
 448    CFLAGS is a series of bits which affect compilation.
 449
 450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 451      use POSIX basic syntax.
 452
 453      If REG_NEWLINE is set, then . and [^...] don't match newline.
 454      Also, regexec will try a match beginning after every newline.
 455
 456      If REG_ICASE is set, then we considers upper- and lowercase
 457      versions of letters to be equivalent when matching.
 458
 459      If REG_NOSUB is set, then when PREG is passed to regexec, that
 460      routine will report only success or failure, and nothing about the
 461      registers.
 462
 463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 464    the return codes and their meanings.)  */
 465
 466 int
 467 regcomp (preg, pattern, cflags)
 468     regex_t *__restrict preg;
 469     const char *__restrict pattern;
 470     int cflags;
 471 {
 472   reg_errcode_t ret;
 473   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 474                          : RE_SYNTAX_POSIX_BASIC);
 475
 476   preg->buffer = NULL;
 477   preg->allocated = 0;
 478   preg->used = 0;
 479
 480   /* Try to allocate space for the fastmap.  */
 481   preg->fastmap = re_malloc (char, SBC_MAX);
 482   if (BE (preg->fastmap == NULL, 0))
 483     return REG_ESPACE;
 484
 485   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 486
 487   /* If REG_NEWLINE is set, newlines are treated differently.  */
 488   if (cflags & REG_NEWLINE)
 489     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 490       syntax &= ~RE_DOT_NEWLINE;
 491       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 492       /* It also changes the matching behavior.  */
 493       preg->newline_anchor = 1;
 494     }
 495   else
 496     preg->newline_anchor = 0;
 497   preg->no_sub = !!(cflags & REG_NOSUB);
 498   preg->translate = NULL;
 499
 500   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 501
 502   /* POSIX doesn't distinguish between an unmatched open-group and an
 503      unmatched close-group: both are REG_EPAREN.  */
 504   if (ret == REG_ERPAREN)
 505     ret = REG_EPAREN;
 506
 507   /* We have already checked preg->fastmap != NULL.  */
 508   if (BE (ret == REG_NOERROR, 1))
 509     /* Compute the fastmap now, since regexec cannot modify the pattern
 510        buffer.  This function never fails in this implementation.  */
 511     (void) re_compile_fastmap (preg);
 512   else
 513     {
 514       /* Some error occurred while compiling the expression.  */
 515       re_free (preg->fastmap);
 516       preg->fastmap = NULL;
 517     }
 518
 519   return (int) ret;
 520 }
 521 #ifdef _LIBC
 522 weak_alias (__regcomp, regcomp)
 523 #endif
 524
 525 /* Returns a message corresponding to an error code, ERRCODE, returned
 526    from either regcomp or regexec.   We don't use PREG here.  */
 527
 528 size_t
 529 regerror (errcode, preg, errbuf, errbuf_size)
 530     int errcode;
 531     const regex_t *__restrict preg;
 532     char *__restrict errbuf;
 533     size_t errbuf_size;
 534 {
 535   const char *msg;
 536   size_t msg_size;
 537
 538   if (BE (errcode < 0
 539           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 540                                / sizeof (__re_error_msgid_idx[0])), 0))
 541     /* Only error codes returned by the rest of the code should be passed
 542        to this routine.  If we are given anything else, or if other regex
 543        code generates an invalid error code, then the program has a bug.
 544        Dump core so we can fix it.  */
 545     abort ();
 546
 547   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 548
 549   msg_size = strlen (msg) + 1; /* Includes the null.  */
 550
 551   if (BE (errbuf_size != 0, 1))
 552     {
 553       if (BE (msg_size > errbuf_size, 0))
 554         {
 555 #if defined HAVE_MEMPCPY || defined _LIBC
 556           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 557 #else
 558           memcpy (errbuf, msg, errbuf_size - 1);
 559           errbuf[errbuf_size - 1] = 0;
 560 #endif
 561         }
 562       else
 563         memcpy (errbuf, msg, msg_size);
 564     }
 565
 566   return msg_size;
 567 }
 568 #ifdef _LIBC
 569 weak_alias (__regerror, regerror)
 570 #endif
 571
 572
 573 #ifdef RE_ENABLE_I18N
 574 /* This static array is used for the map to single-byte characters when
 575    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 576    it the same all the time.  UTF-8 is the preferred encoding so this is
 577    a worthwhile optimization.  */
 578 static const bitset_t utf8_sb_map =
 579 {
 580   /* Set the first 128 bits.  */
 581   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 582 };
 583 #endif
 584
 585
 586 static void
 587 free_dfa_content (re_dfa_t *dfa)
 588 {
 589   int i, j;
 590
 591   if (dfa->nodes)
 592     for (i = 0; i < dfa->nodes_len; ++i)
 593       free_token (dfa->nodes + i);
 594   re_free (dfa->nexts);
 595   for (i = 0; i < dfa->nodes_len; ++i)
 596     {
 597       if (dfa->eclosures != NULL)
 598         re_node_set_free (dfa->eclosures + i);
 599       if (dfa->inveclosures != NULL)
 600         re_node_set_free (dfa->inveclosures + i);
 601       if (dfa->edests != NULL)
 602         re_node_set_free (dfa->edests + i);
 603     }
 604   re_free (dfa->edests);
 605   re_free (dfa->eclosures);
 606   re_free (dfa->inveclosures);
 607   re_free (dfa->nodes);
 608
 609   if (dfa->state_table)
 610     for (i = 0; i <= dfa->state_hash_mask; ++i)
 611       {
 612         struct re_state_table_entry *entry = dfa->state_table + i;
 613         for (j = 0; j < entry->num; ++j)
 614           {
 615             re_dfastate_t *state = entry->array[j];
 616             free_state (state);
 617           }
 618         re_free (entry->array);
 619       }
 620   re_free (dfa->state_table);
 621 #ifdef RE_ENABLE_I18N
 622   if (dfa->sb_char != utf8_sb_map)
 623     re_free (dfa->sb_char);
 624 #endif
 625   re_free (dfa->subexp_map);
 626 #ifdef DEBUG
 627   re_free (dfa->re_str);
 628 #endif
 629
 630   re_free (dfa);
 631 }
 632
 633
 634 /* Free dynamically allocated space used by PREG.  */
 635
 636 void
 637 regfree (preg)
 638     regex_t *preg;
 639 {
 640   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 641   if (BE (dfa != NULL, 1))
 642     free_dfa_content (dfa);
 643   preg->buffer = NULL;
 644   preg->allocated = 0;
 645
 646   re_free (preg->fastmap);
 647   preg->fastmap = NULL;
 648
 649   re_free (preg->translate);
 650   preg->translate = NULL;
 651 }
 652 #ifdef _LIBC
 653 weak_alias (__regfree, regfree)
 654 #endif
 655 \f
 656 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 657    them unless specifically requested.  */
 658
 659 #if defined _REGEX_RE_COMP || defined _LIBC
 660
 661 /* BSD has one and only one pattern buffer.  */
 662 static struct re_pattern_buffer re_comp_buf;
 663
 664 char *
 665 # ifdef _LIBC
 666 /* Make these definitions weak in libc, so POSIX programs can redefine
 667    these names if they don't use our functions, and still use
 668    regcomp/regexec above without link errors.  */
 669 weak_function
 670 # endif
 671 re_comp (s)
 672      const char *s;
 673 {
 674   reg_errcode_t ret;
 675   char *fastmap;
 676
 677   if (!s)
 678     {
 679       if (!re_comp_buf.buffer)
 680         return gettext ("No previous regular expression");
 681       return 0;
 682     }
 683
 684   if (re_comp_buf.buffer)
 685     {
 686       fastmap = re_comp_buf.fastmap;
 687       re_comp_buf.fastmap = NULL;
 688       __regfree (&re_comp_buf);
 689       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 690       re_comp_buf.fastmap = fastmap;
 691     }
 692
 693   if (re_comp_buf.fastmap == NULL)
 694     {
 695       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 696       if (re_comp_buf.fastmap == NULL)
 697         return (char *) gettext (__re_error_msgid
 698                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 699     }
 700
 701   /* Since `re_exec' always passes NULL for the `regs' argument, we
 702      don't need to initialize the pattern buffer fields which affect it.  */
 703
 704   /* Match anchors at newlines.  */
 705   re_comp_buf.newline_anchor = 1;
 706
 707   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 708
 709   if (!ret)
 710     return NULL;
 711
 712   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 713   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 714 }
 715
 716 #ifdef _LIBC
 717 libc_freeres_fn (free_mem)
 718 {
 719   __regfree (&re_comp_buf);
 720 }
 721 #endif
 722
 723 #endif /* _REGEX_RE_COMP */
 724 \f
 725 /* Internal entry point.
 726    Compile the regular expression PATTERN, whose length is LENGTH.
 727    SYNTAX indicate regular expression's syntax.  */
 728
 729 static reg_errcode_t
 730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 731                      reg_syntax_t syntax)
 732 {
 733   reg_errcode_t err = REG_NOERROR;
 734   re_dfa_t *dfa;
 735   re_string_t regexp;
 736
 737   /* Initialize the pattern buffer.  */
 738   preg->fastmap_accurate = 0;
 739   preg->syntax = syntax;
 740   preg->not_bol = preg->not_eol = 0;
 741   preg->used = 0;
 742   preg->re_nsub = 0;
 743   preg->can_be_null = 0;
 744   preg->regs_allocated = REGS_UNALLOCATED;
 745
 746   /* Initialize the dfa.  */
 747   dfa = (re_dfa_t *) preg->buffer;
 748   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 749     {
 750       /* If zero allocated, but buffer is non-null, try to realloc
 751          enough space.  This loses if buffer's address is bogus, but
 752          that is the user's responsibility.  If ->buffer is NULL this
 753          is a simple allocation.  */
 754       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 755       if (dfa == NULL)
 756         return REG_ESPACE;
 757       preg->allocated = sizeof (re_dfa_t);
 758       preg->buffer = (unsigned char *) dfa;
 759     }
 760   preg->used = sizeof (re_dfa_t);
 761
 762   err = init_dfa (dfa, length);
 763   if (BE (err != REG_NOERROR, 0))
 764     {
 765       free_dfa_content (dfa);
 766       preg->buffer = NULL;
 767       preg->allocated = 0;
 768       return err;
 769     }
 770 #ifdef DEBUG
 771   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 772   dfa->re_str = re_malloc (char, length + 1);
 773   strncpy (dfa->re_str, pattern, length + 1);
 774 #endif
 775
 776   __libc_lock_init (dfa->lock);
 777
 778   err = re_string_construct (&regexp, pattern, length, preg->translate,
 779                              syntax & RE_ICASE, dfa);
 780   if (BE (err != REG_NOERROR, 0))
 781     {
 782     re_compile_internal_free_return:
 783       free_workarea_compile (preg);
 784       re_string_destruct (&regexp);
 785       free_dfa_content (dfa);
 786       preg->buffer = NULL;
 787       preg->allocated = 0;
 788       return err;
 789     }
 790
 791   /* Parse the regular expression, and build a structure tree.  */
 792   preg->re_nsub = 0;
 793   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 794   if (BE (dfa->str_tree == NULL, 0))
 795     goto re_compile_internal_free_return;
 796
 797   /* Analyze the tree and create the nfa.  */
 798   err = analyze (preg);
 799   if (BE (err != REG_NOERROR, 0))
 800     goto re_compile_internal_free_return;
 801
 802 #ifdef RE_ENABLE_I18N
 803   /* If possible, do searching in single byte encoding to speed things up.  */
 804   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 805     optimize_utf8 (dfa);
 806 #endif
 807
 808   /* Then create the initial state of the dfa.  */
 809   err = create_initial_state (dfa);
 810
 811   /* Release work areas.  */
 812   free_workarea_compile (preg);
 813   re_string_destruct (&regexp);
 814
 815   if (BE (err != REG_NOERROR, 0))
 816     {
 817       free_dfa_content (dfa);
 818       preg->buffer = NULL;
 819       preg->allocated = 0;
 820     }
 821
 822   return err;
 823 }
 824
 825 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 826    as the initial length of some arrays.  */
 827
 828 static reg_errcode_t
 829 init_dfa (re_dfa_t *dfa, size_t pat_len)
 830 {
 831   unsigned int table_size;
 832 #ifndef _LIBC
 833   char *codeset_name;
 834 #endif
 835
 836   memset (dfa, '\0', sizeof (re_dfa_t));
 837
 838   /* Force allocation of str_tree_storage the first time.  */
 839   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 840
 841   /* Avoid overflows.  */
 842   if (pat_len == SIZE_MAX)
 843     return REG_ESPACE;
 844
 845   dfa->nodes_alloc = pat_len + 1;
 846   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 847
 848   /*  table_size = 2 ^ ceil(log pat_len) */
 849   for (table_size = 1; ; table_size <<= 1)
 850     if (table_size > pat_len)
 851       break;
 852
 853   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 854   dfa->state_hash_mask = table_size - 1;
 855
 856   dfa->mb_cur_max = MB_CUR_MAX;
 857 #ifdef _LIBC
 858   if (dfa->mb_cur_max == 6
 859       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 860     dfa->is_utf8 = 1;
 861   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 862                        != 0);
 863 #else
 864 # ifdef HAVE_LANGINFO_CODESET
 865   codeset_name = nl_langinfo (CODESET);
 866 # else
 867   codeset_name = getenv ("LC_ALL");
 868   if (codeset_name == NULL || codeset_name[0] == '\0')
 869     codeset_name = getenv ("LC_CTYPE");
 870   if (codeset_name == NULL || codeset_name[0] == '\0')
 871     codeset_name = getenv ("LANG");
 872   if (codeset_name == NULL)
 873     codeset_name = "";
 874   else if (strchr (codeset_name, '.') !=  NULL)
 875     codeset_name = strchr (codeset_name, '.') + 1;
 876 # endif
 877
 878   if (strcasecmp (codeset_name, "UTF-8") == 0
 879       || strcasecmp (codeset_name, "UTF8") == 0)
 880     dfa->is_utf8 = 1;
 881
 882   /* We check exhaustively in the loop below if this charset is a
 883      superset of ASCII.  */
 884   dfa->map_notascii = 0;
 885 #endif
 886
 887 #ifdef RE_ENABLE_I18N
 888   if (dfa->mb_cur_max > 1)
 889     {
 890       if (dfa->is_utf8)
 891         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 892       else
 893         {
 894           int i, j, ch;
 895
 896           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 897           if (BE (dfa->sb_char == NULL, 0))
 898             return REG_ESPACE;
 899
 900           /* Set the bits corresponding to single byte chars.  */
 901           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 902             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 903               {
 904                 wint_t wch = __btowc (ch);
 905                 if (wch != WEOF)
 906                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 907 # ifndef _LIBC
 908                 if (isascii (ch) && wch != ch)
 909                   dfa->map_notascii = 1;
 910 # endif
 911               }
 912         }
 913     }
 914 #endif
 915
 916   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 917     return REG_ESPACE;
 918   return REG_NOERROR;
 919 }
 920
 921 /* Initialize WORD_CHAR table, which indicate which character is
 922    "word".  In this case "word" means that it is the word construction
 923    character used by some operators like "\<", "\>", etc.  */
 924
 925 static void
 926 internal_function
 927 init_word_char (re_dfa_t *dfa)
 928 {
 929   int i, j, ch;
 930   dfa->word_ops_used = 1;
 931   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 932     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 933       if (isalnum (ch) || ch == '_')
 934         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 935 }
 936
 937 /* Free the work area which are only used while compiling.  */
 938
 939 static void
 940 free_workarea_compile (regex_t *preg)
 941 {
 942   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 943   bin_tree_storage_t *storage, *next;
 944   for (storage = dfa->str_tree_storage; storage; storage = next)
 945     {
 946       next = storage->next;
 947       re_free (storage);
 948     }
 949   dfa->str_tree_storage = NULL;
 950   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 951   dfa->str_tree = NULL;
 952   re_free (dfa->org_indices);
 953   dfa->org_indices = NULL;
 954 }
 955
 956 /* Create initial states for all contexts.  */
 957
 958 static reg_errcode_t
 959 create_initial_state (re_dfa_t *dfa)
 960 {
 961   int first, i;
 962   reg_errcode_t err;
 963   re_node_set init_nodes;
 964
 965   /* Initial states have the epsilon closure of the node which is
 966      the first node of the regular expression.  */
 967   first = dfa->str_tree->first->node_idx;
 968   dfa->init_node = first;
 969   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 970   if (BE (err != REG_NOERROR, 0))
 971     return err;
 972
 973   /* The back-references which are in initial states can epsilon transit,
 974      since in this case all of the subexpressions can be null.
 975      Then we add epsilon closures of the nodes which are the next nodes of
 976      the back-references.  */
 977   if (dfa->nbackref > 0)
 978     for (i = 0; i < init_nodes.nelem; ++i)
 979       {
 980         int node_idx = init_nodes.elems[i];
 981         re_token_type_t type = dfa->nodes[node_idx].type;
 982
 983         int clexp_idx;
 984         if (type != OP_BACK_REF)
 985           continue;
 986         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 987           {
 988             re_token_t *clexp_node;
 989             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 990             if (clexp_node->type == OP_CLOSE_SUBEXP
 991                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 992               break;
 993           }
 994         if (clexp_idx == init_nodes.nelem)
 995           continue;
 996
 997         if (type == OP_BACK_REF)
 998           {
 999             int dest_idx = dfa->edests[node_idx].elems[0];
1000             if (!re_node_set_contains (&init_nodes, dest_idx))
1001               {
1002                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1003                                                        dfa->eclosures
1004                                                        + dest_idx);
1005                 if (err != REG_NOERROR)
1006                   return err;
1007                 i = 0;
1008               }
1009           }
1010       }
1011
1012   /* It must be the first time to invoke acquire_state.  */
1013   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1014   /* We don't check ERR here, since the initial state must not be NULL.  */
1015   if (BE (dfa->init_state == NULL, 0))
1016     return err;
1017   if (dfa->init_state->has_constraint)
1018     {
1019       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1020                                                        CONTEXT_WORD);
1021       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1022                                                      CONTEXT_NEWLINE);
1023       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1024                                                          &init_nodes,
1025                                                          CONTEXT_NEWLINE
1026                                                          | CONTEXT_BEGBUF);
1027       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1028               || dfa->init_state_begbuf == NULL, 0))
1029         return err;
1030     }
1031   else
1032     dfa->init_state_word = dfa->init_state_nl
1033       = dfa->init_state_begbuf = dfa->init_state;
1034
1035   re_node_set_free (&init_nodes);
1036   return REG_NOERROR;
1037 }
1038 \f
1039 #ifdef RE_ENABLE_I18N
1040 /* If it is possible to do searching in single byte encoding instead of UTF-8
1041    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1042    DFA nodes where needed.  */
1043
1044 static void
1045 optimize_utf8 (re_dfa_t *dfa)
1046 {
1047   int node, i, mb_chars = 0, has_period = 0;
1048
1049   for (node = 0; node < dfa->nodes_len; ++node)
1050     switch (dfa->nodes[node].type)
1051       {
1052       case CHARACTER:
1053         if (dfa->nodes[node].opr.c >= 0x80)
1054           mb_chars = 1;
1055         break;
1056       case ANCHOR:
1057         switch (dfa->nodes[node].opr.ctx_type)
1058           {
1059           case LINE_FIRST:
1060           case LINE_LAST:
1061           case BUF_FIRST:
1062           case BUF_LAST:
1063             break;
1064           default:
1065             /* Word anchors etc. cannot be handled.  It's okay to test
1066                opr.ctx_type since constraints (for all DFA nodes) are
1067                created by ORing one or more opr.ctx_type values.  */
1068             return;
1069           }
1070         break;
1071       case OP_PERIOD:
1072         has_period = 1;
1073         break;
1074       case OP_BACK_REF:
1075       case OP_ALT:
1076       case END_OF_RE:
1077       case OP_DUP_ASTERISK:
1078       case OP_OPEN_SUBEXP:
1079       case OP_CLOSE_SUBEXP:
1080         break;
1081       case COMPLEX_BRACKET:
1082         return;
1083       case SIMPLE_BRACKET:
1084         /* Just double check.  The non-ASCII range starts at 0x80.  */
1085         assert (0x80 % BITSET_WORD_BITS == 0);
1086         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1087           if (dfa->nodes[node].opr.sbcset[i])
1088             return;
1089         break;
1090       default:
1091         abort ();
1092       }
1093
1094   if (mb_chars || has_period)
1095     for (node = 0; node < dfa->nodes_len; ++node)
1096       {
1097         if (dfa->nodes[node].type == CHARACTER
1098             && dfa->nodes[node].opr.c >= 0x80)
1099           dfa->nodes[node].mb_partial = 0;
1100         else if (dfa->nodes[node].type == OP_PERIOD)
1101           dfa->nodes[node].type = OP_UTF8_PERIOD;
1102       }
1103
1104   /* The search can be in single byte locale.  */
1105   dfa->mb_cur_max = 1;
1106   dfa->is_utf8 = 0;
1107   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1108 }
1109 #endif
1110 \f
1111 /* Analyze the structure tree, and calculate "first", "next", "edest",
1112    "eclosure", and "inveclosure".  */
1113
1114 static reg_errcode_t
1115 analyze (regex_t *preg)
1116 {
1117   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1118   reg_errcode_t ret;
1119
1120   /* Allocate arrays.  */
1121   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1122   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1123   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1124   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1125   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1126           || dfa->eclosures == NULL, 0))
1127     return REG_ESPACE;
1128
1129   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1130   if (dfa->subexp_map != NULL)
1131     {
1132       int i;
1133       for (i = 0; i < preg->re_nsub; i++)
1134         dfa->subexp_map[i] = i;
1135       preorder (dfa->str_tree, optimize_subexps, dfa);
1136       for (i = 0; i < preg->re_nsub; i++)
1137         if (dfa->subexp_map[i] != i)
1138           break;
1139       if (i == preg->re_nsub)
1140         {
1141           free (dfa->subexp_map);
1142           dfa->subexp_map = NULL;
1143         }
1144     }
1145
1146   ret = postorder (dfa->str_tree, lower_subexps, preg);
1147   if (BE (ret != REG_NOERROR, 0))
1148     return ret;
1149   ret = postorder (dfa->str_tree, calc_first, dfa);
1150   if (BE (ret != REG_NOERROR, 0))
1151     return ret;
1152   preorder (dfa->str_tree, calc_next, dfa);
1153   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1154   if (BE (ret != REG_NOERROR, 0))
1155     return ret;
1156   ret = calc_eclosure (dfa);
1157   if (BE (ret != REG_NOERROR, 0))
1158     return ret;
1159
1160   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1161      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1162   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1163       || dfa->nbackref)
1164     {
1165       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1166       if (BE (dfa->inveclosures == NULL, 0))
1167         return REG_ESPACE;
1168       ret = calc_inveclosure (dfa);
1169     }
1170
1171   return ret;
1172 }
1173
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175    implement parse tree visits.  Instead, we use parent pointers and
1176    some hairy code in these two functions.  */
1177 static reg_errcode_t
1178 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1179            void *extra)
1180 {
1181   bin_tree_t *node, *prev;
1182
1183   for (node = root; ; )
1184     {
1185       /* Descend down the tree, preferably to the left (or to the right
1186          if that's the only child).  */
1187       while (node->left || node->right)
1188         if (node->left)
1189           node = node->left;
1190         else
1191           node = node->right;
1192
1193       do
1194         {
1195           reg_errcode_t err = fn (extra, node);
1196           if (BE (err != REG_NOERROR, 0))
1197             return err;
1198           if (node->parent == NULL)
1199             return REG_NOERROR;
1200           prev = node;
1201           node = node->parent;
1202         }
1203       /* Go up while we have a node that is reached from the right.  */
1204       while (node->right == prev || node->right == NULL);
1205       node = node->right;
1206     }
1207 }
1208
1209 static reg_errcode_t
1210 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211           void *extra)
1212 {
1213   bin_tree_t *node;
1214
1215   for (node = root; ; )
1216     {
1217       reg_errcode_t err = fn (extra, node);
1218       if (BE (err != REG_NOERROR, 0))
1219         return err;
1220
1221       /* Go to the left node, or up and to the right.  */
1222       if (node->left)
1223         node = node->left;
1224       else
1225         {
1226           bin_tree_t *prev = NULL;
1227           while (node->right == prev || node->right == NULL)
1228             {
1229               prev = node;
1230               node = node->parent;
1231               if (!node)
1232                 return REG_NOERROR;
1233             }
1234           node = node->right;
1235         }
1236     }
1237 }
1238
1239 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1240    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1241    backreferences as well.  Requires a preorder visit.  */
1242 static reg_errcode_t
1243 optimize_subexps (void *extra, bin_tree_t *node)
1244 {
1245   re_dfa_t *dfa = (re_dfa_t *) extra;
1246
1247   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1248     {
1249       int idx = node->token.opr.idx;
1250       node->token.opr.idx = dfa->subexp_map[idx];
1251       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1252     }
1253
1254   else if (node->token.type == SUBEXP
1255            && node->left && node->left->token.type == SUBEXP)
1256     {
1257       int other_idx = node->left->token.opr.idx;
1258
1259       node->left = node->left->left;
1260       if (node->left)
1261         node->left->parent = node;
1262
1263       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1264       if (other_idx < BITSET_WORD_BITS)
1265           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1266     }
1267
1268   return REG_NOERROR;
1269 }
1270
1271 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1272    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1273 static reg_errcode_t
1274 lower_subexps (void *extra, bin_tree_t *node)
1275 {
1276   regex_t *preg = (regex_t *) extra;
1277   reg_errcode_t err = REG_NOERROR;
1278
1279   if (node->left && node->left->token.type == SUBEXP)
1280     {
1281       node->left = lower_subexp (&err, preg, node->left);
1282       if (node->left)
1283         node->left->parent = node;
1284     }
1285   if (node->right && node->right->token.type == SUBEXP)
1286     {
1287       node->right = lower_subexp (&err, preg, node->right);
1288       if (node->right)
1289         node->right->parent = node;
1290     }
1291
1292   return err;
1293 }
1294
1295 static bin_tree_t *
1296 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1297 {
1298   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1299   bin_tree_t *body = node->left;
1300   bin_tree_t *op, *cls, *tree1, *tree;
1301
1302   if (preg->no_sub
1303       /* We do not optimize empty subexpressions, because otherwise we may
1304          have bad CONCAT nodes with NULL children.  This is obviously not
1305          very common, so we do not lose much.  An example that triggers
1306          this case is the sed "script" /\(\)/x.  */
1307       && node->left != NULL
1308       && (node->token.opr.idx >= BITSET_WORD_BITS
1309           || !(dfa->used_bkref_map
1310                & ((bitset_word_t) 1 << node->token.opr.idx))))
1311     return node->left;
1312
1313   /* Convert the SUBEXP node to the concatenation of an
1314      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1315   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1316   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1317   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1318   tree = create_tree (dfa, op, tree1, CONCAT);
1319   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1320     {
1321       *err = REG_ESPACE;
1322       return NULL;
1323     }
1324
1325   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1326   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1327   return tree;
1328 }
1329
1330 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1331    nodes.  Requires a postorder visit.  */
1332 static reg_errcode_t
1333 calc_first (void *extra, bin_tree_t *node)
1334 {
1335   re_dfa_t *dfa = (re_dfa_t *) extra;
1336   if (node->token.type == CONCAT)
1337     {
1338       node->first = node->left->first;
1339       node->node_idx = node->left->node_idx;
1340     }
1341   else
1342     {
1343       node->first = node;
1344       node->node_idx = re_dfa_add_node (dfa, node->token);
1345       if (BE (node->node_idx == -1, 0))
1346         return REG_ESPACE;
1347       if (node->token.type == ANCHOR)
1348         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1349     }
1350   return REG_NOERROR;
1351 }
1352
1353 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1354 static reg_errcode_t
1355 calc_next (void *extra, bin_tree_t *node)
1356 {
1357   switch (node->token.type)
1358     {
1359     case OP_DUP_ASTERISK:
1360       node->left->next = node;
1361       break;
1362     case CONCAT:
1363       node->left->next = node->right->first;
1364       node->right->next = node->next;
1365       break;
1366     default:
1367       if (node->left)
1368         node->left->next = node->next;
1369       if (node->right)
1370         node->right->next = node->next;
1371       break;
1372     }
1373   return REG_NOERROR;
1374 }
1375
1376 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1377 static reg_errcode_t
1378 link_nfa_nodes (void *extra, bin_tree_t *node)
1379 {
1380   re_dfa_t *dfa = (re_dfa_t *) extra;
1381   int idx = node->node_idx;
1382   reg_errcode_t err = REG_NOERROR;
1383
1384   switch (node->token.type)
1385     {
1386     case CONCAT:
1387       break;
1388
1389     case END_OF_RE:
1390       assert (node->next == NULL);
1391       break;
1392
1393     case OP_DUP_ASTERISK:
1394     case OP_ALT:
1395       {
1396         int left, right;
1397         dfa->has_plural_match = 1;
1398         if (node->left != NULL)
1399           left = node->left->first->node_idx;
1400         else
1401           left = node->next->node_idx;
1402         if (node->right != NULL)
1403           right = node->right->first->node_idx;
1404         else
1405           right = node->next->node_idx;
1406         assert (left > -1);
1407         assert (right > -1);
1408         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1409       }
1410       break;
1411
1412     case ANCHOR:
1413     case OP_OPEN_SUBEXP:
1414     case OP_CLOSE_SUBEXP:
1415       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1416       break;
1417
1418     case OP_BACK_REF:
1419       dfa->nexts[idx] = node->next->node_idx;
1420       if (node->token.type == OP_BACK_REF)
1421         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1422       break;
1423
1424     default:
1425       assert (!IS_EPSILON_NODE (node->token.type));
1426       dfa->nexts[idx] = node->next->node_idx;
1427       break;
1428     }
1429
1430   return err;
1431 }
1432
1433 /* Duplicate the epsilon closure of the node ROOT_NODE.
1434    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1435    to their own constraint.  */
1436
1437 static reg_errcode_t
1438 internal_function
1439 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1440                         int root_node, unsigned int init_constraint)
1441 {
1442   int org_node, clone_node, ret;
1443   unsigned int constraint = init_constraint;
1444   for (org_node = top_org_node, clone_node = top_clone_node;;)
1445     {
1446       int org_dest, clone_dest;
1447       if (dfa->nodes[org_node].type == OP_BACK_REF)
1448         {
1449           /* If the back reference epsilon-transit, its destination must
1450              also have the constraint.  Then duplicate the epsilon closure
1451              of the destination of the back reference, and store it in
1452              edests of the back reference.  */
1453           org_dest = dfa->nexts[org_node];
1454           re_node_set_empty (dfa->edests + clone_node);
1455           clone_dest = duplicate_node (dfa, org_dest, constraint);
1456           if (BE (clone_dest == -1, 0))
1457             return REG_ESPACE;
1458           dfa->nexts[clone_node] = dfa->nexts[org_node];
1459           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1460           if (BE (ret < 0, 0))
1461             return REG_ESPACE;
1462         }
1463       else if (dfa->edests[org_node].nelem == 0)
1464         {
1465           /* In case of the node can't epsilon-transit, don't duplicate the
1466              destination and store the original destination as the
1467              destination of the node.  */
1468           dfa->nexts[clone_node] = dfa->nexts[org_node];
1469           break;
1470         }
1471       else if (dfa->edests[org_node].nelem == 1)
1472         {
1473           /* In case of the node can epsilon-transit, and it has only one
1474              destination.  */
1475           org_dest = dfa->edests[org_node].elems[0];
1476           re_node_set_empty (dfa->edests + clone_node);
1477           /* If the node is root_node itself, it means the epsilon clsoure
1478              has a loop.   Then tie it to the destination of the root_node.  */
1479           if (org_node == root_node && clone_node != org_node)
1480             {
1481               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1482               if (BE (ret < 0, 0))
1483                 return REG_ESPACE;
1484               break;
1485             }
1486           /* In case of the node has another constraint, add it.  */
1487           constraint |= dfa->nodes[org_node].constraint;
1488           clone_dest = duplicate_node (dfa, org_dest, constraint);
1489           if (BE (clone_dest == -1, 0))
1490             return REG_ESPACE;
1491           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492           if (BE (ret < 0, 0))
1493             return REG_ESPACE;
1494         }
1495       else /* dfa->edests[org_node].nelem == 2 */
1496         {
1497           /* In case of the node can epsilon-transit, and it has two
1498              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1499           org_dest = dfa->edests[org_node].elems[0];
1500           re_node_set_empty (dfa->edests + clone_node);
1501           /* Search for a duplicated node which satisfies the constraint.  */
1502           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1503           if (clone_dest == -1)
1504             {
1505               /* There is no such duplicated node, create a new one.  */
1506               reg_errcode_t err;
1507               clone_dest = duplicate_node (dfa, org_dest, constraint);
1508               if (BE (clone_dest == -1, 0))
1509                 return REG_ESPACE;
1510               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1511               if (BE (ret < 0, 0))
1512                 return REG_ESPACE;
1513               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1514                                             root_node, constraint);
1515               if (BE (err != REG_NOERROR, 0))
1516                 return err;
1517             }
1518           else
1519             {
1520               /* There is a duplicated node which satisfies the constraint,
1521                  use it to avoid infinite loop.  */
1522               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523               if (BE (ret < 0, 0))
1524                 return REG_ESPACE;
1525             }
1526
1527           org_dest = dfa->edests[org_node].elems[1];
1528           clone_dest = duplicate_node (dfa, org_dest, constraint);
1529           if (BE (clone_dest == -1, 0))
1530             return REG_ESPACE;
1531           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1532           if (BE (ret < 0, 0))
1533             return REG_ESPACE;
1534         }
1535       org_node = org_dest;
1536       clone_node = clone_dest;
1537     }
1538   return REG_NOERROR;
1539 }
1540
1541 /* Search for a node which is duplicated from the node ORG_NODE, and
1542    satisfies the constraint CONSTRAINT.  */
1543
1544 static int
1545 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1546                         unsigned int constraint)
1547 {
1548   int idx;
1549   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1550     {
1551       if (org_node == dfa->org_indices[idx]
1552           && constraint == dfa->nodes[idx].constraint)
1553         return idx; /* Found.  */
1554     }
1555   return -1; /* Not found.  */
1556 }
1557
1558 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1559    Return the index of the new node, or -1 if insufficient storage is
1560    available.  */
1561
1562 static int
1563 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1564 {
1565   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1566   if (BE (dup_idx != -1, 1))
1567     {
1568       dfa->nodes[dup_idx].constraint = constraint;
1569       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1570       dfa->nodes[dup_idx].duplicated = 1;
1571
1572       /* Store the index of the original node.  */
1573       dfa->org_indices[dup_idx] = org_idx;
1574     }
1575   return dup_idx;
1576 }
1577
1578 static reg_errcode_t
1579 calc_inveclosure (re_dfa_t *dfa)
1580 {
1581   int src, idx, ret;
1582   for (idx = 0; idx < dfa->nodes_len; ++idx)
1583     re_node_set_init_empty (dfa->inveclosures + idx);
1584
1585   for (src = 0; src < dfa->nodes_len; ++src)
1586     {
1587       int *elems = dfa->eclosures[src].elems;
1588       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1589         {
1590           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1591           if (BE (ret == -1, 0))
1592             return REG_ESPACE;
1593         }
1594     }
1595
1596   return REG_NOERROR;
1597 }
1598
1599 /* Calculate "eclosure" for all the node in DFA.  */
1600
1601 static reg_errcode_t
1602 calc_eclosure (re_dfa_t *dfa)
1603 {
1604   int node_idx, incomplete;
1605 #ifdef DEBUG
1606   assert (dfa->nodes_len > 0);
1607 #endif
1608   incomplete = 0;
1609   /* For each nodes, calculate epsilon closure.  */
1610   for (node_idx = 0; ; ++node_idx)
1611     {
1612       reg_errcode_t err;
1613       re_node_set eclosure_elem;
1614       if (node_idx == dfa->nodes_len)
1615         {
1616           if (!incomplete)
1617             break;
1618           incomplete = 0;
1619           node_idx = 0;
1620         }
1621
1622 #ifdef DEBUG
1623       assert (dfa->eclosures[node_idx].nelem != -1);
1624 #endif
1625
1626       /* If we have already calculated, skip it.  */
1627       if (dfa->eclosures[node_idx].nelem != 0)
1628         continue;
1629       /* Calculate epsilon closure of `node_idx'.  */
1630       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1631       if (BE (err != REG_NOERROR, 0))
1632         return err;
1633
1634       if (dfa->eclosures[node_idx].nelem == 0)
1635         {
1636           incomplete = 1;
1637           re_node_set_free (&eclosure_elem);
1638         }
1639     }
1640   return REG_NOERROR;
1641 }
1642
1643 /* Calculate epsilon closure of NODE.  */
1644
1645 static reg_errcode_t
1646 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1647 {
1648   reg_errcode_t err;
1649   int i;
1650   re_node_set eclosure;
1651   int ret;
1652   int incomplete = 0;
1653   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1654   if (BE (err != REG_NOERROR, 0))
1655     return err;
1656
1657   /* This indicates that we are calculating this node now.
1658      We reference this value to avoid infinite loop.  */
1659   dfa->eclosures[node].nelem = -1;
1660
1661   /* If the current node has constraints, duplicate all nodes
1662      since they must inherit the constraints.  */
1663   if (dfa->nodes[node].constraint
1664       && dfa->edests[node].nelem
1665       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1666     {
1667       err = duplicate_node_closure (dfa, node, node, node,
1668                                     dfa->nodes[node].constraint);
1669       if (BE (err != REG_NOERROR, 0))
1670         return err;
1671     }
1672
1673   /* Expand each epsilon destination nodes.  */
1674   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1675     for (i = 0; i < dfa->edests[node].nelem; ++i)
1676       {
1677         re_node_set eclosure_elem;
1678         int edest = dfa->edests[node].elems[i];
1679         /* If calculating the epsilon closure of `edest' is in progress,
1680            return intermediate result.  */
1681         if (dfa->eclosures[edest].nelem == -1)
1682           {
1683             incomplete = 1;
1684             continue;
1685           }
1686         /* If we haven't calculated the epsilon closure of `edest' yet,
1687            calculate now. Otherwise use calculated epsilon closure.  */
1688         if (dfa->eclosures[edest].nelem == 0)
1689           {
1690             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1691             if (BE (err != REG_NOERROR, 0))
1692               return err;
1693           }
1694         else
1695           eclosure_elem = dfa->eclosures[edest];
1696         /* Merge the epsilon closure of `edest'.  */
1697         err = re_node_set_merge (&eclosure, &eclosure_elem);
1698         if (BE (err != REG_NOERROR, 0))
1699           return err;
1700         /* If the epsilon closure of `edest' is incomplete,
1701            the epsilon closure of this node is also incomplete.  */
1702         if (dfa->eclosures[edest].nelem == 0)
1703           {
1704             incomplete = 1;
1705             re_node_set_free (&eclosure_elem);
1706           }
1707       }
1708
1709   /* An epsilon closure includes itself.  */
1710   ret = re_node_set_insert (&eclosure, node);
1711   if (BE (ret < 0, 0))
1712     return REG_ESPACE;
1713   if (incomplete && !root)
1714     dfa->eclosures[node].nelem = 0;
1715   else
1716     dfa->eclosures[node] = eclosure;
1717   *new_set = eclosure;
1718   return REG_NOERROR;
1719 }
1720 \f
1721 /* Functions for token which are used in the parser.  */
1722
1723 /* Fetch a token from INPUT.
1724    We must not use this function inside bracket expressions.  */
1725
1726 static void
1727 internal_function
1728 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1729 {
1730   re_string_skip_bytes (input, peek_token (result, input, syntax));
1731 }
1732
1733 /* Peek a token from INPUT, and return the length of the token.
1734    We must not use this function inside bracket expressions.  */
1735
1736 static int
1737 internal_function
1738 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1739 {
1740   unsigned char c;
1741
1742   if (re_string_eoi (input))
1743     {
1744       token->type = END_OF_RE;
1745       return 0;
1746     }
1747
1748   c = re_string_peek_byte (input, 0);
1749   token->opr.c = c;
1750
1751   token->word_char = 0;
1752 #ifdef RE_ENABLE_I18N
1753   token->mb_partial = 0;
1754   if (input->mb_cur_max > 1 &&
1755       !re_string_first_byte (input, re_string_cur_idx (input)))
1756     {
1757       token->type = CHARACTER;
1758       token->mb_partial = 1;
1759       return 1;
1760     }
1761 #endif
1762   if (c == '\\')
1763     {
1764       unsigned char c2;
1765       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1766         {
1767           token->type = BACK_SLASH;
1768           return 1;
1769         }
1770
1771       c2 = re_string_peek_byte_case (input, 1);
1772       token->opr.c = c2;
1773       token->type = CHARACTER;
1774 #ifdef RE_ENABLE_I18N
1775       if (input->mb_cur_max > 1)
1776         {
1777           wint_t wc = re_string_wchar_at (input,
1778                                           re_string_cur_idx (input) + 1);
1779           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1780         }
1781       else
1782 #endif
1783         token->word_char = IS_WORD_CHAR (c2) != 0;
1784
1785       switch (c2)
1786         {
1787         case '|':
1788           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1789             token->type = OP_ALT;
1790           break;
1791         case '1': case '2': case '3': case '4': case '5':
1792         case '6': case '7': case '8': case '9':
1793           if (!(syntax & RE_NO_BK_REFS))
1794             {
1795               token->type = OP_BACK_REF;
1796               token->opr.idx = c2 - '1';
1797             }
1798           break;
1799         case '<':
1800           if (!(syntax & RE_NO_GNU_OPS))
1801             {
1802               token->type = ANCHOR;
1803               token->opr.ctx_type = WORD_FIRST;
1804             }
1805           break;
1806         case '>':
1807           if (!(syntax & RE_NO_GNU_OPS))
1808             {
1809               token->type = ANCHOR;
1810               token->opr.ctx_type = WORD_LAST;
1811             }
1812           break;
1813         case 'b':
1814           if (!(syntax & RE_NO_GNU_OPS))
1815             {
1816               token->type = ANCHOR;
1817               token->opr.ctx_type = WORD_DELIM;
1818             }
1819           break;
1820         case 'B':
1821           if (!(syntax & RE_NO_GNU_OPS))
1822             {
1823               token->type = ANCHOR;
1824               token->opr.ctx_type = NOT_WORD_DELIM;
1825             }
1826           break;
1827         case 'w':
1828           if (!(syntax & RE_NO_GNU_OPS))
1829             token->type = OP_WORD;
1830           break;
1831         case 'W':
1832           if (!(syntax & RE_NO_GNU_OPS))
1833             token->type = OP_NOTWORD;
1834           break;
1835         case 's':
1836           if (!(syntax & RE_NO_GNU_OPS))
1837             token->type = OP_SPACE;
1838           break;
1839         case 'S':
1840           if (!(syntax & RE_NO_GNU_OPS))
1841             token->type = OP_NOTSPACE;
1842           break;
1843         case '`':
1844           if (!(syntax & RE_NO_GNU_OPS))
1845             {
1846               token->type = ANCHOR;
1847               token->opr.ctx_type = BUF_FIRST;
1848             }
1849           break;
1850         case '\'':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             {
1853               token->type = ANCHOR;
1854               token->opr.ctx_type = BUF_LAST;
1855             }
1856           break;
1857         case '(':
1858           if (!(syntax & RE_NO_BK_PARENS))
1859             token->type = OP_OPEN_SUBEXP;
1860           break;
1861         case ')':
1862           if (!(syntax & RE_NO_BK_PARENS))
1863             token->type = OP_CLOSE_SUBEXP;
1864           break;
1865         case '+':
1866           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1867             token->type = OP_DUP_PLUS;
1868           break;
1869         case '?':
1870           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1871             token->type = OP_DUP_QUESTION;
1872           break;
1873         case '{':
1874           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1875             token->type = OP_OPEN_DUP_NUM;
1876           break;
1877         case '}':
1878           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1879             token->type = OP_CLOSE_DUP_NUM;
1880           break;
1881         default:
1882           break;
1883         }
1884       return 2;
1885     }
1886
1887   token->type = CHARACTER;
1888 #ifdef RE_ENABLE_I18N
1889   if (input->mb_cur_max > 1)
1890     {
1891       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893     }
1894   else
1895 #endif
1896     token->word_char = IS_WORD_CHAR (token->opr.c);
1897
1898   switch (c)
1899     {
1900     case '\n':
1901       if (syntax & RE_NEWLINE_ALT)
1902         token->type = OP_ALT;
1903       break;
1904     case '|':
1905       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1906         token->type = OP_ALT;
1907       break;
1908     case '*':
1909       token->type = OP_DUP_ASTERISK;
1910       break;
1911     case '+':
1912       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1913         token->type = OP_DUP_PLUS;
1914       break;
1915     case '?':
1916       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1917         token->type = OP_DUP_QUESTION;
1918       break;
1919     case '{':
1920       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1921         token->type = OP_OPEN_DUP_NUM;
1922       break;
1923     case '}':
1924       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1925         token->type = OP_CLOSE_DUP_NUM;
1926       break;
1927     case '(':
1928       if (syntax & RE_NO_BK_PARENS)
1929         token->type = OP_OPEN_SUBEXP;
1930       break;
1931     case ')':
1932       if (syntax & RE_NO_BK_PARENS)
1933         token->type = OP_CLOSE_SUBEXP;
1934       break;
1935     case '[':
1936       token->type = OP_OPEN_BRACKET;
1937       break;
1938     case '.':
1939       token->type = OP_PERIOD;
1940       break;
1941     case '^':
1942       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1943           re_string_cur_idx (input) != 0)
1944         {
1945           char prev = re_string_peek_byte (input, -1);
1946           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1947             break;
1948         }
1949       token->type = ANCHOR;
1950       token->opr.ctx_type = LINE_FIRST;
1951       break;
1952     case '$':
1953       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1954           re_string_cur_idx (input) + 1 != re_string_length (input))
1955         {
1956           re_token_t next;
1957           re_string_skip_bytes (input, 1);
1958           peek_token (&next, input, syntax);
1959           re_string_skip_bytes (input, -1);
1960           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1961             break;
1962         }
1963       token->type = ANCHOR;
1964       token->opr.ctx_type = LINE_LAST;
1965       break;
1966     default:
1967       break;
1968     }
1969   return 1;
1970 }
1971
1972 /* Peek a token from INPUT, and return the length of the token.
1973    We must not use this function out of bracket expressions.  */
1974
1975 static int
1976 internal_function
1977 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1978 {
1979   unsigned char c;
1980   if (re_string_eoi (input))
1981     {
1982       token->type = END_OF_RE;
1983       return 0;
1984     }
1985   c = re_string_peek_byte (input, 0);
1986   token->opr.c = c;
1987
1988 #ifdef RE_ENABLE_I18N
1989   if (input->mb_cur_max > 1 &&
1990       !re_string_first_byte (input, re_string_cur_idx (input)))
1991     {
1992       token->type = CHARACTER;
1993       return 1;
1994     }
1995 #endif /* RE_ENABLE_I18N */
1996
1997   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1998       && re_string_cur_idx (input) + 1 < re_string_length (input))
1999     {
2000       /* In this case, '\' escape a character.  */
2001       unsigned char c2;
2002       re_string_skip_bytes (input, 1);
2003       c2 = re_string_peek_byte (input, 0);
2004       token->opr.c = c2;
2005       token->type = CHARACTER;
2006       return 1;
2007     }
2008   if (c == '[') /* '[' is a special char in a bracket exps.  */
2009     {
2010       unsigned char c2;
2011       int token_len;
2012       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2013         c2 = re_string_peek_byte (input, 1);
2014       else
2015         c2 = 0;
2016       token->opr.c = c2;
2017       token_len = 2;
2018       switch (c2)
2019         {
2020         case '.':
2021           token->type = OP_OPEN_COLL_ELEM;
2022           break;
2023         case '=':
2024           token->type = OP_OPEN_EQUIV_CLASS;
2025           break;
2026         case ':':
2027           if (syntax & RE_CHAR_CLASSES)
2028             {
2029               token->type = OP_OPEN_CHAR_CLASS;
2030               break;
2031             }
2032           /* else fall through.  */
2033         default:
2034           token->type = CHARACTER;
2035           token->opr.c = c;
2036           token_len = 1;
2037           break;
2038         }
2039       return token_len;
2040     }
2041   switch (c)
2042     {
2043     case '-':
2044       token->type = OP_CHARSET_RANGE;
2045       break;
2046     case ']':
2047       token->type = OP_CLOSE_BRACKET;
2048       break;
2049     case '^':
2050       token->type = OP_NON_MATCH_LIST;
2051       break;
2052     default:
2053       token->type = CHARACTER;
2054     }
2055   return 1;
2056 }
2057 \f
2058 /* Functions for parser.  */
2059
2060 /* Entry point of the parser.
2061    Parse the regular expression REGEXP and return the structure tree.
2062    If an error is occured, ERR is set by error code, and return NULL.
2063    This function build the following tree, from regular expression <reg_exp>:
2064            CAT
2065            / \
2066           /   \
2067    <reg_exp>  EOR
2068
2069    CAT means concatenation.
2070    EOR means end of regular expression.  */
2071
2072 static bin_tree_t *
2073 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2074        reg_errcode_t *err)
2075 {
2076   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2077   bin_tree_t *tree, *eor, *root;
2078   re_token_t current_token;
2079   dfa->syntax = syntax;
2080   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2081   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2082   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2083     return NULL;
2084   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2085   if (tree != NULL)
2086     root = create_tree (dfa, tree, eor, CONCAT);
2087   else
2088     root = eor;
2089   if (BE (eor == NULL || root == NULL, 0))
2090     {
2091       *err = REG_ESPACE;
2092       return NULL;
2093     }
2094   return root;
2095 }
2096
2097 /* This function build the following tree, from regular expression
2098    <branch1>|<branch2>:
2099            ALT
2100            / \
2101           /   \
2102    <branch1> <branch2>
2103
2104    ALT means alternative, which represents the operator `|'.  */
2105
2106 static bin_tree_t *
2107 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2108                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2109 {
2110   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111   bin_tree_t *tree, *branch = NULL;
2112   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2113   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114     return NULL;
2115
2116   while (token->type == OP_ALT)
2117     {
2118       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2119       if (token->type != OP_ALT && token->type != END_OF_RE
2120           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2121         {
2122           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2123           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2124             return NULL;
2125         }
2126       else
2127         branch = NULL;
2128       tree = create_tree (dfa, tree, branch, OP_ALT);
2129       if (BE (tree == NULL, 0))
2130         {
2131           *err = REG_ESPACE;
2132           return NULL;
2133         }
2134     }
2135   return tree;
2136 }
2137
2138 /* This function build the following tree, from regular expression
2139    <exp1><exp2>:
2140         CAT
2141         / \
2142        /   \
2143    <exp1> <exp2>
2144
2145    CAT means concatenation.  */
2146
2147 static bin_tree_t *
2148 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2150 {
2151   bin_tree_t *tree, *exp;
2152   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2153   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2154   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155     return NULL;
2156
2157   while (token->type != OP_ALT && token->type != END_OF_RE
2158          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2159     {
2160       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2161       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2162         {
2163           return NULL;
2164         }
2165       if (tree != NULL && exp != NULL)
2166         {
2167           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2168           if (newtree == NULL)
2169             {
2170               postorder (exp, free_tree, NULL);
2171               postorder (tree, free_tree, NULL);
2172               *err = REG_ESPACE;
2173               return NULL;
2174             }
2175           tree = newtree;
2176         }
2177       else if (tree == NULL)
2178         tree = exp;
2179       /* Otherwise exp == NULL, we don't need to create new tree.  */
2180     }
2181   return tree;
2182 }
2183
2184 /* This function build the following tree, from regular expression a*:
2185          *
2186          |
2187          a
2188 */
2189
2190 static bin_tree_t *
2191 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2192                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2193 {
2194   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2195   bin_tree_t *tree;
2196   switch (token->type)
2197     {
2198     case CHARACTER:
2199       tree = create_token_tree (dfa, NULL, NULL, token);
2200       if (BE (tree == NULL, 0))
2201         {
2202           *err = REG_ESPACE;
2203           return NULL;
2204         }
2205 #ifdef RE_ENABLE_I18N
2206       if (dfa->mb_cur_max > 1)
2207         {
2208           while (!re_string_eoi (regexp)
2209                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2210             {
2211               bin_tree_t *mbc_remain;
2212               fetch_token (token, regexp, syntax);
2213               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2214               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2215               if (BE (mbc_remain == NULL || tree == NULL, 0))
2216                 {
2217                   *err = REG_ESPACE;
2218                   return NULL;
2219                 }
2220             }
2221         }
2222 #endif
2223       break;
2224     case OP_OPEN_SUBEXP:
2225       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2226       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2227         return NULL;
2228       break;
2229     case OP_OPEN_BRACKET:
2230       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2231       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2232         return NULL;
2233       break;
2234     case OP_BACK_REF:
2235       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2236         {
2237           *err = REG_ESUBREG;
2238           return NULL;
2239         }
2240       dfa->used_bkref_map |= 1 << token->opr.idx;
2241       tree = create_token_tree (dfa, NULL, NULL, token);
2242       if (BE (tree == NULL, 0))
2243         {
2244           *err = REG_ESPACE;
2245           return NULL;
2246         }
2247       ++dfa->nbackref;
2248       dfa->has_mb_node = 1;
2249       break;
2250     case OP_OPEN_DUP_NUM:
2251       if (syntax & RE_CONTEXT_INVALID_DUP)
2252         {
2253           *err = REG_BADRPT;
2254           return NULL;
2255         }
2256       /* FALLTHROUGH */
2257     case OP_DUP_ASTERISK:
2258     case OP_DUP_PLUS:
2259     case OP_DUP_QUESTION:
2260       if (syntax & RE_CONTEXT_INVALID_OPS)
2261         {
2262           *err = REG_BADRPT;
2263           return NULL;
2264         }
2265       else if (syntax & RE_CONTEXT_INDEP_OPS)
2266         {
2267           fetch_token (token, regexp, syntax);
2268           return parse_expression (regexp, preg, token, syntax, nest, err);
2269         }
2270       /* else fall through  */
2271     case OP_CLOSE_SUBEXP:
2272       if ((token->type == OP_CLOSE_SUBEXP) &&
2273           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2274         {
2275           *err = REG_ERPAREN;
2276           return NULL;
2277         }
2278       /* else fall through  */
2279     case OP_CLOSE_DUP_NUM:
2280       /* We treat it as a normal character.  */
2281
2282       /* Then we can these characters as normal characters.  */
2283       token->type = CHARACTER;
2284       /* mb_partial and word_char bits should be initialized already
2285          by peek_token.  */
2286       tree = create_token_tree (dfa, NULL, NULL, token);
2287       if (BE (tree == NULL, 0))
2288         {
2289           *err = REG_ESPACE;
2290           return NULL;
2291         }
2292       break;
2293     case ANCHOR:
2294       if ((token->opr.ctx_type
2295            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2296           && dfa->word_ops_used == 0)
2297         init_word_char (dfa);
2298       if (token->opr.ctx_type == WORD_DELIM
2299           || token->opr.ctx_type == NOT_WORD_DELIM)
2300         {
2301           bin_tree_t *tree_first, *tree_last;
2302           if (token->opr.ctx_type == WORD_DELIM)
2303             {
2304               token->opr.ctx_type = WORD_FIRST;
2305               tree_first = create_token_tree (dfa, NULL, NULL, token);
2306               token->opr.ctx_type = WORD_LAST;
2307             }
2308           else
2309             {
2310               token->opr.ctx_type = INSIDE_WORD;
2311               tree_first = create_token_tree (dfa, NULL, NULL, token);
2312               token->opr.ctx_type = INSIDE_NOTWORD;
2313             }
2314           tree_last = create_token_tree (dfa, NULL, NULL, token);
2315           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2316           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2317             {
2318               *err = REG_ESPACE;
2319               return NULL;
2320             }
2321         }
2322       else
2323         {
2324           tree = create_token_tree (dfa, NULL, NULL, token);
2325           if (BE (tree == NULL, 0))
2326             {
2327               *err = REG_ESPACE;
2328               return NULL;
2329             }
2330         }
2331       /* We must return here, since ANCHORs can't be followed
2332          by repetition operators.
2333          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2334              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2335       fetch_token (token, regexp, syntax);
2336       return tree;
2337     case OP_PERIOD:
2338       tree = create_token_tree (dfa, NULL, NULL, token);
2339       if (BE (tree == NULL, 0))
2340         {
2341           *err = REG_ESPACE;
2342           return NULL;
2343         }
2344       if (dfa->mb_cur_max > 1)
2345         dfa->has_mb_node = 1;
2346       break;
2347     case OP_WORD:
2348     case OP_NOTWORD:
2349       tree = build_charclass_op (dfa, regexp->trans,
2350                                  (const unsigned char *) "alnum",
2351                                  (const unsigned char *) "_",
2352                                  token->type == OP_NOTWORD, err);
2353       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2354         return NULL;
2355       break;
2356     case OP_SPACE:
2357     case OP_NOTSPACE:
2358       tree = build_charclass_op (dfa, regexp->trans,
2359                                  (const unsigned char *) "space",
2360                                  (const unsigned char *) "",
2361                                  token->type == OP_NOTSPACE, err);
2362       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2363         return NULL;
2364       break;
2365     case OP_ALT:
2366     case END_OF_RE:
2367       return NULL;
2368     case BACK_SLASH:
2369       *err = REG_EESCAPE;
2370       return NULL;
2371     default:
2372       /* Must not happen?  */
2373 #ifdef DEBUG
2374       assert (0);
2375 #endif
2376       return NULL;
2377     }
2378   fetch_token (token, regexp, syntax);
2379
2380   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2381          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2382     {
2383       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2384       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385         return NULL;
2386       /* In BRE consecutive duplications are not allowed.  */
2387       if ((syntax & RE_CONTEXT_INVALID_DUP)
2388           && (token->type == OP_DUP_ASTERISK
2389               || token->type == OP_OPEN_DUP_NUM))
2390         {
2391           *err = REG_BADRPT;
2392           return NULL;
2393         }
2394     }
2395
2396   return tree;
2397 }
2398
2399 /* This function build the following tree, from regular expression
2400    (<reg_exp>):
2401          SUBEXP
2402             |
2403         <reg_exp>
2404 */
2405
2406 static bin_tree_t *
2407 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2408                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2409 {
2410   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2411   bin_tree_t *tree;
2412   size_t cur_nsub;
2413   cur_nsub = preg->re_nsub++;
2414
2415   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2416
2417   /* The subexpression may be a null string.  */
2418   if (token->type == OP_CLOSE_SUBEXP)
2419     tree = NULL;
2420   else
2421     {
2422       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2423       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2424         {
2425           if (tree != NULL)
2426             postorder (tree, free_tree, NULL);
2427           *err = REG_EPAREN;
2428         }
2429       if (BE (*err != REG_NOERROR, 0))
2430         return NULL;
2431     }
2432
2433   if (cur_nsub <= '9' - '1')
2434     dfa->completed_bkref_map |= 1 << cur_nsub;
2435
2436   tree = create_tree (dfa, tree, NULL, SUBEXP);
2437   if (BE (tree == NULL, 0))
2438     {
2439       *err = REG_ESPACE;
2440       return NULL;
2441     }
2442   tree->token.opr.idx = cur_nsub;
2443   return tree;
2444 }
2445
2446 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2447
2448 static bin_tree_t *
2449 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2450               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2451 {
2452   bin_tree_t *tree = NULL, *old_tree = NULL;
2453   int i, start, end, start_idx = re_string_cur_idx (regexp);
2454   re_token_t start_token = *token;
2455
2456   if (token->type == OP_OPEN_DUP_NUM)
2457     {
2458       end = 0;
2459       start = fetch_number (regexp, token, syntax);
2460       if (start == -1)
2461         {
2462           if (token->type == CHARACTER && token->opr.c == ',')
2463             start = 0; /* We treat "{,m}" as "{0,m}".  */
2464           else
2465             {
2466               *err = REG_BADBR; /* <re>{} is invalid.  */
2467               return NULL;
2468             }
2469         }
2470       if (BE (start != -2, 1))
2471         {
2472           /* We treat "{n}" as "{n,n}".  */
2473           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2474                  : ((token->type == CHARACTER && token->opr.c == ',')
2475                     ? fetch_number (regexp, token, syntax) : -2));
2476         }
2477       if (BE (start == -2 || end == -2, 0))
2478         {
2479           /* Invalid sequence.  */
2480           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481             {
2482               if (token->type == END_OF_RE)
2483                 *err = REG_EBRACE;
2484               else
2485                 *err = REG_BADBR;
2486
2487               return NULL;
2488             }
2489
2490           /* If the syntax bit is set, rollback.  */
2491           re_string_set_index (regexp, start_idx);
2492           *token = start_token;
2493           token->type = CHARACTER;
2494           /* mb_partial and word_char bits should be already initialized by
2495              peek_token.  */
2496           return elem;
2497         }
2498
2499       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2500         {
2501           /* First number greater than second.  */
2502           *err = REG_BADBR;
2503           return NULL;
2504         }
2505     }
2506   else
2507     {
2508       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2509       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2510     }
2511
2512   fetch_token (token, regexp, syntax);
2513
2514   if (BE (elem == NULL, 0))
2515     return NULL;
2516   if (BE (start == 0 && end == 0, 0))
2517     {
2518       postorder (elem, free_tree, NULL);
2519       return NULL;
2520     }
2521
2522   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2523   if (BE (start > 0, 0))
2524     {
2525       tree = elem;
2526       for (i = 2; i <= start; ++i)
2527         {
2528           elem = duplicate_tree (elem, dfa);
2529           tree = create_tree (dfa, tree, elem, CONCAT);
2530           if (BE (elem == NULL || tree == NULL, 0))
2531             goto parse_dup_op_espace;
2532         }
2533
2534       if (start == end)
2535         return tree;
2536
2537       /* Duplicate ELEM before it is marked optional.  */
2538       elem = duplicate_tree (elem, dfa);
2539       old_tree = tree;
2540     }
2541   else
2542     old_tree = NULL;
2543
2544   if (elem->token.type == SUBEXP)
2545     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2546
2547   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2548   if (BE (tree == NULL, 0))
2549     goto parse_dup_op_espace;
2550
2551   /* This loop is actually executed only when end != -1,
2552      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2553      already created the start+1-th copy.  */
2554   for (i = start + 2; i <= end; ++i)
2555     {
2556       elem = duplicate_tree (elem, dfa);
2557       tree = create_tree (dfa, tree, elem, CONCAT);
2558       if (BE (elem == NULL || tree == NULL, 0))
2559         goto parse_dup_op_espace;
2560
2561       tree = create_tree (dfa, tree, NULL, OP_ALT);
2562       if (BE (tree == NULL, 0))
2563         goto parse_dup_op_espace;
2564     }
2565
2566   if (old_tree)
2567     tree = create_tree (dfa, old_tree, tree, CONCAT);
2568
2569   return tree;
2570
2571  parse_dup_op_espace:
2572   *err = REG_ESPACE;
2573   return NULL;
2574 }
2575
2576 /* Size of the names for collating symbol/equivalence_class/character_class.
2577    I'm not sure, but maybe enough.  */
2578 #define BRACKET_NAME_BUF_SIZE 32
2579
2580 #ifndef _LIBC
2581   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2582      Build the range expression which starts from START_ELEM, and ends
2583      at END_ELEM.  The result are written to MBCSET and SBCSET.
2584      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2585      mbcset->range_ends, is a pointer argument sinse we may
2586      update it.  */
2587
2588 static reg_errcode_t
2589 internal_function
2590 # ifdef RE_ENABLE_I18N
2591 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2592                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2593 # else /* not RE_ENABLE_I18N */
2594 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2595                  bracket_elem_t *end_elem)
2596 # endif /* not RE_ENABLE_I18N */
2597 {
2598   unsigned int start_ch, end_ch;
2599   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2600   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2601           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2602           0))
2603     return REG_ERANGE;
2604
2605   /* We can handle no multi character collating elements without libc
2606      support.  */
2607   if (BE ((start_elem->type == COLL_SYM
2608            && strlen ((char *) start_elem->opr.name) > 1)
2609           || (end_elem->type == COLL_SYM
2610               && strlen ((char *) end_elem->opr.name) > 1), 0))
2611     return REG_ECOLLATE;
2612
2613 # ifdef RE_ENABLE_I18N
2614   {
2615     wchar_t wc;
2616     wint_t start_wc;
2617     wint_t end_wc;
2618     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2619
2620     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2621                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2622                    : 0));
2623     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2624               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2625                  : 0));
2626     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2627                 ? __btowc (start_ch) : start_elem->opr.wch);
2628     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2629               ? __btowc (end_ch) : end_elem->opr.wch);
2630     if (start_wc == WEOF || end_wc == WEOF)
2631       return REG_ECOLLATE;
2632     cmp_buf[0] = start_wc;
2633     cmp_buf[4] = end_wc;
2634     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2635       return REG_ERANGE;
2636
2637     /* Got valid collation sequence values, add them as a new entry.
2638        However, for !_LIBC we have no collation elements: if the
2639        character set is single byte, the single byte character set
2640        that we build below suffices.  parse_bracket_exp passes
2641        no MBCSET if dfa->mb_cur_max == 1.  */
2642     if (mbcset)
2643       {
2644         /* Check the space of the arrays.  */
2645         if (BE (*range_alloc == mbcset->nranges, 0))
2646           {
2647             /* There is not enough space, need realloc.  */
2648             wchar_t *new_array_start, *new_array_end;
2649             int new_nranges;
2650
2651             /* +1 in case of mbcset->nranges is 0.  */
2652             new_nranges = 2 * mbcset->nranges + 1;
2653             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2654                are NULL if *range_alloc == 0.  */
2655             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2656                                           new_nranges);
2657             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2658                                         new_nranges);
2659
2660             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2661               return REG_ESPACE;
2662
2663             mbcset->range_starts = new_array_start;
2664             mbcset->range_ends = new_array_end;
2665             *range_alloc = new_nranges;
2666           }
2667
2668         mbcset->range_starts[mbcset->nranges] = start_wc;
2669         mbcset->range_ends[mbcset->nranges++] = end_wc;
2670       }
2671
2672     /* Build the table for single byte characters.  */
2673     for (wc = 0; wc < SBC_MAX; ++wc)
2674       {
2675         cmp_buf[2] = wc;
2676         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2677             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2678           bitset_set (sbcset, wc);
2679       }
2680   }
2681 # else /* not RE_ENABLE_I18N */
2682   {
2683     unsigned int ch;
2684     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2685                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2686                    : 0));
2687     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2688               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2689                  : 0));
2690     if (start_ch > end_ch)
2691       return REG_ERANGE;
2692     /* Build the table for single byte characters.  */
2693     for (ch = 0; ch < SBC_MAX; ++ch)
2694       if (start_ch <= ch  && ch <= end_ch)
2695         bitset_set (sbcset, ch);
2696   }
2697 # endif /* not RE_ENABLE_I18N */
2698   return REG_NOERROR;
2699 }
2700 #endif /* not _LIBC */
2701
2702 #ifndef _LIBC
2703 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2704    Build the collating element which is represented by NAME.
2705    The result are written to MBCSET and SBCSET.
2706    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2707    pointer argument since we may update it.  */
2708
2709 static reg_errcode_t
2710 internal_function
2711 # ifdef RE_ENABLE_I18N
2712 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2713                         int *coll_sym_alloc, const unsigned char *name)
2714 # else /* not RE_ENABLE_I18N */
2715 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2716 # endif /* not RE_ENABLE_I18N */
2717 {
2718   size_t name_len = strlen ((const char *) name);
2719   if (BE (name_len != 1, 0))
2720     return REG_ECOLLATE;
2721   else
2722     {
2723       bitset_set (sbcset, name[0]);
2724       return REG_NOERROR;
2725     }
2726 }
2727 #endif /* not _LIBC */
2728
2729 /* This function parse bracket expression like "[abc]", "[a-c]",
2730    "[[.a-a.]]" etc.  */
2731
2732 static bin_tree_t *
2733 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2734                    reg_syntax_t syntax, reg_errcode_t *err)
2735 {
2736 #ifdef _LIBC
2737   const unsigned char *collseqmb;
2738   const char *collseqwc;
2739   uint32_t nrules;
2740   int32_t table_size;
2741   const int32_t *symb_table;
2742   const unsigned char *extra;
2743
2744   /* Local function for parse_bracket_exp used in _LIBC environement.
2745      Seek the collating symbol entry correspondings to NAME.
2746      Return the index of the symbol in the SYMB_TABLE.  */
2747
2748   auto inline int32_t
2749   __attribute ((always_inline))
2750   seek_collating_symbol_entry (name, name_len)
2751          const unsigned char *name;
2752          size_t name_len;
2753     {
2754       int32_t hash = elem_hash ((const char *) name, name_len);
2755       int32_t elem = hash % table_size;
2756       if (symb_table[2 * elem] != 0)
2757         {
2758           int32_t second = hash % (table_size - 2) + 1;
2759
2760           do
2761             {
2762               /* First compare the hashing value.  */
2763               if (symb_table[2 * elem] == hash
2764                   /* Compare the length of the name.  */
2765                   && name_len == extra[symb_table[2 * elem + 1]]
2766                   /* Compare the name.  */
2767                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2768                              name_len) == 0)
2769                 {
2770                   /* Yep, this is the entry.  */
2771                   break;
2772                 }
2773
2774               /* Next entry.  */
2775               elem += second;
2776             }
2777           while (symb_table[2 * elem] != 0);
2778         }
2779       return elem;
2780     }
2781
2782   /* Local function for parse_bracket_exp used in _LIBC environment.
2783      Look up the collation sequence value of BR_ELEM.
2784      Return the value if succeeded, UINT_MAX otherwise.  */
2785
2786   auto inline unsigned int
2787   __attribute ((always_inline))
2788   lookup_collation_sequence_value (br_elem)
2789          bracket_elem_t *br_elem;
2790     {
2791       if (br_elem->type == SB_CHAR)
2792         {
2793           /*
2794           if (MB_CUR_MAX == 1)
2795           */
2796           if (nrules == 0)
2797             return collseqmb[br_elem->opr.ch];
2798           else
2799             {
2800               wint_t wc = __btowc (br_elem->opr.ch);
2801               return __collseq_table_lookup (collseqwc, wc);
2802             }
2803         }
2804       else if (br_elem->type == MB_CHAR)
2805         {
2806           if (nrules != 0)
2807             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2808         }
2809       else if (br_elem->type == COLL_SYM)
2810         {
2811           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2812           if (nrules != 0)
2813             {
2814               int32_t elem, idx;
2815               elem = seek_collating_symbol_entry (br_elem->opr.name,
2816                                                   sym_name_len);
2817               if (symb_table[2 * elem] != 0)
2818                 {
2819                   /* We found the entry.  */
2820                   idx = symb_table[2 * elem + 1];
2821                   /* Skip the name of collating element name.  */
2822                   idx += 1 + extra[idx];
2823                   /* Skip the byte sequence of the collating element.  */
2824                   idx += 1 + extra[idx];
2825                   /* Adjust for the alignment.  */
2826                   idx = (idx + 3) & ~3;
2827                   /* Skip the multibyte collation sequence value.  */
2828                   idx += sizeof (unsigned int);
2829                   /* Skip the wide char sequence of the collating element.  */
2830                   idx += sizeof (unsigned int) *
2831                     (1 + *(unsigned int *) (extra + idx));
2832                   /* Return the collation sequence value.  */
2833                   return *(unsigned int *) (extra + idx);
2834                 }
2835               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2836                 {
2837                   /* No valid character.  Match it as a single byte
2838                      character.  */
2839                   return collseqmb[br_elem->opr.name[0]];
2840                 }
2841             }
2842           else if (sym_name_len == 1)
2843             return collseqmb[br_elem->opr.name[0]];
2844         }
2845       return UINT_MAX;
2846     }
2847
2848   /* Local function for parse_bracket_exp used in _LIBC environement.
2849      Build the range expression which starts from START_ELEM, and ends
2850      at END_ELEM.  The result are written to MBCSET and SBCSET.
2851      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2852      mbcset->range_ends, is a pointer argument sinse we may
2853      update it.  */
2854
2855   auto inline reg_errcode_t
2856   __attribute ((always_inline))
2857   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2858          re_charset_t *mbcset;
2859          int *range_alloc;
2860          bitset_t sbcset;
2861          bracket_elem_t *start_elem, *end_elem;
2862     {
2863       unsigned int ch;
2864       uint32_t start_collseq;
2865       uint32_t end_collseq;
2866
2867       /* Equivalence Classes and Character Classes can't be a range
2868          start/end.  */
2869       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2870               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2871               0))
2872         return REG_ERANGE;
2873
2874       start_collseq = lookup_collation_sequence_value (start_elem);
2875       end_collseq = lookup_collation_sequence_value (end_elem);
2876       /* Check start/end collation sequence values.  */
2877       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2878         return REG_ECOLLATE;
2879       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2880         return REG_ERANGE;
2881
2882       /* Got valid collation sequence values, add them as a new entry.
2883          However, if we have no collation elements, and the character set
2884          is single byte, the single byte character set that we
2885          build below suffices. */
2886       if (nrules > 0 || dfa->mb_cur_max > 1)
2887         {
2888           /* Check the space of the arrays.  */
2889           if (BE (*range_alloc == mbcset->nranges, 0))
2890             {
2891               /* There is not enough space, need realloc.  */
2892               uint32_t *new_array_start;
2893               uint32_t *new_array_end;
2894               int new_nranges;
2895
2896               /* +1 in case of mbcset->nranges is 0.  */
2897               new_nranges = 2 * mbcset->nranges + 1;
2898               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2899                                             new_nranges);
2900               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2901                                           new_nranges);
2902
2903               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2904                 return REG_ESPACE;
2905
2906               mbcset->range_starts = new_array_start;
2907               mbcset->range_ends = new_array_end;
2908               *range_alloc = new_nranges;
2909             }
2910
2911           mbcset->range_starts[mbcset->nranges] = start_collseq;
2912           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2913         }
2914
2915       /* Build the table for single byte characters.  */
2916       for (ch = 0; ch < SBC_MAX; ch++)
2917         {
2918           uint32_t ch_collseq;
2919           /*
2920           if (MB_CUR_MAX == 1)
2921           */
2922           if (nrules == 0)
2923             ch_collseq = collseqmb[ch];
2924           else
2925             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2926           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2927             bitset_set (sbcset, ch);
2928         }
2929       return REG_NOERROR;
2930     }
2931
2932   /* Local function for parse_bracket_exp used in _LIBC environement.
2933      Build the collating element which is represented by NAME.
2934      The result are written to MBCSET and SBCSET.
2935      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2936      pointer argument sinse we may update it.  */
2937
2938   auto inline reg_errcode_t
2939   __attribute ((always_inline))
2940   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2941          re_charset_t *mbcset;
2942          int *coll_sym_alloc;
2943          bitset_t sbcset;
2944          const unsigned char *name;
2945     {
2946       int32_t elem, idx;
2947       size_t name_len = strlen ((const char *) name);
2948       if (nrules != 0)
2949         {
2950           elem = seek_collating_symbol_entry (name, name_len);
2951           if (symb_table[2 * elem] != 0)
2952             {
2953               /* We found the entry.  */
2954               idx = symb_table[2 * elem + 1];
2955               /* Skip the name of collating element name.  */
2956               idx += 1 + extra[idx];
2957             }
2958           else if (symb_table[2 * elem] == 0 && name_len == 1)
2959             {
2960               /* No valid character, treat it as a normal
2961                  character.  */
2962               bitset_set (sbcset, name[0]);
2963               return REG_NOERROR;
2964             }
2965           else
2966             return REG_ECOLLATE;
2967
2968           /* Got valid collation sequence, add it as a new entry.  */
2969           /* Check the space of the arrays.  */
2970           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2971             {
2972               /* Not enough, realloc it.  */
2973               /* +1 in case of mbcset->ncoll_syms is 0.  */
2974               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2975               /* Use realloc since mbcset->coll_syms is NULL
2976                  if *alloc == 0.  */
2977               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2978                                                    new_coll_sym_alloc);
2979               if (BE (new_coll_syms == NULL, 0))
2980                 return REG_ESPACE;
2981               mbcset->coll_syms = new_coll_syms;
2982               *coll_sym_alloc = new_coll_sym_alloc;
2983             }
2984           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2985           return REG_NOERROR;
2986         }
2987       else
2988         {
2989           if (BE (name_len != 1, 0))
2990             return REG_ECOLLATE;
2991           else
2992             {
2993               bitset_set (sbcset, name[0]);
2994               return REG_NOERROR;
2995             }
2996         }
2997     }
2998 #endif
2999
3000   re_token_t br_token;
3001   re_bitset_ptr_t sbcset;
3002 #ifdef RE_ENABLE_I18N
3003   re_charset_t *mbcset;
3004   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3005   int equiv_class_alloc = 0, char_class_alloc = 0;
3006 #endif /* not RE_ENABLE_I18N */
3007   int non_match = 0;
3008   bin_tree_t *work_tree;
3009   int token_len;
3010   int first_round = 1;
3011 #ifdef _LIBC
3012   collseqmb = (const unsigned char *)
3013     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3014   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3015   if (nrules)
3016     {
3017       /*
3018       if (MB_CUR_MAX > 1)
3019       */
3020       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3021       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3022       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3023                                                   _NL_COLLATE_SYMB_TABLEMB);
3024       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3025                                                    _NL_COLLATE_SYMB_EXTRAMB);
3026     }
3027 #endif
3028   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3029 #ifdef RE_ENABLE_I18N
3030   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3031 #endif /* RE_ENABLE_I18N */
3032 #ifdef RE_ENABLE_I18N
3033   if (BE (sbcset == NULL || mbcset == NULL, 0))
3034 #else
3035   if (BE (sbcset == NULL, 0))
3036 #endif /* RE_ENABLE_I18N */
3037     {
3038       re_free (sbcset);
3039 #ifdef RE_ENABLE_I18N
3040       re_free (mbcset);
3041 #endif
3042       *err = REG_ESPACE;
3043       return NULL;
3044     }
3045
3046   token_len = peek_token_bracket (token, regexp, syntax);
3047   if (BE (token->type == END_OF_RE, 0))
3048     {
3049       *err = REG_BADPAT;
3050       goto parse_bracket_exp_free_return;
3051     }
3052   if (token->type == OP_NON_MATCH_LIST)
3053     {
3054 #ifdef RE_ENABLE_I18N
3055       mbcset->non_match = 1;
3056 #endif /* not RE_ENABLE_I18N */
3057       non_match = 1;
3058       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3059         bitset_set (sbcset, '\n');
3060       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3061       token_len = peek_token_bracket (token, regexp, syntax);
3062       if (BE (token->type == END_OF_RE, 0))
3063         {
3064           *err = REG_BADPAT;
3065           goto parse_bracket_exp_free_return;
3066         }
3067     }
3068
3069   /* We treat the first ']' as a normal character.  */
3070   if (token->type == OP_CLOSE_BRACKET)
3071     token->type = CHARACTER;
3072
3073   while (1)
3074     {
3075       bracket_elem_t start_elem, end_elem;
3076       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3077       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3078       reg_errcode_t ret;
3079       int token_len2 = 0, is_range_exp = 0;
3080       re_token_t token2;
3081
3082       start_elem.opr.name = start_name_buf;
3083       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3084                                    syntax, first_round);
3085       if (BE (ret != REG_NOERROR, 0))
3086         {
3087           *err = ret;
3088           goto parse_bracket_exp_free_return;
3089         }
3090       first_round = 0;
3091
3092       /* Get information about the next token.  We need it in any case.  */
3093       token_len = peek_token_bracket (token, regexp, syntax);
3094
3095       /* Do not check for ranges if we know they are not allowed.  */
3096       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3097         {
3098           if (BE (token->type == END_OF_RE, 0))
3099             {
3100               *err = REG_EBRACK;
3101               goto parse_bracket_exp_free_return;
3102             }
3103           if (token->type == OP_CHARSET_RANGE)
3104             {
3105               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3106               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3107               if (BE (token2.type == END_OF_RE, 0))
3108                 {
3109                   *err = REG_EBRACK;
3110                   goto parse_bracket_exp_free_return;
3111                 }
3112               if (token2.type == OP_CLOSE_BRACKET)
3113                 {
3114                   /* We treat the last '-' as a normal character.  */
3115                   re_string_skip_bytes (regexp, -token_len);
3116                   token->type = CHARACTER;
3117                 }
3118               else
3119                 is_range_exp = 1;
3120             }
3121         }
3122
3123       if (is_range_exp == 1)
3124         {
3125           end_elem.opr.name = end_name_buf;
3126           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3127                                        dfa, syntax, 1);
3128           if (BE (ret != REG_NOERROR, 0))
3129             {
3130               *err = ret;
3131               goto parse_bracket_exp_free_return;
3132             }
3133
3134           token_len = peek_token_bracket (token, regexp, syntax);
3135
3136 #ifdef _LIBC
3137           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3138                                   &start_elem, &end_elem);
3139 #else
3140 # ifdef RE_ENABLE_I18N
3141           *err = build_range_exp (sbcset,
3142                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3143                                   &range_alloc, &start_elem, &end_elem);
3144 # else
3145           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3146 # endif
3147 #endif /* RE_ENABLE_I18N */
3148           if (BE (*err != REG_NOERROR, 0))
3149             goto parse_bracket_exp_free_return;
3150         }
3151       else
3152         {
3153           switch (start_elem.type)
3154             {
3155             case SB_CHAR:
3156               bitset_set (sbcset, start_elem.opr.ch);
3157               break;
3158 #ifdef RE_ENABLE_I18N
3159             case MB_CHAR:
3160               /* Check whether the array has enough space.  */
3161               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3162                 {
3163                   wchar_t *new_mbchars;
3164                   /* Not enough, realloc it.  */
3165                   /* +1 in case of mbcset->nmbchars is 0.  */
3166                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3167                   /* Use realloc since array is NULL if *alloc == 0.  */
3168                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3169                                             mbchar_alloc);
3170                   if (BE (new_mbchars == NULL, 0))
3171                     goto parse_bracket_exp_espace;
3172                   mbcset->mbchars = new_mbchars;
3173                 }
3174               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3175               break;
3176 #endif /* RE_ENABLE_I18N */
3177             case EQUIV_CLASS:
3178               *err = build_equiv_class (sbcset,
3179 #ifdef RE_ENABLE_I18N
3180                                         mbcset, &equiv_class_alloc,
3181 #endif /* RE_ENABLE_I18N */
3182                                         start_elem.opr.name);
3183               if (BE (*err != REG_NOERROR, 0))
3184                 goto parse_bracket_exp_free_return;
3185               break;
3186             case COLL_SYM:
3187               *err = build_collating_symbol (sbcset,
3188 #ifdef RE_ENABLE_I18N
3189                                              mbcset, &coll_sym_alloc,
3190 #endif /* RE_ENABLE_I18N */
3191                                              start_elem.opr.name);
3192               if (BE (*err != REG_NOERROR, 0))
3193                 goto parse_bracket_exp_free_return;
3194               break;
3195             case CHAR_CLASS:
3196               *err = build_charclass (regexp->trans, sbcset,
3197 #ifdef RE_ENABLE_I18N
3198                                       mbcset, &char_class_alloc,
3199 #endif /* RE_ENABLE_I18N */
3200                                       start_elem.opr.name, syntax);
3201               if (BE (*err != REG_NOERROR, 0))
3202                goto parse_bracket_exp_free_return;
3203               break;
3204             default:
3205               assert (0);
3206               break;
3207             }
3208         }
3209       if (BE (token->type == END_OF_RE, 0))
3210         {
3211           *err = REG_EBRACK;
3212           goto parse_bracket_exp_free_return;
3213         }
3214       if (token->type == OP_CLOSE_BRACKET)
3215         break;
3216     }
3217
3218   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3219
3220   /* If it is non-matching list.  */
3221   if (non_match)
3222     bitset_not (sbcset);
3223
3224 #ifdef RE_ENABLE_I18N
3225   /* Ensure only single byte characters are set.  */
3226   if (dfa->mb_cur_max > 1)
3227     bitset_mask (sbcset, dfa->sb_char);
3228
3229   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3230       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3231                                                      || mbcset->non_match)))
3232     {
3233       bin_tree_t *mbc_tree;
3234       int sbc_idx;
3235       /* Build a tree for complex bracket.  */
3236       dfa->has_mb_node = 1;
3237       br_token.type = COMPLEX_BRACKET;
3238       br_token.opr.mbcset = mbcset;
3239       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3240       if (BE (mbc_tree == NULL, 0))
3241         goto parse_bracket_exp_espace;
3242       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3243         if (sbcset[sbc_idx])
3244           break;
3245       /* If there are no bits set in sbcset, there is no point
3246          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3247       if (sbc_idx < BITSET_WORDS)
3248         {
3249           /* Build a tree for simple bracket.  */
3250           br_token.type = SIMPLE_BRACKET;
3251           br_token.opr.sbcset = sbcset;
3252           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3253           if (BE (work_tree == NULL, 0))
3254             goto parse_bracket_exp_espace;
3255
3256           /* Then join them by ALT node.  */
3257           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3258           if (BE (work_tree == NULL, 0))
3259             goto parse_bracket_exp_espace;
3260         }
3261       else
3262         {
3263           re_free (sbcset);
3264           work_tree = mbc_tree;
3265         }
3266     }
3267   else
3268 #endif /* not RE_ENABLE_I18N */
3269     {
3270 #ifdef RE_ENABLE_I18N
3271       free_charset (mbcset);
3272 #endif
3273       /* Build a tree for simple bracket.  */
3274       br_token.type = SIMPLE_BRACKET;
3275       br_token.opr.sbcset = sbcset;
3276       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277       if (BE (work_tree == NULL, 0))
3278         goto parse_bracket_exp_espace;
3279     }
3280   return work_tree;
3281
3282  parse_bracket_exp_espace:
3283   *err = REG_ESPACE;
3284  parse_bracket_exp_free_return:
3285   re_free (sbcset);
3286 #ifdef RE_ENABLE_I18N
3287   free_charset (mbcset);
3288 #endif /* RE_ENABLE_I18N */
3289   return NULL;
3290 }
3291
3292 /* Parse an element in the bracket expression.  */
3293
3294 static reg_errcode_t
3295 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3296                        re_token_t *token, int token_len, re_dfa_t *dfa,
3297                        reg_syntax_t syntax, int accept_hyphen)
3298 {
3299 #ifdef RE_ENABLE_I18N
3300   int cur_char_size;
3301   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3302   if (cur_char_size > 1)
3303     {
3304       elem->type = MB_CHAR;
3305       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3306       re_string_skip_bytes (regexp, cur_char_size);
3307       return REG_NOERROR;
3308     }
3309 #endif /* RE_ENABLE_I18N */
3310   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3311   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3312       || token->type == OP_OPEN_EQUIV_CLASS)
3313     return parse_bracket_symbol (elem, regexp, token);
3314   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3315     {
3316       /* A '-' must only appear as anything but a range indicator before
3317          the closing bracket.  Everything else is an error.  */
3318       re_token_t token2;
3319       (void) peek_token_bracket (&token2, regexp, syntax);
3320       if (token2.type != OP_CLOSE_BRACKET)
3321         /* The actual error value is not standardized since this whole
3322            case is undefined.  But ERANGE makes good sense.  */
3323         return REG_ERANGE;
3324     }
3325   elem->type = SB_CHAR;
3326   elem->opr.ch = token->opr.c;
3327   return REG_NOERROR;
3328 }
3329
3330 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3331    such as [:<character_class>:], [.<collating_element>.], and
3332    [=<equivalent_class>=].  */
3333
3334 static reg_errcode_t
3335 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3336                       re_token_t *token)
3337 {
3338   unsigned char ch, delim = token->opr.c;
3339   int i = 0;
3340   if (re_string_eoi(regexp))
3341     return REG_EBRACK;
3342   for (;; ++i)
3343     {
3344       if (i >= BRACKET_NAME_BUF_SIZE)
3345         return REG_EBRACK;
3346       if (token->type == OP_OPEN_CHAR_CLASS)
3347         ch = re_string_fetch_byte_case (regexp);
3348       else
3349         ch = re_string_fetch_byte (regexp);
3350       if (re_string_eoi(regexp))
3351         return REG_EBRACK;
3352       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3353         break;
3354       elem->opr.name[i] = ch;
3355     }
3356   re_string_skip_bytes (regexp, 1);
3357   elem->opr.name[i] = '\0';
3358   switch (token->type)
3359     {
3360     case OP_OPEN_COLL_ELEM:
3361       elem->type = COLL_SYM;
3362       break;
3363     case OP_OPEN_EQUIV_CLASS:
3364       elem->type = EQUIV_CLASS;
3365       break;
3366     case OP_OPEN_CHAR_CLASS:
3367       elem->type = CHAR_CLASS;
3368       break;
3369     default:
3370       break;
3371     }
3372   return REG_NOERROR;
3373 }
3374
3375   /* Helper function for parse_bracket_exp.
3376      Build the equivalence class which is represented by NAME.
3377      The result are written to MBCSET and SBCSET.
3378      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3379      is a pointer argument sinse we may update it.  */
3380
3381 static reg_errcode_t
3382 #ifdef RE_ENABLE_I18N
3383 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3384                    int *equiv_class_alloc, const unsigned char *name)
3385 #else /* not RE_ENABLE_I18N */
3386 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3387 #endif /* not RE_ENABLE_I18N */
3388 {
3389 #ifdef _LIBC
3390   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3391   if (nrules != 0)
3392     {
3393       const int32_t *table, *indirect;
3394       const unsigned char *weights, *extra, *cp;
3395       unsigned char char_buf[2];
3396       int32_t idx1, idx2;
3397       unsigned int ch;
3398       size_t len;
3399       /* This #include defines a local function!  */
3400 # include <locale/weight.h>
3401       /* Calculate the index for equivalence class.  */
3402       cp = name;
3403       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3404       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3405                                                _NL_COLLATE_WEIGHTMB);
3406       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3407                                                    _NL_COLLATE_EXTRAMB);
3408       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3409                                                 _NL_COLLATE_INDIRECTMB);
3410       idx1 = findidx (&cp);
3411       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3412         /* This isn't a valid character.  */
3413         return REG_ECOLLATE;
3414
3415       /* Build single byte matcing table for this equivalence class.  */
3416       char_buf[1] = (unsigned char) '\0';
3417       len = weights[idx1 & 0xffffff];
3418       for (ch = 0; ch < SBC_MAX; ++ch)
3419         {
3420           char_buf[0] = ch;
3421           cp = char_buf;
3422           idx2 = findidx (&cp);
3423 /*
3424           idx2 = table[ch];
3425 */
3426           if (idx2 == 0)
3427             /* This isn't a valid character.  */
3428             continue;
3429           /* Compare only if the length matches and the collation rule
3430              index is the same.  */
3431           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3432             {
3433               int cnt = 0;
3434
3435               while (cnt <= len &&
3436                      weights[(idx1 & 0xffffff) + 1 + cnt]
3437                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3438                 ++cnt;
3439
3440               if (cnt > len)
3441                 bitset_set (sbcset, ch);
3442             }
3443         }
3444       /* Check whether the array has enough space.  */
3445       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3446         {
3447           /* Not enough, realloc it.  */
3448           /* +1 in case of mbcset->nequiv_classes is 0.  */
3449           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3450           /* Use realloc since the array is NULL if *alloc == 0.  */
3451           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3452                                                    int32_t,
3453                                                    new_equiv_class_alloc);
3454           if (BE (new_equiv_classes == NULL, 0))
3455             return REG_ESPACE;
3456           mbcset->equiv_classes = new_equiv_classes;
3457           *equiv_class_alloc = new_equiv_class_alloc;
3458         }
3459       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3460     }
3461   else
3462 #endif /* _LIBC */
3463     {
3464       if (BE (strlen ((const char *) name) != 1, 0))
3465         return REG_ECOLLATE;
3466       bitset_set (sbcset, *name);
3467     }
3468   return REG_NOERROR;
3469 }
3470
3471   /* Helper function for parse_bracket_exp.
3472      Build the character class which is represented by NAME.
3473      The result are written to MBCSET and SBCSET.
3474      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3475      is a pointer argument sinse we may update it.  */
3476
3477 static reg_errcode_t
3478 #ifdef RE_ENABLE_I18N
3479 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3480                  re_charset_t *mbcset, int *char_class_alloc,
3481                  const unsigned char *class_name, reg_syntax_t syntax)
3482 #else /* not RE_ENABLE_I18N */
3483 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3484                  const unsigned char *class_name, reg_syntax_t syntax)
3485 #endif /* not RE_ENABLE_I18N */
3486 {
3487   int i;
3488   const char *name = (const char *) class_name;
3489
3490   /* In case of REG_ICASE "upper" and "lower" match the both of
3491      upper and lower cases.  */
3492   if ((syntax & RE_ICASE)
3493       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3494     name = "alpha";
3495
3496 #ifdef RE_ENABLE_I18N
3497   /* Check the space of the arrays.  */
3498   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3499     {
3500       /* Not enough, realloc it.  */
3501       /* +1 in case of mbcset->nchar_classes is 0.  */
3502       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3503       /* Use realloc since array is NULL if *alloc == 0.  */
3504       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3505                                                new_char_class_alloc);
3506       if (BE (new_char_classes == NULL, 0))
3507         return REG_ESPACE;
3508       mbcset->char_classes = new_char_classes;
3509       *char_class_alloc = new_char_class_alloc;
3510     }
3511   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3512 #endif /* RE_ENABLE_I18N */
3513
3514 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3515   do {                                          \
3516     if (BE (trans != NULL, 0))                  \
3517       {                                         \
3518         for (i = 0; i < SBC_MAX; ++i)           \
3519           if (ctype_func (i))                   \
3520             bitset_set (sbcset, trans[i]);      \
3521       }                                         \
3522     else                                        \
3523       {                                         \
3524         for (i = 0; i < SBC_MAX; ++i)           \
3525           if (ctype_func (i))                   \
3526             bitset_set (sbcset, i);             \
3527       }                                         \
3528   } while (0)
3529
3530   if (strcmp (name, "alnum") == 0)
3531     BUILD_CHARCLASS_LOOP (isalnum);
3532   else if (strcmp (name, "cntrl") == 0)
3533     BUILD_CHARCLASS_LOOP (iscntrl);
3534   else if (strcmp (name, "lower") == 0)
3535     BUILD_CHARCLASS_LOOP (islower);
3536   else if (strcmp (name, "space") == 0)
3537     BUILD_CHARCLASS_LOOP (isspace);
3538   else if (strcmp (name, "alpha") == 0)
3539     BUILD_CHARCLASS_LOOP (isalpha);
3540   else if (strcmp (name, "digit") == 0)
3541     BUILD_CHARCLASS_LOOP (isdigit);
3542   else if (strcmp (name, "print") == 0)
3543     BUILD_CHARCLASS_LOOP (isprint);
3544   else if (strcmp (name, "upper") == 0)
3545     BUILD_CHARCLASS_LOOP (isupper);
3546   else if (strcmp (name, "blank") == 0)
3547     BUILD_CHARCLASS_LOOP (isblank);
3548   else if (strcmp (name, "graph") == 0)
3549     BUILD_CHARCLASS_LOOP (isgraph);
3550   else if (strcmp (name, "punct") == 0)
3551     BUILD_CHARCLASS_LOOP (ispunct);
3552   else if (strcmp (name, "xdigit") == 0)
3553     BUILD_CHARCLASS_LOOP (isxdigit);
3554   else
3555     return REG_ECTYPE;
3556
3557   return REG_NOERROR;
3558 }
3559
3560 static bin_tree_t *
3561 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3562                     const unsigned char *class_name,
3563                     const unsigned char *extra, int non_match,
3564                     reg_errcode_t *err)
3565 {
3566   re_bitset_ptr_t sbcset;
3567 #ifdef RE_ENABLE_I18N
3568   re_charset_t *mbcset;
3569   int alloc = 0;
3570 #endif /* not RE_ENABLE_I18N */
3571   reg_errcode_t ret;
3572   re_token_t br_token;
3573   bin_tree_t *tree;
3574
3575   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3576 #ifdef RE_ENABLE_I18N
3577   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3578 #endif /* RE_ENABLE_I18N */
3579
3580 #ifdef RE_ENABLE_I18N
3581   if (BE (sbcset == NULL || mbcset == NULL, 0))
3582 #else /* not RE_ENABLE_I18N */
3583   if (BE (sbcset == NULL, 0))
3584 #endif /* not RE_ENABLE_I18N */
3585     {
3586       *err = REG_ESPACE;
3587       return NULL;
3588     }
3589
3590   if (non_match)
3591     {
3592 #ifdef RE_ENABLE_I18N
3593       mbcset->non_match = 1;
3594 #endif /* not RE_ENABLE_I18N */
3595     }
3596
3597   /* We don't care the syntax in this case.  */
3598   ret = build_charclass (trans, sbcset,
3599 #ifdef RE_ENABLE_I18N
3600                          mbcset, &alloc,
3601 #endif /* RE_ENABLE_I18N */
3602                          class_name, 0);
3603
3604   if (BE (ret != REG_NOERROR, 0))
3605     {
3606       re_free (sbcset);
3607 #ifdef RE_ENABLE_I18N
3608       free_charset (mbcset);
3609 #endif /* RE_ENABLE_I18N */
3610       *err = ret;
3611       return NULL;
3612     }
3613   /* \w match '_' also.  */
3614   for (; *extra; extra++)
3615     bitset_set (sbcset, *extra);
3616
3617   /* If it is non-matching list.  */
3618   if (non_match)
3619     bitset_not (sbcset);
3620
3621 #ifdef RE_ENABLE_I18N
3622   /* Ensure only single byte characters are set.  */
3623   if (dfa->mb_cur_max > 1)
3624     bitset_mask (sbcset, dfa->sb_char);
3625 #endif
3626
3627   /* Build a tree for simple bracket.  */
3628   br_token.type = SIMPLE_BRACKET;
3629   br_token.opr.sbcset = sbcset;
3630   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3631   if (BE (tree == NULL, 0))
3632     goto build_word_op_espace;
3633
3634 #ifdef RE_ENABLE_I18N
3635   if (dfa->mb_cur_max > 1)
3636     {
3637       bin_tree_t *mbc_tree;
3638       /* Build a tree for complex bracket.  */
3639       br_token.type = COMPLEX_BRACKET;
3640       br_token.opr.mbcset = mbcset;
3641       dfa->has_mb_node = 1;
3642       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3643       if (BE (mbc_tree == NULL, 0))
3644         goto build_word_op_espace;
3645       /* Then join them by ALT node.  */
3646       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3647       if (BE (mbc_tree != NULL, 1))
3648         return tree;
3649     }
3650   else
3651     {
3652       free_charset (mbcset);
3653       return tree;
3654     }
3655 #else /* not RE_ENABLE_I18N */
3656   return tree;
3657 #endif /* not RE_ENABLE_I18N */
3658
3659  build_word_op_espace:
3660   re_free (sbcset);
3661 #ifdef RE_ENABLE_I18N
3662   free_charset (mbcset);
3663 #endif /* RE_ENABLE_I18N */
3664   *err = REG_ESPACE;
3665   return NULL;
3666 }
3667
3668 /* This is intended for the expressions like "a{1,3}".
3669    Fetch a number from `input', and return the number.
3670    Return -1, if the number field is empty like "{,1}".
3671    Return -2, If an error is occured.  */
3672
3673 static int
3674 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3675 {
3676   int num = -1;
3677   unsigned char c;
3678   while (1)
3679     {
3680       fetch_token (token, input, syntax);
3681       c = token->opr.c;
3682       if (BE (token->type == END_OF_RE, 0))
3683         return -2;
3684       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3685         break;
3686       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3687              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3688       num = (num > RE_DUP_MAX) ? -2 : num;
3689     }
3690   return num;
3691 }
3692 \f
3693 #ifdef RE_ENABLE_I18N
3694 static void
3695 free_charset (re_charset_t *cset)
3696 {
3697   re_free (cset->mbchars);
3698 # ifdef _LIBC
3699   re_free (cset->coll_syms);
3700   re_free (cset->equiv_classes);
3701   re_free (cset->range_starts);
3702   re_free (cset->range_ends);
3703 # endif
3704   re_free (cset->char_classes);
3705   re_free (cset);
3706 }
3707 #endif /* RE_ENABLE_I18N */
3708 \f
3709 /* Functions for binary tree operation.  */
3710
3711 /* Create a tree node.  */
3712
3713 static bin_tree_t *
3714 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3715              re_token_type_t type)
3716 {
3717   re_token_t t;
3718   t.type = type;
3719   return create_token_tree (dfa, left, right, &t);
3720 }
3721
3722 static bin_tree_t *
3723 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3724                    const re_token_t *token)
3725 {
3726   bin_tree_t *tree;
3727   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3728     {
3729       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3730
3731       if (storage == NULL)
3732         return NULL;
3733       storage->next = dfa->str_tree_storage;
3734       dfa->str_tree_storage = storage;
3735       dfa->str_tree_storage_idx = 0;
3736     }
3737   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3738
3739   tree->parent = NULL;
3740   tree->left = left;
3741   tree->right = right;
3742   tree->token = *token;
3743   tree->token.duplicated = 0;
3744   tree->token.opt_subexp = 0;
3745   tree->first = NULL;
3746   tree->next = NULL;
3747   tree->node_idx = -1;
3748
3749   if (left != NULL)
3750     left->parent = tree;
3751   if (right != NULL)
3752     right->parent = tree;
3753   return tree;
3754 }
3755
3756 /* Mark the tree SRC as an optional subexpression.
3757    To be called from preorder or postorder.  */
3758
3759 static reg_errcode_t
3760 mark_opt_subexp (void *extra, bin_tree_t *node)
3761 {
3762   int idx = (int) (long) extra;
3763   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3764     node->token.opt_subexp = 1;
3765
3766   return REG_NOERROR;
3767 }
3768
3769 /* Free the allocated memory inside NODE. */
3770
3771 static void
3772 free_token (re_token_t *node)
3773 {
3774 #ifdef RE_ENABLE_I18N
3775   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3776     free_charset (node->opr.mbcset);
3777   else
3778 #endif /* RE_ENABLE_I18N */
3779     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3780       re_free (node->opr.sbcset);
3781 }
3782
3783 /* Worker function for tree walking.  Free the allocated memory inside NODE
3784    and its children. */
3785
3786 static reg_errcode_t
3787 free_tree (void *extra, bin_tree_t *node)
3788 {
3789   free_token (&node->token);
3790   return REG_NOERROR;
3791 }
3792
3793
3794 /* Duplicate the node SRC, and return new node.  This is a preorder
3795    visit similar to the one implemented by the generic visitor, but
3796    we need more infrastructure to maintain two parallel trees --- so,
3797    it's easier to duplicate.  */
3798
3799 static bin_tree_t *
3800 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3801 {
3802   const bin_tree_t *node;
3803   bin_tree_t *dup_root;
3804   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3805
3806   for (node = root; ; )
3807     {
3808       /* Create a new tree and link it back to the current parent.  */
3809       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3810       if (*p_new == NULL)
3811         return NULL;
3812       (*p_new)->parent = dup_node;
3813       (*p_new)->token.duplicated = 1;
3814       dup_node = *p_new;
3815
3816       /* Go to the left node, or up and to the right.  */
3817       if (node->left)
3818         {
3819           node = node->left;
3820           p_new = &dup_node->left;
3821         }
3822       else
3823         {
3824           const bin_tree_t *prev = NULL;
3825           while (node->right == prev || node->right == NULL)
3826             {
3827               prev = node;
3828               node = node->parent;
3829               dup_node = dup_node->parent;
3830               if (!node)
3831                 return dup_root;
3832             }
3833           node = node->right;
3834           p_new = &dup_node->right;
3835         }
3836     }
3837 }