posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          int node, int root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static int fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         int *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       int *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        int non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 const char __re_error_msgid[] attribute_hidden =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 const size_t __re_error_msgid_idx[] attribute_hidden =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 {
 219   reg_errcode_t ret;
 220
 221   /* And GNU code determines whether or not to get register information
 222      by passing null for the REGS argument to re_match, etc., not by
 223      setting no_sub, unless RE_NO_SUB is set.  */
 224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 225
 226   /* Match anchors at newline.  */
 227   bufp->newline_anchor = 1;
 228
 229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 230
 231   if (!ret)
 232     return NULL;
 233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 234 }
 235 #ifdef _LIBC
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237 #endif
 238
 239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 240    also be assigned to arbitrarily: each pattern buffer stores its own
 241    syntax, so it can be changed between regex compilations.  */
 242 /* This has no initializer because initialized variables in Emacs
 243    become read-only after dumping.  */
 244 reg_syntax_t re_syntax_options;
 245
 246
 247 /* Specify the precise syntax of regexps for compilation.  This provides
 248    for compatibility for various utilities which historically have
 249    different, incompatible syntaxes.
 250
 251    The argument SYNTAX is a bit mask comprised of the various bits
 252    defined in regex.h.  We return the old syntax.  */
 253
 254 reg_syntax_t
 255 re_set_syntax (syntax)
 256     reg_syntax_t syntax;
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (bufp)
 269     struct re_pattern_buffer *bufp;
 270 {
 271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 272   char *fastmap = bufp->fastmap;
 273
 274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 276   if (dfa->init_state != dfa->init_state_word)
 277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 278   if (dfa->init_state != dfa->init_state_nl)
 279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 280   if (dfa->init_state != dfa->init_state_begbuf)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 282   bufp->fastmap_accurate = 1;
 283   return 0;
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 287 #endif
 288
 289 static inline void
 290 __attribute ((always_inline))
 291 re_set_fastmap (char *fastmap, int icase, int ch)
 292 {
 293   fastmap[ch] = 1;
 294   if (icase)
 295     fastmap[tolower (ch)] = 1;
 296 }
 297
 298 /* Helper function for re_compile_fastmap.
 299    Compile fastmap for the initial_state INIT_STATE.  */
 300
 301 static void
 302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 303                          char *fastmap)
 304 {
 305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 306   int node_cnt;
 307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 309     {
 310       int node = init_state->nodes.elems[node_cnt];
 311       re_token_type_t type = dfa->nodes[node].type;
 312
 313       if (type == CHARACTER)
 314         {
 315           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 316 #ifdef RE_ENABLE_I18N
 317           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 318             {
 319               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 331                              &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, 0, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 354           int i;
 355
 356 # ifdef _LIBC
 357           /* See if we have to try all bytes which start multiple collation
 358              elements.
 359              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 360                   collation element, and don't catch 'b' since 'b' is
 361                   the only collation element which starts from 'b' (and
 362                   it is caught by SIMPLE_BRACKET).  */
 363               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 364                   && (cset->ncoll_syms || cset->nranges))
 365                 {
 366                   const int32_t *table = (const int32_t *)
 367                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 368                   for (i = 0; i < SBC_MAX; ++i)
 369                     if (table[i] < 0)
 370                       re_set_fastmap (fastmap, icase, i);
 371                 }
 372 # endif /* _LIBC */
 373
 374           /* See if we have to start the match at all multibyte characters,
 375              i.e. where we would not find an invalid sequence.  This only
 376              applies to multibyte character sets; for single byte character
 377              sets, the SIMPLE_BRACKET again suffices.  */
 378           if (dfa->mb_cur_max > 1
 379               && (cset->nchar_classes || cset->non_match || cset->nranges
 380 # ifdef _LIBC
 381                   || cset->nequiv_classes
 382 # endif /* _LIBC */
 383                  ))
 384             {
 385               unsigned char c = 0;
 386               do
 387                 {
 388                   mbstate_t mbs;
 389                   memset (&mbs, 0, sizeof (mbs));
 390                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 391                     re_set_fastmap (fastmap, false, (int) c);
 392                 }
 393               while (++c != 0);
 394             }
 395
 396           else
 397             {
 398               /* ... Else catch all bytes which can start the mbchars.  */
 399               for (i = 0; i < cset->nmbchars; ++i)
 400                 {
 401                   char buf[256];
 402                   mbstate_t state;
 403                   memset (&state, '\0', sizeof (state));
 404                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 405                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 406                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 407                     {
 408                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 409                           != (size_t) -1)
 410                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 411                     }
 412                 }
 413             }
 414         }
 415 #endif /* RE_ENABLE_I18N */
 416       else if (type == OP_PERIOD
 417 #ifdef RE_ENABLE_I18N
 418                || type == OP_UTF8_PERIOD
 419 #endif /* RE_ENABLE_I18N */
 420                || type == END_OF_RE)
 421         {
 422           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 423           if (type == END_OF_RE)
 424             bufp->can_be_null = 1;
 425           return;
 426         }
 427     }
 428 }
 429 \f
 430 /* Entry point for POSIX code.  */
 431 /* regcomp takes a regular expression as a string and compiles it.
 432
 433    PREG is a regex_t *.  We do not expect any fields to be initialized,
 434    since POSIX says we shouldn't.  Thus, we set
 435
 436      `buffer' to the compiled pattern;
 437      `used' to the length of the compiled pattern;
 438      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 440        RE_SYNTAX_POSIX_BASIC;
 441      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 442      `fastmap' to an allocated space for the fastmap;
 443      `fastmap_accurate' to zero;
 444      `re_nsub' to the number of subexpressions in PATTERN.
 445
 446    PATTERN is the address of the pattern string.
 447
 448    CFLAGS is a series of bits which affect compilation.
 449
 450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 451      use POSIX basic syntax.
 452
 453      If REG_NEWLINE is set, then . and [^...] don't match newline.
 454      Also, regexec will try a match beginning after every newline.
 455
 456      If REG_ICASE is set, then we considers upper- and lowercase
 457      versions of letters to be equivalent when matching.
 458
 459      If REG_NOSUB is set, then when PREG is passed to regexec, that
 460      routine will report only success or failure, and nothing about the
 461      registers.
 462
 463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 464    the return codes and their meanings.)  */
 465
 466 int
 467 regcomp (preg, pattern, cflags)
 468     regex_t *__restrict preg;
 469     const char *__restrict pattern;
 470     int cflags;
 471 {
 472   reg_errcode_t ret;
 473   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 474                          : RE_SYNTAX_POSIX_BASIC);
 475
 476   preg->buffer = NULL;
 477   preg->allocated = 0;
 478   preg->used = 0;
 479
 480   /* Try to allocate space for the fastmap.  */
 481   preg->fastmap = re_malloc (char, SBC_MAX);
 482   if (BE (preg->fastmap == NULL, 0))
 483     return REG_ESPACE;
 484
 485   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 486
 487   /* If REG_NEWLINE is set, newlines are treated differently.  */
 488   if (cflags & REG_NEWLINE)
 489     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 490       syntax &= ~RE_DOT_NEWLINE;
 491       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 492       /* It also changes the matching behavior.  */
 493       preg->newline_anchor = 1;
 494     }
 495   else
 496     preg->newline_anchor = 0;
 497   preg->no_sub = !!(cflags & REG_NOSUB);
 498   preg->translate = NULL;
 499
 500   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 501
 502   /* POSIX doesn't distinguish between an unmatched open-group and an
 503      unmatched close-group: both are REG_EPAREN.  */
 504   if (ret == REG_ERPAREN)
 505     ret = REG_EPAREN;
 506
 507   /* We have already checked preg->fastmap != NULL.  */
 508   if (BE (ret == REG_NOERROR, 1))
 509     /* Compute the fastmap now, since regexec cannot modify the pattern
 510        buffer.  This function never fails in this implementation.  */
 511     (void) re_compile_fastmap (preg);
 512   else
 513     {
 514       /* Some error occurred while compiling the expression.  */
 515       re_free (preg->fastmap);
 516       preg->fastmap = NULL;
 517     }
 518
 519   return (int) ret;
 520 }
 521 #ifdef _LIBC
 522 weak_alias (__regcomp, regcomp)
 523 #endif
 524
 525 /* Returns a message corresponding to an error code, ERRCODE, returned
 526    from either regcomp or regexec.   We don't use PREG here.  */
 527
 528 size_t
 529 regerror (errcode, preg, errbuf, errbuf_size)
 530     int errcode;
 531     const regex_t *__restrict preg;
 532     char *__restrict errbuf;
 533     size_t errbuf_size;
 534 {
 535   const char *msg;
 536   size_t msg_size;
 537
 538   if (BE (errcode < 0
 539           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 540                                / sizeof (__re_error_msgid_idx[0])), 0))
 541     /* Only error codes returned by the rest of the code should be passed
 542        to this routine.  If we are given anything else, or if other regex
 543        code generates an invalid error code, then the program has a bug.
 544        Dump core so we can fix it.  */
 545     abort ();
 546
 547   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 548
 549   msg_size = strlen (msg) + 1; /* Includes the null.  */
 550
 551   if (BE (errbuf_size != 0, 1))
 552     {
 553       if (BE (msg_size > errbuf_size, 0))
 554         {
 555 #if defined HAVE_MEMPCPY || defined _LIBC
 556           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 557 #else
 558           memcpy (errbuf, msg, errbuf_size - 1);
 559           errbuf[errbuf_size - 1] = 0;
 560 #endif
 561         }
 562       else
 563         memcpy (errbuf, msg, msg_size);
 564     }
 565
 566   return msg_size;
 567 }
 568 #ifdef _LIBC
 569 weak_alias (__regerror, regerror)
 570 #endif
 571
 572
 573 #ifdef RE_ENABLE_I18N
 574 /* This static array is used for the map to single-byte characters when
 575    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 576    it the same all the time.  UTF-8 is the preferred encoding so this is
 577    a worthwhile optimization.  */
 578 static const bitset_t utf8_sb_map =
 579 {
 580   /* Set the first 128 bits.  */
 581   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 582 };
 583 #endif
 584
 585
 586 static void
 587 free_dfa_content (re_dfa_t *dfa)
 588 {
 589   int i, j;
 590
 591   if (dfa->nodes)
 592     for (i = 0; i < dfa->nodes_len; ++i)
 593       free_token (dfa->nodes + i);
 594   re_free (dfa->nexts);
 595   for (i = 0; i < dfa->nodes_len; ++i)
 596     {
 597       if (dfa->eclosures != NULL)
 598         re_node_set_free (dfa->eclosures + i);
 599       if (dfa->inveclosures != NULL)
 600         re_node_set_free (dfa->inveclosures + i);
 601       if (dfa->edests != NULL)
 602         re_node_set_free (dfa->edests + i);
 603     }
 604   re_free (dfa->edests);
 605   re_free (dfa->eclosures);
 606   re_free (dfa->inveclosures);
 607   re_free (dfa->nodes);
 608
 609   if (dfa->state_table)
 610     for (i = 0; i <= dfa->state_hash_mask; ++i)
 611       {
 612         struct re_state_table_entry *entry = dfa->state_table + i;
 613         for (j = 0; j < entry->num; ++j)
 614           {
 615             re_dfastate_t *state = entry->array[j];
 616             free_state (state);
 617           }
 618         re_free (entry->array);
 619       }
 620   re_free (dfa->state_table);
 621 #ifdef RE_ENABLE_I18N
 622   if (dfa->sb_char != utf8_sb_map)
 623     re_free (dfa->sb_char);
 624 #endif
 625   re_free (dfa->subexp_map);
 626 #ifdef DEBUG
 627   re_free (dfa->re_str);
 628 #endif
 629
 630   re_free (dfa);
 631 }
 632
 633
 634 /* Free dynamically allocated space used by PREG.  */
 635
 636 void
 637 regfree (preg)
 638     regex_t *preg;
 639 {
 640   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 641   if (BE (dfa != NULL, 1))
 642     free_dfa_content (dfa);
 643   preg->buffer = NULL;
 644   preg->allocated = 0;
 645
 646   re_free (preg->fastmap);
 647   preg->fastmap = NULL;
 648
 649   re_free (preg->translate);
 650   preg->translate = NULL;
 651 }
 652 #ifdef _LIBC
 653 weak_alias (__regfree, regfree)
 654 #endif
 655 \f
 656 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 657    them unless specifically requested.  */
 658
 659 #if defined _REGEX_RE_COMP || defined _LIBC
 660
 661 /* BSD has one and only one pattern buffer.  */
 662 static struct re_pattern_buffer re_comp_buf;
 663
 664 char *
 665 # ifdef _LIBC
 666 /* Make these definitions weak in libc, so POSIX programs can redefine
 667    these names if they don't use our functions, and still use
 668    regcomp/regexec above without link errors.  */
 669 weak_function
 670 # endif
 671 re_comp (s)
 672      const char *s;
 673 {
 674   reg_errcode_t ret;
 675   char *fastmap;
 676
 677   if (!s)
 678     {
 679       if (!re_comp_buf.buffer)
 680         return gettext ("No previous regular expression");
 681       return 0;
 682     }
 683
 684   if (re_comp_buf.buffer)
 685     {
 686       fastmap = re_comp_buf.fastmap;
 687       re_comp_buf.fastmap = NULL;
 688       __regfree (&re_comp_buf);
 689       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 690       re_comp_buf.fastmap = fastmap;
 691     }
 692
 693   if (re_comp_buf.fastmap == NULL)
 694     {
 695       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 696       if (re_comp_buf.fastmap == NULL)
 697         return (char *) gettext (__re_error_msgid
 698                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 699     }
 700
 701   /* Since `re_exec' always passes NULL for the `regs' argument, we
 702      don't need to initialize the pattern buffer fields which affect it.  */
 703
 704   /* Match anchors at newlines.  */
 705   re_comp_buf.newline_anchor = 1;
 706
 707   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 708
 709   if (!ret)
 710     return NULL;
 711
 712   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 713   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 714 }
 715
 716 #ifdef _LIBC
 717 libc_freeres_fn (free_mem)
 718 {
 719   __regfree (&re_comp_buf);
 720 }
 721 #endif
 722
 723 #endif /* _REGEX_RE_COMP */
 724 \f
 725 /* Internal entry point.
 726    Compile the regular expression PATTERN, whose length is LENGTH.
 727    SYNTAX indicate regular expression's syntax.  */
 728
 729 static reg_errcode_t
 730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 731                      reg_syntax_t syntax)
 732 {
 733   reg_errcode_t err = REG_NOERROR;
 734   re_dfa_t *dfa;
 735   re_string_t regexp;
 736
 737   /* Initialize the pattern buffer.  */
 738   preg->fastmap_accurate = 0;
 739   preg->syntax = syntax;
 740   preg->not_bol = preg->not_eol = 0;
 741   preg->used = 0;
 742   preg->re_nsub = 0;
 743   preg->can_be_null = 0;
 744   preg->regs_allocated = REGS_UNALLOCATED;
 745
 746   /* Initialize the dfa.  */
 747   dfa = (re_dfa_t *) preg->buffer;
 748   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 749     {
 750       /* If zero allocated, but buffer is non-null, try to realloc
 751          enough space.  This loses if buffer's address is bogus, but
 752          that is the user's responsibility.  If ->buffer is NULL this
 753          is a simple allocation.  */
 754       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 755       if (dfa == NULL)
 756         return REG_ESPACE;
 757       preg->allocated = sizeof (re_dfa_t);
 758       preg->buffer = (unsigned char *) dfa;
 759     }
 760   preg->used = sizeof (re_dfa_t);
 761
 762   err = init_dfa (dfa, length);
 763   if (BE (err != REG_NOERROR, 0))
 764     {
 765       free_dfa_content (dfa);
 766       preg->buffer = NULL;
 767       preg->allocated = 0;
 768       return err;
 769     }
 770 #ifdef DEBUG
 771   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 772   dfa->re_str = re_malloc (char, length + 1);
 773   strncpy (dfa->re_str, pattern, length + 1);
 774 #endif
 775
 776   __libc_lock_init (dfa->lock);
 777
 778   err = re_string_construct (&regexp, pattern, length, preg->translate,
 779                              syntax & RE_ICASE, dfa);
 780   if (BE (err != REG_NOERROR, 0))
 781     {
 782     re_compile_internal_free_return:
 783       free_workarea_compile (preg);
 784       re_string_destruct (&regexp);
 785       free_dfa_content (dfa);
 786       preg->buffer = NULL;
 787       preg->allocated = 0;
 788       return err;
 789     }
 790
 791   /* Parse the regular expression, and build a structure tree.  */
 792   preg->re_nsub = 0;
 793   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 794   if (BE (dfa->str_tree == NULL, 0))
 795     goto re_compile_internal_free_return;
 796
 797   /* Analyze the tree and create the nfa.  */
 798   err = analyze (preg);
 799   if (BE (err != REG_NOERROR, 0))
 800     goto re_compile_internal_free_return;
 801
 802 #ifdef RE_ENABLE_I18N
 803   /* If possible, do searching in single byte encoding to speed things up.  */
 804   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 805     optimize_utf8 (dfa);
 806 #endif
 807
 808   /* Then create the initial state of the dfa.  */
 809   err = create_initial_state (dfa);
 810
 811   /* Release work areas.  */
 812   free_workarea_compile (preg);
 813   re_string_destruct (&regexp);
 814
 815   if (BE (err != REG_NOERROR, 0))
 816     {
 817       free_dfa_content (dfa);
 818       preg->buffer = NULL;
 819       preg->allocated = 0;
 820     }
 821
 822   return err;
 823 }
 824
 825 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 826    as the initial length of some arrays.  */
 827
 828 static reg_errcode_t
 829 init_dfa (re_dfa_t *dfa, size_t pat_len)
 830 {
 831   unsigned int table_size;
 832 #ifndef _LIBC
 833   char *codeset_name;
 834 #endif
 835
 836   memset (dfa, '\0', sizeof (re_dfa_t));
 837
 838   /* Force allocation of str_tree_storage the first time.  */
 839   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 840
 841   /* Avoid overflows.  */
 842   if (pat_len == SIZE_MAX)
 843     return REG_ESPACE;
 844
 845   dfa->nodes_alloc = pat_len + 1;
 846   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 847
 848   /*  table_size = 2 ^ ceil(log pat_len) */
 849   for (table_size = 1; ; table_size <<= 1)
 850     if (table_size > pat_len)
 851       break;
 852
 853   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 854   dfa->state_hash_mask = table_size - 1;
 855
 856   dfa->mb_cur_max = MB_CUR_MAX;
 857 #ifdef _LIBC
 858   if (dfa->mb_cur_max == 6
 859       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 860     dfa->is_utf8 = 1;
 861   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 862                        != 0);
 863 #else
 864 # ifdef HAVE_LANGINFO_CODESET
 865   codeset_name = nl_langinfo (CODESET);
 866 # else
 867   codeset_name = getenv ("LC_ALL");
 868   if (codeset_name == NULL || codeset_name[0] == '\0')
 869     codeset_name = getenv ("LC_CTYPE");
 870   if (codeset_name == NULL || codeset_name[0] == '\0')
 871     codeset_name = getenv ("LANG");
 872   if (codeset_name == NULL)
 873     codeset_name = "";
 874   else if (strchr (codeset_name, '.') !=  NULL)
 875     codeset_name = strchr (codeset_name, '.') + 1;
 876 # endif
 877
 878   if (strcasecmp (codeset_name, "UTF-8") == 0
 879       || strcasecmp (codeset_name, "UTF8") == 0)
 880     dfa->is_utf8 = 1;
 881
 882   /* We check exhaustively in the loop below if this charset is a
 883      superset of ASCII.  */
 884   dfa->map_notascii = 0;
 885 #endif
 886
 887 #ifdef RE_ENABLE_I18N
 888   if (dfa->mb_cur_max > 1)
 889     {
 890       if (dfa->is_utf8)
 891         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 892       else
 893         {
 894           int i, j, ch;
 895
 896           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 897           if (BE (dfa->sb_char == NULL, 0))
 898             return REG_ESPACE;
 899
 900           /* Set the bits corresponding to single byte chars.  */
 901           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 902             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 903               {
 904                 wint_t wch = __btowc (ch);
 905                 if (wch != WEOF)
 906                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 907 # ifndef _LIBC
 908                 if (isascii (ch) && wch != ch)
 909                   dfa->map_notascii = 1;
 910 # endif
 911               }
 912         }
 913     }
 914 #endif
 915
 916   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 917     return REG_ESPACE;
 918   return REG_NOERROR;
 919 }
 920
 921 /* Initialize WORD_CHAR table, which indicate which character is
 922    "word".  In this case "word" means that it is the word construction
 923    character used by some operators like "\<", "\>", etc.  */
 924
 925 static void
 926 internal_function
 927 init_word_char (re_dfa_t *dfa)
 928 {
 929   dfa->word_ops_used = 1;
 930   int i = 0;
 931   int ch = 0;
 932   if (BE (dfa->map_notascii == 0, 1))
 933     {
 934       if (sizeof (dfa->word_char[0]) == 8)
 935         {
 936           dfa->word_char[0] = UINT64_C (0x03ff000000000000);
 937           dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
 938           i = 2;
 939         }
 940       else if (sizeof (dfa->word_char[0]) == 4)
 941         {
 942           dfa->word_char[0] = UINT32_C (0x00000000);
 943           dfa->word_char[1] = UINT32_C (0x03ff0000);
 944           dfa->word_char[2] = UINT32_C (0x87fffffe);
 945           dfa->word_char[3] = UINT32_C (0x07fffffe);
 946           i = 4;
 947         }
 948       else
 949         abort ();
 950       ch = 128;
 951
 952       if (BE (dfa->is_utf8, 1))
 953         {
 954           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 955           return;
 956         }
 957     }
 958
 959   for (; i < BITSET_WORDS; ++i)
 960     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 961       if (isalnum (ch) || ch == '_')
 962         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 963 }
 964
 965 /* Free the work area which are only used while compiling.  */
 966
 967 static void
 968 free_workarea_compile (regex_t *preg)
 969 {
 970   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 971   bin_tree_storage_t *storage, *next;
 972   for (storage = dfa->str_tree_storage; storage; storage = next)
 973     {
 974       next = storage->next;
 975       re_free (storage);
 976     }
 977   dfa->str_tree_storage = NULL;
 978   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 979   dfa->str_tree = NULL;
 980   re_free (dfa->org_indices);
 981   dfa->org_indices = NULL;
 982 }
 983
 984 /* Create initial states for all contexts.  */
 985
 986 static reg_errcode_t
 987 create_initial_state (re_dfa_t *dfa)
 988 {
 989   int first, i;
 990   reg_errcode_t err;
 991   re_node_set init_nodes;
 992
 993   /* Initial states have the epsilon closure of the node which is
 994      the first node of the regular expression.  */
 995   first = dfa->str_tree->first->node_idx;
 996   dfa->init_node = first;
 997   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 998   if (BE (err != REG_NOERROR, 0))
 999     return err;
1000
1001   /* The back-references which are in initial states can epsilon transit,
1002      since in this case all of the subexpressions can be null.
1003      Then we add epsilon closures of the nodes which are the next nodes of
1004      the back-references.  */
1005   if (dfa->nbackref > 0)
1006     for (i = 0; i < init_nodes.nelem; ++i)
1007       {
1008         int node_idx = init_nodes.elems[i];
1009         re_token_type_t type = dfa->nodes[node_idx].type;
1010
1011         int clexp_idx;
1012         if (type != OP_BACK_REF)
1013           continue;
1014         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1015           {
1016             re_token_t *clexp_node;
1017             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1018             if (clexp_node->type == OP_CLOSE_SUBEXP
1019                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1020               break;
1021           }
1022         if (clexp_idx == init_nodes.nelem)
1023           continue;
1024
1025         if (type == OP_BACK_REF)
1026           {
1027             int dest_idx = dfa->edests[node_idx].elems[0];
1028             if (!re_node_set_contains (&init_nodes, dest_idx))
1029               {
1030                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1031                                                        dfa->eclosures
1032                                                        + dest_idx);
1033                 if (err != REG_NOERROR)
1034                   return err;
1035                 i = 0;
1036               }
1037           }
1038       }
1039
1040   /* It must be the first time to invoke acquire_state.  */
1041   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1042   /* We don't check ERR here, since the initial state must not be NULL.  */
1043   if (BE (dfa->init_state == NULL, 0))
1044     return err;
1045   if (dfa->init_state->has_constraint)
1046     {
1047       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1048                                                        CONTEXT_WORD);
1049       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1050                                                      CONTEXT_NEWLINE);
1051       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1052                                                          &init_nodes,
1053                                                          CONTEXT_NEWLINE
1054                                                          | CONTEXT_BEGBUF);
1055       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1056               || dfa->init_state_begbuf == NULL, 0))
1057         return err;
1058     }
1059   else
1060     dfa->init_state_word = dfa->init_state_nl
1061       = dfa->init_state_begbuf = dfa->init_state;
1062
1063   re_node_set_free (&init_nodes);
1064   return REG_NOERROR;
1065 }
1066 \f
1067 #ifdef RE_ENABLE_I18N
1068 /* If it is possible to do searching in single byte encoding instead of UTF-8
1069    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1070    DFA nodes where needed.  */
1071
1072 static void
1073 optimize_utf8 (re_dfa_t *dfa)
1074 {
1075   int node, i, mb_chars = 0, has_period = 0;
1076
1077   for (node = 0; node < dfa->nodes_len; ++node)
1078     switch (dfa->nodes[node].type)
1079       {
1080       case CHARACTER:
1081         if (dfa->nodes[node].opr.c >= 0x80)
1082           mb_chars = 1;
1083         break;
1084       case ANCHOR:
1085         switch (dfa->nodes[node].opr.ctx_type)
1086           {
1087           case LINE_FIRST:
1088           case LINE_LAST:
1089           case BUF_FIRST:
1090           case BUF_LAST:
1091             break;
1092           default:
1093             /* Word anchors etc. cannot be handled.  It's okay to test
1094                opr.ctx_type since constraints (for all DFA nodes) are
1095                created by ORing one or more opr.ctx_type values.  */
1096             return;
1097           }
1098         break;
1099       case OP_PERIOD:
1100         has_period = 1;
1101         break;
1102       case OP_BACK_REF:
1103       case OP_ALT:
1104       case END_OF_RE:
1105       case OP_DUP_ASTERISK:
1106       case OP_OPEN_SUBEXP:
1107       case OP_CLOSE_SUBEXP:
1108         break;
1109       case COMPLEX_BRACKET:
1110         return;
1111       case SIMPLE_BRACKET:
1112         /* Just double check.  The non-ASCII range starts at 0x80.  */
1113         assert (0x80 % BITSET_WORD_BITS == 0);
1114         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1115           if (dfa->nodes[node].opr.sbcset[i])
1116             return;
1117         break;
1118       default:
1119         abort ();
1120       }
1121
1122   if (mb_chars || has_period)
1123     for (node = 0; node < dfa->nodes_len; ++node)
1124       {
1125         if (dfa->nodes[node].type == CHARACTER
1126             && dfa->nodes[node].opr.c >= 0x80)
1127           dfa->nodes[node].mb_partial = 0;
1128         else if (dfa->nodes[node].type == OP_PERIOD)
1129           dfa->nodes[node].type = OP_UTF8_PERIOD;
1130       }
1131
1132   /* The search can be in single byte locale.  */
1133   dfa->mb_cur_max = 1;
1134   dfa->is_utf8 = 0;
1135   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1136 }
1137 #endif
1138 \f
1139 /* Analyze the structure tree, and calculate "first", "next", "edest",
1140    "eclosure", and "inveclosure".  */
1141
1142 static reg_errcode_t
1143 analyze (regex_t *preg)
1144 {
1145   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1146   reg_errcode_t ret;
1147
1148   /* Allocate arrays.  */
1149   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1150   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1151   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1152   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1153   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1154           || dfa->eclosures == NULL, 0))
1155     return REG_ESPACE;
1156
1157   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1158   if (dfa->subexp_map != NULL)
1159     {
1160       int i;
1161       for (i = 0; i < preg->re_nsub; i++)
1162         dfa->subexp_map[i] = i;
1163       preorder (dfa->str_tree, optimize_subexps, dfa);
1164       for (i = 0; i < preg->re_nsub; i++)
1165         if (dfa->subexp_map[i] != i)
1166           break;
1167       if (i == preg->re_nsub)
1168         {
1169           free (dfa->subexp_map);
1170           dfa->subexp_map = NULL;
1171         }
1172     }
1173
1174   ret = postorder (dfa->str_tree, lower_subexps, preg);
1175   if (BE (ret != REG_NOERROR, 0))
1176     return ret;
1177   ret = postorder (dfa->str_tree, calc_first, dfa);
1178   if (BE (ret != REG_NOERROR, 0))
1179     return ret;
1180   preorder (dfa->str_tree, calc_next, dfa);
1181   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1182   if (BE (ret != REG_NOERROR, 0))
1183     return ret;
1184   ret = calc_eclosure (dfa);
1185   if (BE (ret != REG_NOERROR, 0))
1186     return ret;
1187
1188   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1189      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1190   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1191       || dfa->nbackref)
1192     {
1193       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1194       if (BE (dfa->inveclosures == NULL, 0))
1195         return REG_ESPACE;
1196       ret = calc_inveclosure (dfa);
1197     }
1198
1199   return ret;
1200 }
1201
1202 /* Our parse trees are very unbalanced, so we cannot use a stack to
1203    implement parse tree visits.  Instead, we use parent pointers and
1204    some hairy code in these two functions.  */
1205 static reg_errcode_t
1206 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1207            void *extra)
1208 {
1209   bin_tree_t *node, *prev;
1210
1211   for (node = root; ; )
1212     {
1213       /* Descend down the tree, preferably to the left (or to the right
1214          if that's the only child).  */
1215       while (node->left || node->right)
1216         if (node->left)
1217           node = node->left;
1218         else
1219           node = node->right;
1220
1221       do
1222         {
1223           reg_errcode_t err = fn (extra, node);
1224           if (BE (err != REG_NOERROR, 0))
1225             return err;
1226           if (node->parent == NULL)
1227             return REG_NOERROR;
1228           prev = node;
1229           node = node->parent;
1230         }
1231       /* Go up while we have a node that is reached from the right.  */
1232       while (node->right == prev || node->right == NULL);
1233       node = node->right;
1234     }
1235 }
1236
1237 static reg_errcode_t
1238 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1239           void *extra)
1240 {
1241   bin_tree_t *node;
1242
1243   for (node = root; ; )
1244     {
1245       reg_errcode_t err = fn (extra, node);
1246       if (BE (err != REG_NOERROR, 0))
1247         return err;
1248
1249       /* Go to the left node, or up and to the right.  */
1250       if (node->left)
1251         node = node->left;
1252       else
1253         {
1254           bin_tree_t *prev = NULL;
1255           while (node->right == prev || node->right == NULL)
1256             {
1257               prev = node;
1258               node = node->parent;
1259               if (!node)
1260                 return REG_NOERROR;
1261             }
1262           node = node->right;
1263         }
1264     }
1265 }
1266
1267 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1268    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1269    backreferences as well.  Requires a preorder visit.  */
1270 static reg_errcode_t
1271 optimize_subexps (void *extra, bin_tree_t *node)
1272 {
1273   re_dfa_t *dfa = (re_dfa_t *) extra;
1274
1275   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1276     {
1277       int idx = node->token.opr.idx;
1278       node->token.opr.idx = dfa->subexp_map[idx];
1279       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1280     }
1281
1282   else if (node->token.type == SUBEXP
1283            && node->left && node->left->token.type == SUBEXP)
1284     {
1285       int other_idx = node->left->token.opr.idx;
1286
1287       node->left = node->left->left;
1288       if (node->left)
1289         node->left->parent = node;
1290
1291       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1292       if (other_idx < BITSET_WORD_BITS)
1293           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1294     }
1295
1296   return REG_NOERROR;
1297 }
1298
1299 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1300    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1301 static reg_errcode_t
1302 lower_subexps (void *extra, bin_tree_t *node)
1303 {
1304   regex_t *preg = (regex_t *) extra;
1305   reg_errcode_t err = REG_NOERROR;
1306
1307   if (node->left && node->left->token.type == SUBEXP)
1308     {
1309       node->left = lower_subexp (&err, preg, node->left);
1310       if (node->left)
1311         node->left->parent = node;
1312     }
1313   if (node->right && node->right->token.type == SUBEXP)
1314     {
1315       node->right = lower_subexp (&err, preg, node->right);
1316       if (node->right)
1317         node->right->parent = node;
1318     }
1319
1320   return err;
1321 }
1322
1323 static bin_tree_t *
1324 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1325 {
1326   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1327   bin_tree_t *body = node->left;
1328   bin_tree_t *op, *cls, *tree1, *tree;
1329
1330   if (preg->no_sub
1331       /* We do not optimize empty subexpressions, because otherwise we may
1332          have bad CONCAT nodes with NULL children.  This is obviously not
1333          very common, so we do not lose much.  An example that triggers
1334          this case is the sed "script" /\(\)/x.  */
1335       && node->left != NULL
1336       && (node->token.opr.idx >= BITSET_WORD_BITS
1337           || !(dfa->used_bkref_map
1338                & ((bitset_word_t) 1 << node->token.opr.idx))))
1339     return node->left;
1340
1341   /* Convert the SUBEXP node to the concatenation of an
1342      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1343   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1344   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1345   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1346   tree = create_tree (dfa, op, tree1, CONCAT);
1347   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1348     {
1349       *err = REG_ESPACE;
1350       return NULL;
1351     }
1352
1353   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1354   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1355   return tree;
1356 }
1357
1358 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1359    nodes.  Requires a postorder visit.  */
1360 static reg_errcode_t
1361 calc_first (void *extra, bin_tree_t *node)
1362 {
1363   re_dfa_t *dfa = (re_dfa_t *) extra;
1364   if (node->token.type == CONCAT)
1365     {
1366       node->first = node->left->first;
1367       node->node_idx = node->left->node_idx;
1368     }
1369   else
1370     {
1371       node->first = node;
1372       node->node_idx = re_dfa_add_node (dfa, node->token);
1373       if (BE (node->node_idx == -1, 0))
1374         return REG_ESPACE;
1375       if (node->token.type == ANCHOR)
1376         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1377     }
1378   return REG_NOERROR;
1379 }
1380
1381 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1382 static reg_errcode_t
1383 calc_next (void *extra, bin_tree_t *node)
1384 {
1385   switch (node->token.type)
1386     {
1387     case OP_DUP_ASTERISK:
1388       node->left->next = node;
1389       break;
1390     case CONCAT:
1391       node->left->next = node->right->first;
1392       node->right->next = node->next;
1393       break;
1394     default:
1395       if (node->left)
1396         node->left->next = node->next;
1397       if (node->right)
1398         node->right->next = node->next;
1399       break;
1400     }
1401   return REG_NOERROR;
1402 }
1403
1404 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1405 static reg_errcode_t
1406 link_nfa_nodes (void *extra, bin_tree_t *node)
1407 {
1408   re_dfa_t *dfa = (re_dfa_t *) extra;
1409   int idx = node->node_idx;
1410   reg_errcode_t err = REG_NOERROR;
1411
1412   switch (node->token.type)
1413     {
1414     case CONCAT:
1415       break;
1416
1417     case END_OF_RE:
1418       assert (node->next == NULL);
1419       break;
1420
1421     case OP_DUP_ASTERISK:
1422     case OP_ALT:
1423       {
1424         int left, right;
1425         dfa->has_plural_match = 1;
1426         if (node->left != NULL)
1427           left = node->left->first->node_idx;
1428         else
1429           left = node->next->node_idx;
1430         if (node->right != NULL)
1431           right = node->right->first->node_idx;
1432         else
1433           right = node->next->node_idx;
1434         assert (left > -1);
1435         assert (right > -1);
1436         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1437       }
1438       break;
1439
1440     case ANCHOR:
1441     case OP_OPEN_SUBEXP:
1442     case OP_CLOSE_SUBEXP:
1443       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1444       break;
1445
1446     case OP_BACK_REF:
1447       dfa->nexts[idx] = node->next->node_idx;
1448       if (node->token.type == OP_BACK_REF)
1449         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1450       break;
1451
1452     default:
1453       assert (!IS_EPSILON_NODE (node->token.type));
1454       dfa->nexts[idx] = node->next->node_idx;
1455       break;
1456     }
1457
1458   return err;
1459 }
1460
1461 /* Duplicate the epsilon closure of the node ROOT_NODE.
1462    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1463    to their own constraint.  */
1464
1465 static reg_errcode_t
1466 internal_function
1467 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1468                         int root_node, unsigned int init_constraint)
1469 {
1470   int org_node, clone_node, ret;
1471   unsigned int constraint = init_constraint;
1472   for (org_node = top_org_node, clone_node = top_clone_node;;)
1473     {
1474       int org_dest, clone_dest;
1475       if (dfa->nodes[org_node].type == OP_BACK_REF)
1476         {
1477           /* If the back reference epsilon-transit, its destination must
1478              also have the constraint.  Then duplicate the epsilon closure
1479              of the destination of the back reference, and store it in
1480              edests of the back reference.  */
1481           org_dest = dfa->nexts[org_node];
1482           re_node_set_empty (dfa->edests + clone_node);
1483           clone_dest = duplicate_node (dfa, org_dest, constraint);
1484           if (BE (clone_dest == -1, 0))
1485             return REG_ESPACE;
1486           dfa->nexts[clone_node] = dfa->nexts[org_node];
1487           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488           if (BE (ret < 0, 0))
1489             return REG_ESPACE;
1490         }
1491       else if (dfa->edests[org_node].nelem == 0)
1492         {
1493           /* In case of the node can't epsilon-transit, don't duplicate the
1494              destination and store the original destination as the
1495              destination of the node.  */
1496           dfa->nexts[clone_node] = dfa->nexts[org_node];
1497           break;
1498         }
1499       else if (dfa->edests[org_node].nelem == 1)
1500         {
1501           /* In case of the node can epsilon-transit, and it has only one
1502              destination.  */
1503           org_dest = dfa->edests[org_node].elems[0];
1504           re_node_set_empty (dfa->edests + clone_node);
1505           /* If the node is root_node itself, it means the epsilon clsoure
1506              has a loop.   Then tie it to the destination of the root_node.  */
1507           if (org_node == root_node && clone_node != org_node)
1508             {
1509               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1510               if (BE (ret < 0, 0))
1511                 return REG_ESPACE;
1512               break;
1513             }
1514           /* In case of the node has another constraint, add it.  */
1515           constraint |= dfa->nodes[org_node].constraint;
1516           clone_dest = duplicate_node (dfa, org_dest, constraint);
1517           if (BE (clone_dest == -1, 0))
1518             return REG_ESPACE;
1519           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520           if (BE (ret < 0, 0))
1521             return REG_ESPACE;
1522         }
1523       else /* dfa->edests[org_node].nelem == 2 */
1524         {
1525           /* In case of the node can epsilon-transit, and it has two
1526              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1527           org_dest = dfa->edests[org_node].elems[0];
1528           re_node_set_empty (dfa->edests + clone_node);
1529           /* Search for a duplicated node which satisfies the constraint.  */
1530           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1531           if (clone_dest == -1)
1532             {
1533               /* There is no such duplicated node, create a new one.  */
1534               reg_errcode_t err;
1535               clone_dest = duplicate_node (dfa, org_dest, constraint);
1536               if (BE (clone_dest == -1, 0))
1537                 return REG_ESPACE;
1538               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1539               if (BE (ret < 0, 0))
1540                 return REG_ESPACE;
1541               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1542                                             root_node, constraint);
1543               if (BE (err != REG_NOERROR, 0))
1544                 return err;
1545             }
1546           else
1547             {
1548               /* There is a duplicated node which satisfies the constraint,
1549                  use it to avoid infinite loop.  */
1550               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1551               if (BE (ret < 0, 0))
1552                 return REG_ESPACE;
1553             }
1554
1555           org_dest = dfa->edests[org_node].elems[1];
1556           clone_dest = duplicate_node (dfa, org_dest, constraint);
1557           if (BE (clone_dest == -1, 0))
1558             return REG_ESPACE;
1559           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560           if (BE (ret < 0, 0))
1561             return REG_ESPACE;
1562         }
1563       org_node = org_dest;
1564       clone_node = clone_dest;
1565     }
1566   return REG_NOERROR;
1567 }
1568
1569 /* Search for a node which is duplicated from the node ORG_NODE, and
1570    satisfies the constraint CONSTRAINT.  */
1571
1572 static int
1573 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1574                         unsigned int constraint)
1575 {
1576   int idx;
1577   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1578     {
1579       if (org_node == dfa->org_indices[idx]
1580           && constraint == dfa->nodes[idx].constraint)
1581         return idx; /* Found.  */
1582     }
1583   return -1; /* Not found.  */
1584 }
1585
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587    Return the index of the new node, or -1 if insufficient storage is
1588    available.  */
1589
1590 static int
1591 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1592 {
1593   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1594   if (BE (dup_idx != -1, 1))
1595     {
1596       dfa->nodes[dup_idx].constraint = constraint;
1597       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1598       dfa->nodes[dup_idx].duplicated = 1;
1599
1600       /* Store the index of the original node.  */
1601       dfa->org_indices[dup_idx] = org_idx;
1602     }
1603   return dup_idx;
1604 }
1605
1606 static reg_errcode_t
1607 calc_inveclosure (re_dfa_t *dfa)
1608 {
1609   int src, idx, ret;
1610   for (idx = 0; idx < dfa->nodes_len; ++idx)
1611     re_node_set_init_empty (dfa->inveclosures + idx);
1612
1613   for (src = 0; src < dfa->nodes_len; ++src)
1614     {
1615       int *elems = dfa->eclosures[src].elems;
1616       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1617         {
1618           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1619           if (BE (ret == -1, 0))
1620             return REG_ESPACE;
1621         }
1622     }
1623
1624   return REG_NOERROR;
1625 }
1626
1627 /* Calculate "eclosure" for all the node in DFA.  */
1628
1629 static reg_errcode_t
1630 calc_eclosure (re_dfa_t *dfa)
1631 {
1632   int node_idx, incomplete;
1633 #ifdef DEBUG
1634   assert (dfa->nodes_len > 0);
1635 #endif
1636   incomplete = 0;
1637   /* For each nodes, calculate epsilon closure.  */
1638   for (node_idx = 0; ; ++node_idx)
1639     {
1640       reg_errcode_t err;
1641       re_node_set eclosure_elem;
1642       if (node_idx == dfa->nodes_len)
1643         {
1644           if (!incomplete)
1645             break;
1646           incomplete = 0;
1647           node_idx = 0;
1648         }
1649
1650 #ifdef DEBUG
1651       assert (dfa->eclosures[node_idx].nelem != -1);
1652 #endif
1653
1654       /* If we have already calculated, skip it.  */
1655       if (dfa->eclosures[node_idx].nelem != 0)
1656         continue;
1657       /* Calculate epsilon closure of `node_idx'.  */
1658       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1659       if (BE (err != REG_NOERROR, 0))
1660         return err;
1661
1662       if (dfa->eclosures[node_idx].nelem == 0)
1663         {
1664           incomplete = 1;
1665           re_node_set_free (&eclosure_elem);
1666         }
1667     }
1668   return REG_NOERROR;
1669 }
1670
1671 /* Calculate epsilon closure of NODE.  */
1672
1673 static reg_errcode_t
1674 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1675 {
1676   reg_errcode_t err;
1677   int i;
1678   re_node_set eclosure;
1679   int ret;
1680   int incomplete = 0;
1681   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1682   if (BE (err != REG_NOERROR, 0))
1683     return err;
1684
1685   /* This indicates that we are calculating this node now.
1686      We reference this value to avoid infinite loop.  */
1687   dfa->eclosures[node].nelem = -1;
1688
1689   /* If the current node has constraints, duplicate all nodes
1690      since they must inherit the constraints.  */
1691   if (dfa->nodes[node].constraint
1692       && dfa->edests[node].nelem
1693       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1694     {
1695       err = duplicate_node_closure (dfa, node, node, node,
1696                                     dfa->nodes[node].constraint);
1697       if (BE (err != REG_NOERROR, 0))
1698         return err;
1699     }
1700
1701   /* Expand each epsilon destination nodes.  */
1702   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1703     for (i = 0; i < dfa->edests[node].nelem; ++i)
1704       {
1705         re_node_set eclosure_elem;
1706         int edest = dfa->edests[node].elems[i];
1707         /* If calculating the epsilon closure of `edest' is in progress,
1708            return intermediate result.  */
1709         if (dfa->eclosures[edest].nelem == -1)
1710           {
1711             incomplete = 1;
1712             continue;
1713           }
1714         /* If we haven't calculated the epsilon closure of `edest' yet,
1715            calculate now. Otherwise use calculated epsilon closure.  */
1716         if (dfa->eclosures[edest].nelem == 0)
1717           {
1718             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1719             if (BE (err != REG_NOERROR, 0))
1720               return err;
1721           }
1722         else
1723           eclosure_elem = dfa->eclosures[edest];
1724         /* Merge the epsilon closure of `edest'.  */
1725         err = re_node_set_merge (&eclosure, &eclosure_elem);
1726         if (BE (err != REG_NOERROR, 0))
1727           return err;
1728         /* If the epsilon closure of `edest' is incomplete,
1729            the epsilon closure of this node is also incomplete.  */
1730         if (dfa->eclosures[edest].nelem == 0)
1731           {
1732             incomplete = 1;
1733             re_node_set_free (&eclosure_elem);
1734           }
1735       }
1736
1737   /* An epsilon closure includes itself.  */
1738   ret = re_node_set_insert (&eclosure, node);
1739   if (BE (ret < 0, 0))
1740     return REG_ESPACE;
1741   if (incomplete && !root)
1742     dfa->eclosures[node].nelem = 0;
1743   else
1744     dfa->eclosures[node] = eclosure;
1745   *new_set = eclosure;
1746   return REG_NOERROR;
1747 }
1748 \f
1749 /* Functions for token which are used in the parser.  */
1750
1751 /* Fetch a token from INPUT.
1752    We must not use this function inside bracket expressions.  */
1753
1754 static void
1755 internal_function
1756 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1757 {
1758   re_string_skip_bytes (input, peek_token (result, input, syntax));
1759 }
1760
1761 /* Peek a token from INPUT, and return the length of the token.
1762    We must not use this function inside bracket expressions.  */
1763
1764 static int
1765 internal_function
1766 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1767 {
1768   unsigned char c;
1769
1770   if (re_string_eoi (input))
1771     {
1772       token->type = END_OF_RE;
1773       return 0;
1774     }
1775
1776   c = re_string_peek_byte (input, 0);
1777   token->opr.c = c;
1778
1779   token->word_char = 0;
1780 #ifdef RE_ENABLE_I18N
1781   token->mb_partial = 0;
1782   if (input->mb_cur_max > 1 &&
1783       !re_string_first_byte (input, re_string_cur_idx (input)))
1784     {
1785       token->type = CHARACTER;
1786       token->mb_partial = 1;
1787       return 1;
1788     }
1789 #endif
1790   if (c == '\\')
1791     {
1792       unsigned char c2;
1793       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1794         {
1795           token->type = BACK_SLASH;
1796           return 1;
1797         }
1798
1799       c2 = re_string_peek_byte_case (input, 1);
1800       token->opr.c = c2;
1801       token->type = CHARACTER;
1802 #ifdef RE_ENABLE_I18N
1803       if (input->mb_cur_max > 1)
1804         {
1805           wint_t wc = re_string_wchar_at (input,
1806                                           re_string_cur_idx (input) + 1);
1807           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1808         }
1809       else
1810 #endif
1811         token->word_char = IS_WORD_CHAR (c2) != 0;
1812
1813       switch (c2)
1814         {
1815         case '|':
1816           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1817             token->type = OP_ALT;
1818           break;
1819         case '1': case '2': case '3': case '4': case '5':
1820         case '6': case '7': case '8': case '9':
1821           if (!(syntax & RE_NO_BK_REFS))
1822             {
1823               token->type = OP_BACK_REF;
1824               token->opr.idx = c2 - '1';
1825             }
1826           break;
1827         case '<':
1828           if (!(syntax & RE_NO_GNU_OPS))
1829             {
1830               token->type = ANCHOR;
1831               token->opr.ctx_type = WORD_FIRST;
1832             }
1833           break;
1834         case '>':
1835           if (!(syntax & RE_NO_GNU_OPS))
1836             {
1837               token->type = ANCHOR;
1838               token->opr.ctx_type = WORD_LAST;
1839             }
1840           break;
1841         case 'b':
1842           if (!(syntax & RE_NO_GNU_OPS))
1843             {
1844               token->type = ANCHOR;
1845               token->opr.ctx_type = WORD_DELIM;
1846             }
1847           break;
1848         case 'B':
1849           if (!(syntax & RE_NO_GNU_OPS))
1850             {
1851               token->type = ANCHOR;
1852               token->opr.ctx_type = NOT_WORD_DELIM;
1853             }
1854           break;
1855         case 'w':
1856           if (!(syntax & RE_NO_GNU_OPS))
1857             token->type = OP_WORD;
1858           break;
1859         case 'W':
1860           if (!(syntax & RE_NO_GNU_OPS))
1861             token->type = OP_NOTWORD;
1862           break;
1863         case 's':
1864           if (!(syntax & RE_NO_GNU_OPS))
1865             token->type = OP_SPACE;
1866           break;
1867         case 'S':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             token->type = OP_NOTSPACE;
1870           break;
1871         case '`':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             {
1874               token->type = ANCHOR;
1875               token->opr.ctx_type = BUF_FIRST;
1876             }
1877           break;
1878         case '\'':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             {
1881               token->type = ANCHOR;
1882               token->opr.ctx_type = BUF_LAST;
1883             }
1884           break;
1885         case '(':
1886           if (!(syntax & RE_NO_BK_PARENS))
1887             token->type = OP_OPEN_SUBEXP;
1888           break;
1889         case ')':
1890           if (!(syntax & RE_NO_BK_PARENS))
1891             token->type = OP_CLOSE_SUBEXP;
1892           break;
1893         case '+':
1894           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1895             token->type = OP_DUP_PLUS;
1896           break;
1897         case '?':
1898           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1899             token->type = OP_DUP_QUESTION;
1900           break;
1901         case '{':
1902           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1903             token->type = OP_OPEN_DUP_NUM;
1904           break;
1905         case '}':
1906           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1907             token->type = OP_CLOSE_DUP_NUM;
1908           break;
1909         default:
1910           break;
1911         }
1912       return 2;
1913     }
1914
1915   token->type = CHARACTER;
1916 #ifdef RE_ENABLE_I18N
1917   if (input->mb_cur_max > 1)
1918     {
1919       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1920       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1921     }
1922   else
1923 #endif
1924     token->word_char = IS_WORD_CHAR (token->opr.c);
1925
1926   switch (c)
1927     {
1928     case '\n':
1929       if (syntax & RE_NEWLINE_ALT)
1930         token->type = OP_ALT;
1931       break;
1932     case '|':
1933       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1934         token->type = OP_ALT;
1935       break;
1936     case '*':
1937       token->type = OP_DUP_ASTERISK;
1938       break;
1939     case '+':
1940       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1941         token->type = OP_DUP_PLUS;
1942       break;
1943     case '?':
1944       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1945         token->type = OP_DUP_QUESTION;
1946       break;
1947     case '{':
1948       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1949         token->type = OP_OPEN_DUP_NUM;
1950       break;
1951     case '}':
1952       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1953         token->type = OP_CLOSE_DUP_NUM;
1954       break;
1955     case '(':
1956       if (syntax & RE_NO_BK_PARENS)
1957         token->type = OP_OPEN_SUBEXP;
1958       break;
1959     case ')':
1960       if (syntax & RE_NO_BK_PARENS)
1961         token->type = OP_CLOSE_SUBEXP;
1962       break;
1963     case '[':
1964       token->type = OP_OPEN_BRACKET;
1965       break;
1966     case '.':
1967       token->type = OP_PERIOD;
1968       break;
1969     case '^':
1970       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1971           re_string_cur_idx (input) != 0)
1972         {
1973           char prev = re_string_peek_byte (input, -1);
1974           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1975             break;
1976         }
1977       token->type = ANCHOR;
1978       token->opr.ctx_type = LINE_FIRST;
1979       break;
1980     case '$':
1981       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1982           re_string_cur_idx (input) + 1 != re_string_length (input))
1983         {
1984           re_token_t next;
1985           re_string_skip_bytes (input, 1);
1986           peek_token (&next, input, syntax);
1987           re_string_skip_bytes (input, -1);
1988           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1989             break;
1990         }
1991       token->type = ANCHOR;
1992       token->opr.ctx_type = LINE_LAST;
1993       break;
1994     default:
1995       break;
1996     }
1997   return 1;
1998 }
1999
2000 /* Peek a token from INPUT, and return the length of the token.
2001    We must not use this function out of bracket expressions.  */
2002
2003 static int
2004 internal_function
2005 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2006 {
2007   unsigned char c;
2008   if (re_string_eoi (input))
2009     {
2010       token->type = END_OF_RE;
2011       return 0;
2012     }
2013   c = re_string_peek_byte (input, 0);
2014   token->opr.c = c;
2015
2016 #ifdef RE_ENABLE_I18N
2017   if (input->mb_cur_max > 1 &&
2018       !re_string_first_byte (input, re_string_cur_idx (input)))
2019     {
2020       token->type = CHARACTER;
2021       return 1;
2022     }
2023 #endif /* RE_ENABLE_I18N */
2024
2025   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2026       && re_string_cur_idx (input) + 1 < re_string_length (input))
2027     {
2028       /* In this case, '\' escape a character.  */
2029       unsigned char c2;
2030       re_string_skip_bytes (input, 1);
2031       c2 = re_string_peek_byte (input, 0);
2032       token->opr.c = c2;
2033       token->type = CHARACTER;
2034       return 1;
2035     }
2036   if (c == '[') /* '[' is a special char in a bracket exps.  */
2037     {
2038       unsigned char c2;
2039       int token_len;
2040       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2041         c2 = re_string_peek_byte (input, 1);
2042       else
2043         c2 = 0;
2044       token->opr.c = c2;
2045       token_len = 2;
2046       switch (c2)
2047         {
2048         case '.':
2049           token->type = OP_OPEN_COLL_ELEM;
2050           break;
2051         case '=':
2052           token->type = OP_OPEN_EQUIV_CLASS;
2053           break;
2054         case ':':
2055           if (syntax & RE_CHAR_CLASSES)
2056             {
2057               token->type = OP_OPEN_CHAR_CLASS;
2058               break;
2059             }
2060           /* else fall through.  */
2061         default:
2062           token->type = CHARACTER;
2063           token->opr.c = c;
2064           token_len = 1;
2065           break;
2066         }
2067       return token_len;
2068     }
2069   switch (c)
2070     {
2071     case '-':
2072       token->type = OP_CHARSET_RANGE;
2073       break;
2074     case ']':
2075       token->type = OP_CLOSE_BRACKET;
2076       break;
2077     case '^':
2078       token->type = OP_NON_MATCH_LIST;
2079       break;
2080     default:
2081       token->type = CHARACTER;
2082     }
2083   return 1;
2084 }
2085 \f
2086 /* Functions for parser.  */
2087
2088 /* Entry point of the parser.
2089    Parse the regular expression REGEXP and return the structure tree.
2090    If an error is occured, ERR is set by error code, and return NULL.
2091    This function build the following tree, from regular expression <reg_exp>:
2092            CAT
2093            / \
2094           /   \
2095    <reg_exp>  EOR
2096
2097    CAT means concatenation.
2098    EOR means end of regular expression.  */
2099
2100 static bin_tree_t *
2101 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2102        reg_errcode_t *err)
2103 {
2104   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2105   bin_tree_t *tree, *eor, *root;
2106   re_token_t current_token;
2107   dfa->syntax = syntax;
2108   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2109   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2110   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2111     return NULL;
2112   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2113   if (tree != NULL)
2114     root = create_tree (dfa, tree, eor, CONCAT);
2115   else
2116     root = eor;
2117   if (BE (eor == NULL || root == NULL, 0))
2118     {
2119       *err = REG_ESPACE;
2120       return NULL;
2121     }
2122   return root;
2123 }
2124
2125 /* This function build the following tree, from regular expression
2126    <branch1>|<branch2>:
2127            ALT
2128            / \
2129           /   \
2130    <branch1> <branch2>
2131
2132    ALT means alternative, which represents the operator `|'.  */
2133
2134 static bin_tree_t *
2135 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2136                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2137 {
2138   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2139   bin_tree_t *tree, *branch = NULL;
2140   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2141   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2142     return NULL;
2143
2144   while (token->type == OP_ALT)
2145     {
2146       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147       if (token->type != OP_ALT && token->type != END_OF_RE
2148           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2149         {
2150           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2151           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2152             return NULL;
2153         }
2154       else
2155         branch = NULL;
2156       tree = create_tree (dfa, tree, branch, OP_ALT);
2157       if (BE (tree == NULL, 0))
2158         {
2159           *err = REG_ESPACE;
2160           return NULL;
2161         }
2162     }
2163   return tree;
2164 }
2165
2166 /* This function build the following tree, from regular expression
2167    <exp1><exp2>:
2168         CAT
2169         / \
2170        /   \
2171    <exp1> <exp2>
2172
2173    CAT means concatenation.  */
2174
2175 static bin_tree_t *
2176 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2177               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2178 {
2179   bin_tree_t *tree, *exp;
2180   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2181   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2182   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183     return NULL;
2184
2185   while (token->type != OP_ALT && token->type != END_OF_RE
2186          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187     {
2188       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2189       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2190         {
2191           if (tree != NULL)
2192             postorder (tree, free_tree, NULL);
2193           return NULL;
2194         }
2195       if (tree != NULL && exp != NULL)
2196         {
2197           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2198           if (newtree == NULL)
2199             {
2200               postorder (exp, free_tree, NULL);
2201               postorder (tree, free_tree, NULL);
2202               *err = REG_ESPACE;
2203               return NULL;
2204             }
2205           tree = newtree;
2206         }
2207       else if (tree == NULL)
2208         tree = exp;
2209       /* Otherwise exp == NULL, we don't need to create new tree.  */
2210     }
2211   return tree;
2212 }
2213
2214 /* This function build the following tree, from regular expression a*:
2215          *
2216          |
2217          a
2218 */
2219
2220 static bin_tree_t *
2221 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2223 {
2224   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2225   bin_tree_t *tree;
2226   switch (token->type)
2227     {
2228     case CHARACTER:
2229       tree = create_token_tree (dfa, NULL, NULL, token);
2230       if (BE (tree == NULL, 0))
2231         {
2232           *err = REG_ESPACE;
2233           return NULL;
2234         }
2235 #ifdef RE_ENABLE_I18N
2236       if (dfa->mb_cur_max > 1)
2237         {
2238           while (!re_string_eoi (regexp)
2239                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2240             {
2241               bin_tree_t *mbc_remain;
2242               fetch_token (token, regexp, syntax);
2243               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2244               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2245               if (BE (mbc_remain == NULL || tree == NULL, 0))
2246                 {
2247                   *err = REG_ESPACE;
2248                   return NULL;
2249                 }
2250             }
2251         }
2252 #endif
2253       break;
2254     case OP_OPEN_SUBEXP:
2255       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2256       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257         return NULL;
2258       break;
2259     case OP_OPEN_BRACKET:
2260       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2261       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262         return NULL;
2263       break;
2264     case OP_BACK_REF:
2265       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2266         {
2267           *err = REG_ESUBREG;
2268           return NULL;
2269         }
2270       dfa->used_bkref_map |= 1 << token->opr.idx;
2271       tree = create_token_tree (dfa, NULL, NULL, token);
2272       if (BE (tree == NULL, 0))
2273         {
2274           *err = REG_ESPACE;
2275           return NULL;
2276         }
2277       ++dfa->nbackref;
2278       dfa->has_mb_node = 1;
2279       break;
2280     case OP_OPEN_DUP_NUM:
2281       if (syntax & RE_CONTEXT_INVALID_DUP)
2282         {
2283           *err = REG_BADRPT;
2284           return NULL;
2285         }
2286       /* FALLTHROUGH */
2287     case OP_DUP_ASTERISK:
2288     case OP_DUP_PLUS:
2289     case OP_DUP_QUESTION:
2290       if (syntax & RE_CONTEXT_INVALID_OPS)
2291         {
2292           *err = REG_BADRPT;
2293           return NULL;
2294         }
2295       else if (syntax & RE_CONTEXT_INDEP_OPS)
2296         {
2297           fetch_token (token, regexp, syntax);
2298           return parse_expression (regexp, preg, token, syntax, nest, err);
2299         }
2300       /* else fall through  */
2301     case OP_CLOSE_SUBEXP:
2302       if ((token->type == OP_CLOSE_SUBEXP) &&
2303           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2304         {
2305           *err = REG_ERPAREN;
2306           return NULL;
2307         }
2308       /* else fall through  */
2309     case OP_CLOSE_DUP_NUM:
2310       /* We treat it as a normal character.  */
2311
2312       /* Then we can these characters as normal characters.  */
2313       token->type = CHARACTER;
2314       /* mb_partial and word_char bits should be initialized already
2315          by peek_token.  */
2316       tree = create_token_tree (dfa, NULL, NULL, token);
2317       if (BE (tree == NULL, 0))
2318         {
2319           *err = REG_ESPACE;
2320           return NULL;
2321         }
2322       break;
2323     case ANCHOR:
2324       if ((token->opr.ctx_type
2325            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2326           && dfa->word_ops_used == 0)
2327         init_word_char (dfa);
2328       if (token->opr.ctx_type == WORD_DELIM
2329           || token->opr.ctx_type == NOT_WORD_DELIM)
2330         {
2331           bin_tree_t *tree_first, *tree_last;
2332           if (token->opr.ctx_type == WORD_DELIM)
2333             {
2334               token->opr.ctx_type = WORD_FIRST;
2335               tree_first = create_token_tree (dfa, NULL, NULL, token);
2336               token->opr.ctx_type = WORD_LAST;
2337             }
2338           else
2339             {
2340               token->opr.ctx_type = INSIDE_WORD;
2341               tree_first = create_token_tree (dfa, NULL, NULL, token);
2342               token->opr.ctx_type = INSIDE_NOTWORD;
2343             }
2344           tree_last = create_token_tree (dfa, NULL, NULL, token);
2345           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2346           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2347             {
2348               *err = REG_ESPACE;
2349               return NULL;
2350             }
2351         }
2352       else
2353         {
2354           tree = create_token_tree (dfa, NULL, NULL, token);
2355           if (BE (tree == NULL, 0))
2356             {
2357               *err = REG_ESPACE;
2358               return NULL;
2359             }
2360         }
2361       /* We must return here, since ANCHORs can't be followed
2362          by repetition operators.
2363          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2364              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2365       fetch_token (token, regexp, syntax);
2366       return tree;
2367     case OP_PERIOD:
2368       tree = create_token_tree (dfa, NULL, NULL, token);
2369       if (BE (tree == NULL, 0))
2370         {
2371           *err = REG_ESPACE;
2372           return NULL;
2373         }
2374       if (dfa->mb_cur_max > 1)
2375         dfa->has_mb_node = 1;
2376       break;
2377     case OP_WORD:
2378     case OP_NOTWORD:
2379       tree = build_charclass_op (dfa, regexp->trans,
2380                                  (const unsigned char *) "alnum",
2381                                  (const unsigned char *) "_",
2382                                  token->type == OP_NOTWORD, err);
2383       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2384         return NULL;
2385       break;
2386     case OP_SPACE:
2387     case OP_NOTSPACE:
2388       tree = build_charclass_op (dfa, regexp->trans,
2389                                  (const unsigned char *) "space",
2390                                  (const unsigned char *) "",
2391                                  token->type == OP_NOTSPACE, err);
2392       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2393         return NULL;
2394       break;
2395     case OP_ALT:
2396     case END_OF_RE:
2397       return NULL;
2398     case BACK_SLASH:
2399       *err = REG_EESCAPE;
2400       return NULL;
2401     default:
2402       /* Must not happen?  */
2403 #ifdef DEBUG
2404       assert (0);
2405 #endif
2406       return NULL;
2407     }
2408   fetch_token (token, regexp, syntax);
2409
2410   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2411          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2412     {
2413       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2414       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2415         return NULL;
2416       /* In BRE consecutive duplications are not allowed.  */
2417       if ((syntax & RE_CONTEXT_INVALID_DUP)
2418           && (token->type == OP_DUP_ASTERISK
2419               || token->type == OP_OPEN_DUP_NUM))
2420         {
2421           *err = REG_BADRPT;
2422           return NULL;
2423         }
2424     }
2425
2426   return tree;
2427 }
2428
2429 /* This function build the following tree, from regular expression
2430    (<reg_exp>):
2431          SUBEXP
2432             |
2433         <reg_exp>
2434 */
2435
2436 static bin_tree_t *
2437 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2438                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2439 {
2440   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2441   bin_tree_t *tree;
2442   size_t cur_nsub;
2443   cur_nsub = preg->re_nsub++;
2444
2445   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2446
2447   /* The subexpression may be a null string.  */
2448   if (token->type == OP_CLOSE_SUBEXP)
2449     tree = NULL;
2450   else
2451     {
2452       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2453       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2454         {
2455           if (tree != NULL)
2456             postorder (tree, free_tree, NULL);
2457           *err = REG_EPAREN;
2458         }
2459       if (BE (*err != REG_NOERROR, 0))
2460         return NULL;
2461     }
2462
2463   if (cur_nsub <= '9' - '1')
2464     dfa->completed_bkref_map |= 1 << cur_nsub;
2465
2466   tree = create_tree (dfa, tree, NULL, SUBEXP);
2467   if (BE (tree == NULL, 0))
2468     {
2469       *err = REG_ESPACE;
2470       return NULL;
2471     }
2472   tree->token.opr.idx = cur_nsub;
2473   return tree;
2474 }
2475
2476 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2477
2478 static bin_tree_t *
2479 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2480               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2481 {
2482   bin_tree_t *tree = NULL, *old_tree = NULL;
2483   int i, start, end, start_idx = re_string_cur_idx (regexp);
2484   re_token_t start_token = *token;
2485
2486   if (token->type == OP_OPEN_DUP_NUM)
2487     {
2488       end = 0;
2489       start = fetch_number (regexp, token, syntax);
2490       if (start == -1)
2491         {
2492           if (token->type == CHARACTER && token->opr.c == ',')
2493             start = 0; /* We treat "{,m}" as "{0,m}".  */
2494           else
2495             {
2496               *err = REG_BADBR; /* <re>{} is invalid.  */
2497               return NULL;
2498             }
2499         }
2500       if (BE (start != -2, 1))
2501         {
2502           /* We treat "{n}" as "{n,n}".  */
2503           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2504                  : ((token->type == CHARACTER && token->opr.c == ',')
2505                     ? fetch_number (regexp, token, syntax) : -2));
2506         }
2507       if (BE (start == -2 || end == -2, 0))
2508         {
2509           /* Invalid sequence.  */
2510           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2511             {
2512               if (token->type == END_OF_RE)
2513                 *err = REG_EBRACE;
2514               else
2515                 *err = REG_BADBR;
2516
2517               return NULL;
2518             }
2519
2520           /* If the syntax bit is set, rollback.  */
2521           re_string_set_index (regexp, start_idx);
2522           *token = start_token;
2523           token->type = CHARACTER;
2524           /* mb_partial and word_char bits should be already initialized by
2525              peek_token.  */
2526           return elem;
2527         }
2528
2529       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2530         {
2531           /* First number greater than second.  */
2532           *err = REG_BADBR;
2533           return NULL;
2534         }
2535     }
2536   else
2537     {
2538       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2539       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2540     }
2541
2542   fetch_token (token, regexp, syntax);
2543
2544   if (BE (elem == NULL, 0))
2545     return NULL;
2546   if (BE (start == 0 && end == 0, 0))
2547     {
2548       postorder (elem, free_tree, NULL);
2549       return NULL;
2550     }
2551
2552   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2553   if (BE (start > 0, 0))
2554     {
2555       tree = elem;
2556       for (i = 2; i <= start; ++i)
2557         {
2558           elem = duplicate_tree (elem, dfa);
2559           tree = create_tree (dfa, tree, elem, CONCAT);
2560           if (BE (elem == NULL || tree == NULL, 0))
2561             goto parse_dup_op_espace;
2562         }
2563
2564       if (start == end)
2565         return tree;
2566
2567       /* Duplicate ELEM before it is marked optional.  */
2568       elem = duplicate_tree (elem, dfa);
2569       old_tree = tree;
2570     }
2571   else
2572     old_tree = NULL;
2573
2574   if (elem->token.type == SUBEXP)
2575     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2576
2577   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2578   if (BE (tree == NULL, 0))
2579     goto parse_dup_op_espace;
2580
2581   /* This loop is actually executed only when end != -1,
2582      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2583      already created the start+1-th copy.  */
2584   for (i = start + 2; i <= end; ++i)
2585     {
2586       elem = duplicate_tree (elem, dfa);
2587       tree = create_tree (dfa, tree, elem, CONCAT);
2588       if (BE (elem == NULL || tree == NULL, 0))
2589         goto parse_dup_op_espace;
2590
2591       tree = create_tree (dfa, tree, NULL, OP_ALT);
2592       if (BE (tree == NULL, 0))
2593         goto parse_dup_op_espace;
2594     }
2595
2596   if (old_tree)
2597     tree = create_tree (dfa, old_tree, tree, CONCAT);
2598
2599   return tree;
2600
2601  parse_dup_op_espace:
2602   *err = REG_ESPACE;
2603   return NULL;
2604 }
2605
2606 /* Size of the names for collating symbol/equivalence_class/character_class.
2607    I'm not sure, but maybe enough.  */
2608 #define BRACKET_NAME_BUF_SIZE 32
2609
2610 #ifndef _LIBC
2611   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2612      Build the range expression which starts from START_ELEM, and ends
2613      at END_ELEM.  The result are written to MBCSET and SBCSET.
2614      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2615      mbcset->range_ends, is a pointer argument sinse we may
2616      update it.  */
2617
2618 static reg_errcode_t
2619 internal_function
2620 # ifdef RE_ENABLE_I18N
2621 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2622                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2623 # else /* not RE_ENABLE_I18N */
2624 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2625                  bracket_elem_t *end_elem)
2626 # endif /* not RE_ENABLE_I18N */
2627 {
2628   unsigned int start_ch, end_ch;
2629   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2630   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2631           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2632           0))
2633     return REG_ERANGE;
2634
2635   /* We can handle no multi character collating elements without libc
2636      support.  */
2637   if (BE ((start_elem->type == COLL_SYM
2638            && strlen ((char *) start_elem->opr.name) > 1)
2639           || (end_elem->type == COLL_SYM
2640               && strlen ((char *) end_elem->opr.name) > 1), 0))
2641     return REG_ECOLLATE;
2642
2643 # ifdef RE_ENABLE_I18N
2644   {
2645     wchar_t wc;
2646     wint_t start_wc;
2647     wint_t end_wc;
2648     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2649
2650     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2651                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2652                    : 0));
2653     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2654               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2655                  : 0));
2656     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2657                 ? __btowc (start_ch) : start_elem->opr.wch);
2658     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2659               ? __btowc (end_ch) : end_elem->opr.wch);
2660     if (start_wc == WEOF || end_wc == WEOF)
2661       return REG_ECOLLATE;
2662     cmp_buf[0] = start_wc;
2663     cmp_buf[4] = end_wc;
2664     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2665       return REG_ERANGE;
2666
2667     /* Got valid collation sequence values, add them as a new entry.
2668        However, for !_LIBC we have no collation elements: if the
2669        character set is single byte, the single byte character set
2670        that we build below suffices.  parse_bracket_exp passes
2671        no MBCSET if dfa->mb_cur_max == 1.  */
2672     if (mbcset)
2673       {
2674         /* Check the space of the arrays.  */
2675         if (BE (*range_alloc == mbcset->nranges, 0))
2676           {
2677             /* There is not enough space, need realloc.  */
2678             wchar_t *new_array_start, *new_array_end;
2679             int new_nranges;
2680
2681             /* +1 in case of mbcset->nranges is 0.  */
2682             new_nranges = 2 * mbcset->nranges + 1;
2683             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2684                are NULL if *range_alloc == 0.  */
2685             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2686                                           new_nranges);
2687             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2688                                         new_nranges);
2689
2690             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2691               return REG_ESPACE;
2692
2693             mbcset->range_starts = new_array_start;
2694             mbcset->range_ends = new_array_end;
2695             *range_alloc = new_nranges;
2696           }
2697
2698         mbcset->range_starts[mbcset->nranges] = start_wc;
2699         mbcset->range_ends[mbcset->nranges++] = end_wc;
2700       }
2701
2702     /* Build the table for single byte characters.  */
2703     for (wc = 0; wc < SBC_MAX; ++wc)
2704       {
2705         cmp_buf[2] = wc;
2706         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2707             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2708           bitset_set (sbcset, wc);
2709       }
2710   }
2711 # else /* not RE_ENABLE_I18N */
2712   {
2713     unsigned int ch;
2714     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2715                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2716                    : 0));
2717     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2718               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2719                  : 0));
2720     if (start_ch > end_ch)
2721       return REG_ERANGE;
2722     /* Build the table for single byte characters.  */
2723     for (ch = 0; ch < SBC_MAX; ++ch)
2724       if (start_ch <= ch  && ch <= end_ch)
2725         bitset_set (sbcset, ch);
2726   }
2727 # endif /* not RE_ENABLE_I18N */
2728   return REG_NOERROR;
2729 }
2730 #endif /* not _LIBC */
2731
2732 #ifndef _LIBC
2733 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2734    Build the collating element which is represented by NAME.
2735    The result are written to MBCSET and SBCSET.
2736    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2737    pointer argument since we may update it.  */
2738
2739 static reg_errcode_t
2740 internal_function
2741 # ifdef RE_ENABLE_I18N
2742 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2743                         int *coll_sym_alloc, const unsigned char *name)
2744 # else /* not RE_ENABLE_I18N */
2745 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2746 # endif /* not RE_ENABLE_I18N */
2747 {
2748   size_t name_len = strlen ((const char *) name);
2749   if (BE (name_len != 1, 0))
2750     return REG_ECOLLATE;
2751   else
2752     {
2753       bitset_set (sbcset, name[0]);
2754       return REG_NOERROR;
2755     }
2756 }
2757 #endif /* not _LIBC */
2758
2759 /* This function parse bracket expression like "[abc]", "[a-c]",
2760    "[[.a-a.]]" etc.  */
2761
2762 static bin_tree_t *
2763 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2764                    reg_syntax_t syntax, reg_errcode_t *err)
2765 {
2766 #ifdef _LIBC
2767   const unsigned char *collseqmb;
2768   const char *collseqwc;
2769   uint32_t nrules;
2770   int32_t table_size;
2771   const int32_t *symb_table;
2772   const unsigned char *extra;
2773
2774   /* Local function for parse_bracket_exp used in _LIBC environement.
2775      Seek the collating symbol entry correspondings to NAME.
2776      Return the index of the symbol in the SYMB_TABLE.  */
2777
2778   auto inline int32_t
2779   __attribute ((always_inline))
2780   seek_collating_symbol_entry (name, name_len)
2781          const unsigned char *name;
2782          size_t name_len;
2783     {
2784       int32_t hash = elem_hash ((const char *) name, name_len);
2785       int32_t elem = hash % table_size;
2786       if (symb_table[2 * elem] != 0)
2787         {
2788           int32_t second = hash % (table_size - 2) + 1;
2789
2790           do
2791             {
2792               /* First compare the hashing value.  */
2793               if (symb_table[2 * elem] == hash
2794                   /* Compare the length of the name.  */
2795                   && name_len == extra[symb_table[2 * elem + 1]]
2796                   /* Compare the name.  */
2797                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2798                              name_len) == 0)
2799                 {
2800                   /* Yep, this is the entry.  */
2801                   break;
2802                 }
2803
2804               /* Next entry.  */
2805               elem += second;
2806             }
2807           while (symb_table[2 * elem] != 0);
2808         }
2809       return elem;
2810     }
2811
2812   /* Local function for parse_bracket_exp used in _LIBC environment.
2813      Look up the collation sequence value of BR_ELEM.
2814      Return the value if succeeded, UINT_MAX otherwise.  */
2815
2816   auto inline unsigned int
2817   __attribute ((always_inline))
2818   lookup_collation_sequence_value (br_elem)
2819          bracket_elem_t *br_elem;
2820     {
2821       if (br_elem->type == SB_CHAR)
2822         {
2823           /*
2824           if (MB_CUR_MAX == 1)
2825           */
2826           if (nrules == 0)
2827             return collseqmb[br_elem->opr.ch];
2828           else
2829             {
2830               wint_t wc = __btowc (br_elem->opr.ch);
2831               return __collseq_table_lookup (collseqwc, wc);
2832             }
2833         }
2834       else if (br_elem->type == MB_CHAR)
2835         {
2836           if (nrules != 0)
2837             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2838         }
2839       else if (br_elem->type == COLL_SYM)
2840         {
2841           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2842           if (nrules != 0)
2843             {
2844               int32_t elem, idx;
2845               elem = seek_collating_symbol_entry (br_elem->opr.name,
2846                                                   sym_name_len);
2847               if (symb_table[2 * elem] != 0)
2848                 {
2849                   /* We found the entry.  */
2850                   idx = symb_table[2 * elem + 1];
2851                   /* Skip the name of collating element name.  */
2852                   idx += 1 + extra[idx];
2853                   /* Skip the byte sequence of the collating element.  */
2854                   idx += 1 + extra[idx];
2855                   /* Adjust for the alignment.  */
2856                   idx = (idx + 3) & ~3;
2857                   /* Skip the multibyte collation sequence value.  */
2858                   idx += sizeof (unsigned int);
2859                   /* Skip the wide char sequence of the collating element.  */
2860                   idx += sizeof (unsigned int) *
2861                     (1 + *(unsigned int *) (extra + idx));
2862                   /* Return the collation sequence value.  */
2863                   return *(unsigned int *) (extra + idx);
2864                 }
2865               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2866                 {
2867                   /* No valid character.  Match it as a single byte
2868                      character.  */
2869                   return collseqmb[br_elem->opr.name[0]];
2870                 }
2871             }
2872           else if (sym_name_len == 1)
2873             return collseqmb[br_elem->opr.name[0]];
2874         }
2875       return UINT_MAX;
2876     }
2877
2878   /* Local function for parse_bracket_exp used in _LIBC environement.
2879      Build the range expression which starts from START_ELEM, and ends
2880      at END_ELEM.  The result are written to MBCSET and SBCSET.
2881      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2882      mbcset->range_ends, is a pointer argument sinse we may
2883      update it.  */
2884
2885   auto inline reg_errcode_t
2886   __attribute ((always_inline))
2887   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2888          re_charset_t *mbcset;
2889          int *range_alloc;
2890          bitset_t sbcset;
2891          bracket_elem_t *start_elem, *end_elem;
2892     {
2893       unsigned int ch;
2894       uint32_t start_collseq;
2895       uint32_t end_collseq;
2896
2897       /* Equivalence Classes and Character Classes can't be a range
2898          start/end.  */
2899       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2900               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2901               0))
2902         return REG_ERANGE;
2903
2904       start_collseq = lookup_collation_sequence_value (start_elem);
2905       end_collseq = lookup_collation_sequence_value (end_elem);
2906       /* Check start/end collation sequence values.  */
2907       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2908         return REG_ECOLLATE;
2909       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2910         return REG_ERANGE;
2911
2912       /* Got valid collation sequence values, add them as a new entry.
2913          However, if we have no collation elements, and the character set
2914          is single byte, the single byte character set that we
2915          build below suffices. */
2916       if (nrules > 0 || dfa->mb_cur_max > 1)
2917         {
2918           /* Check the space of the arrays.  */
2919           if (BE (*range_alloc == mbcset->nranges, 0))
2920             {
2921               /* There is not enough space, need realloc.  */
2922               uint32_t *new_array_start;
2923               uint32_t *new_array_end;
2924               int new_nranges;
2925
2926               /* +1 in case of mbcset->nranges is 0.  */
2927               new_nranges = 2 * mbcset->nranges + 1;
2928               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2929                                             new_nranges);
2930               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2931                                           new_nranges);
2932
2933               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2934                 return REG_ESPACE;
2935
2936               mbcset->range_starts = new_array_start;
2937               mbcset->range_ends = new_array_end;
2938               *range_alloc = new_nranges;
2939             }
2940
2941           mbcset->range_starts[mbcset->nranges] = start_collseq;
2942           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2943         }
2944
2945       /* Build the table for single byte characters.  */
2946       for (ch = 0; ch < SBC_MAX; ch++)
2947         {
2948           uint32_t ch_collseq;
2949           /*
2950           if (MB_CUR_MAX == 1)
2951           */
2952           if (nrules == 0)
2953             ch_collseq = collseqmb[ch];
2954           else
2955             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2956           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2957             bitset_set (sbcset, ch);
2958         }
2959       return REG_NOERROR;
2960     }
2961
2962   /* Local function for parse_bracket_exp used in _LIBC environement.
2963      Build the collating element which is represented by NAME.
2964      The result are written to MBCSET and SBCSET.
2965      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2966      pointer argument sinse we may update it.  */
2967
2968   auto inline reg_errcode_t
2969   __attribute ((always_inline))
2970   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2971          re_charset_t *mbcset;
2972          int *coll_sym_alloc;
2973          bitset_t sbcset;
2974          const unsigned char *name;
2975     {
2976       int32_t elem, idx;
2977       size_t name_len = strlen ((const char *) name);
2978       if (nrules != 0)
2979         {
2980           elem = seek_collating_symbol_entry (name, name_len);
2981           if (symb_table[2 * elem] != 0)
2982             {
2983               /* We found the entry.  */
2984               idx = symb_table[2 * elem + 1];
2985               /* Skip the name of collating element name.  */
2986               idx += 1 + extra[idx];
2987             }
2988           else if (symb_table[2 * elem] == 0 && name_len == 1)
2989             {
2990               /* No valid character, treat it as a normal
2991                  character.  */
2992               bitset_set (sbcset, name[0]);
2993               return REG_NOERROR;
2994             }
2995           else
2996             return REG_ECOLLATE;
2997
2998           /* Got valid collation sequence, add it as a new entry.  */
2999           /* Check the space of the arrays.  */
3000           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3001             {
3002               /* Not enough, realloc it.  */
3003               /* +1 in case of mbcset->ncoll_syms is 0.  */
3004               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3005               /* Use realloc since mbcset->coll_syms is NULL
3006                  if *alloc == 0.  */
3007               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3008                                                    new_coll_sym_alloc);
3009               if (BE (new_coll_syms == NULL, 0))
3010                 return REG_ESPACE;
3011               mbcset->coll_syms = new_coll_syms;
3012               *coll_sym_alloc = new_coll_sym_alloc;
3013             }
3014           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3015           return REG_NOERROR;
3016         }
3017       else
3018         {
3019           if (BE (name_len != 1, 0))
3020             return REG_ECOLLATE;
3021           else
3022             {
3023               bitset_set (sbcset, name[0]);
3024               return REG_NOERROR;
3025             }
3026         }
3027     }
3028 #endif
3029
3030   re_token_t br_token;
3031   re_bitset_ptr_t sbcset;
3032 #ifdef RE_ENABLE_I18N
3033   re_charset_t *mbcset;
3034   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3035   int equiv_class_alloc = 0, char_class_alloc = 0;
3036 #endif /* not RE_ENABLE_I18N */
3037   int non_match = 0;
3038   bin_tree_t *work_tree;
3039   int token_len;
3040   int first_round = 1;
3041 #ifdef _LIBC
3042   collseqmb = (const unsigned char *)
3043     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3044   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3045   if (nrules)
3046     {
3047       /*
3048       if (MB_CUR_MAX > 1)
3049       */
3050       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3051       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3052       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3053                                                   _NL_COLLATE_SYMB_TABLEMB);
3054       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055                                                    _NL_COLLATE_SYMB_EXTRAMB);
3056     }
3057 #endif
3058   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059 #ifdef RE_ENABLE_I18N
3060   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3061 #endif /* RE_ENABLE_I18N */
3062 #ifdef RE_ENABLE_I18N
3063   if (BE (sbcset == NULL || mbcset == NULL, 0))
3064 #else
3065   if (BE (sbcset == NULL, 0))
3066 #endif /* RE_ENABLE_I18N */
3067     {
3068       re_free (sbcset);
3069 #ifdef RE_ENABLE_I18N
3070       re_free (mbcset);
3071 #endif
3072       *err = REG_ESPACE;
3073       return NULL;
3074     }
3075
3076   token_len = peek_token_bracket (token, regexp, syntax);
3077   if (BE (token->type == END_OF_RE, 0))
3078     {
3079       *err = REG_BADPAT;
3080       goto parse_bracket_exp_free_return;
3081     }
3082   if (token->type == OP_NON_MATCH_LIST)
3083     {
3084 #ifdef RE_ENABLE_I18N
3085       mbcset->non_match = 1;
3086 #endif /* not RE_ENABLE_I18N */
3087       non_match = 1;
3088       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3089         bitset_set (sbcset, '\n');
3090       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3091       token_len = peek_token_bracket (token, regexp, syntax);
3092       if (BE (token->type == END_OF_RE, 0))
3093         {
3094           *err = REG_BADPAT;
3095           goto parse_bracket_exp_free_return;
3096         }
3097     }
3098
3099   /* We treat the first ']' as a normal character.  */
3100   if (token->type == OP_CLOSE_BRACKET)
3101     token->type = CHARACTER;
3102
3103   while (1)
3104     {
3105       bracket_elem_t start_elem, end_elem;
3106       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3107       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3108       reg_errcode_t ret;
3109       int token_len2 = 0, is_range_exp = 0;
3110       re_token_t token2;
3111
3112       start_elem.opr.name = start_name_buf;
3113       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3114                                    syntax, first_round);
3115       if (BE (ret != REG_NOERROR, 0))
3116         {
3117           *err = ret;
3118           goto parse_bracket_exp_free_return;
3119         }
3120       first_round = 0;
3121
3122       /* Get information about the next token.  We need it in any case.  */
3123       token_len = peek_token_bracket (token, regexp, syntax);
3124
3125       /* Do not check for ranges if we know they are not allowed.  */
3126       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3127         {
3128           if (BE (token->type == END_OF_RE, 0))
3129             {
3130               *err = REG_EBRACK;
3131               goto parse_bracket_exp_free_return;
3132             }
3133           if (token->type == OP_CHARSET_RANGE)
3134             {
3135               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3136               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3137               if (BE (token2.type == END_OF_RE, 0))
3138                 {
3139                   *err = REG_EBRACK;
3140                   goto parse_bracket_exp_free_return;
3141                 }
3142               if (token2.type == OP_CLOSE_BRACKET)
3143                 {
3144                   /* We treat the last '-' as a normal character.  */
3145                   re_string_skip_bytes (regexp, -token_len);
3146                   token->type = CHARACTER;
3147                 }
3148               else
3149                 is_range_exp = 1;
3150             }
3151         }
3152
3153       if (is_range_exp == 1)
3154         {
3155           end_elem.opr.name = end_name_buf;
3156           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3157                                        dfa, syntax, 1);
3158           if (BE (ret != REG_NOERROR, 0))
3159             {
3160               *err = ret;
3161               goto parse_bracket_exp_free_return;
3162             }
3163
3164           token_len = peek_token_bracket (token, regexp, syntax);
3165
3166 #ifdef _LIBC
3167           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3168                                   &start_elem, &end_elem);
3169 #else
3170 # ifdef RE_ENABLE_I18N
3171           *err = build_range_exp (sbcset,
3172                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3173                                   &range_alloc, &start_elem, &end_elem);
3174 # else
3175           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3176 # endif
3177 #endif /* RE_ENABLE_I18N */
3178           if (BE (*err != REG_NOERROR, 0))
3179             goto parse_bracket_exp_free_return;
3180         }
3181       else
3182         {
3183           switch (start_elem.type)
3184             {
3185             case SB_CHAR:
3186               bitset_set (sbcset, start_elem.opr.ch);
3187               break;
3188 #ifdef RE_ENABLE_I18N
3189             case MB_CHAR:
3190               /* Check whether the array has enough space.  */
3191               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3192                 {
3193                   wchar_t *new_mbchars;
3194                   /* Not enough, realloc it.  */
3195                   /* +1 in case of mbcset->nmbchars is 0.  */
3196                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3197                   /* Use realloc since array is NULL if *alloc == 0.  */
3198                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3199                                             mbchar_alloc);
3200                   if (BE (new_mbchars == NULL, 0))
3201                     goto parse_bracket_exp_espace;
3202                   mbcset->mbchars = new_mbchars;
3203                 }
3204               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3205               break;
3206 #endif /* RE_ENABLE_I18N */
3207             case EQUIV_CLASS:
3208               *err = build_equiv_class (sbcset,
3209 #ifdef RE_ENABLE_I18N
3210                                         mbcset, &equiv_class_alloc,
3211 #endif /* RE_ENABLE_I18N */
3212                                         start_elem.opr.name);
3213               if (BE (*err != REG_NOERROR, 0))
3214                 goto parse_bracket_exp_free_return;
3215               break;
3216             case COLL_SYM:
3217               *err = build_collating_symbol (sbcset,
3218 #ifdef RE_ENABLE_I18N
3219                                              mbcset, &coll_sym_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221                                              start_elem.opr.name);
3222               if (BE (*err != REG_NOERROR, 0))
3223                 goto parse_bracket_exp_free_return;
3224               break;
3225             case CHAR_CLASS:
3226               *err = build_charclass (regexp->trans, sbcset,
3227 #ifdef RE_ENABLE_I18N
3228                                       mbcset, &char_class_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230                                       start_elem.opr.name, syntax);
3231               if (BE (*err != REG_NOERROR, 0))
3232                goto parse_bracket_exp_free_return;
3233               break;
3234             default:
3235               assert (0);
3236               break;
3237             }
3238         }
3239       if (BE (token->type == END_OF_RE, 0))
3240         {
3241           *err = REG_EBRACK;
3242           goto parse_bracket_exp_free_return;
3243         }
3244       if (token->type == OP_CLOSE_BRACKET)
3245         break;
3246     }
3247
3248   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3249
3250   /* If it is non-matching list.  */
3251   if (non_match)
3252     bitset_not (sbcset);
3253
3254 #ifdef RE_ENABLE_I18N
3255   /* Ensure only single byte characters are set.  */
3256   if (dfa->mb_cur_max > 1)
3257     bitset_mask (sbcset, dfa->sb_char);
3258
3259   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3260       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3261                                                      || mbcset->non_match)))
3262     {
3263       bin_tree_t *mbc_tree;
3264       int sbc_idx;
3265       /* Build a tree for complex bracket.  */
3266       dfa->has_mb_node = 1;
3267       br_token.type = COMPLEX_BRACKET;
3268       br_token.opr.mbcset = mbcset;
3269       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3270       if (BE (mbc_tree == NULL, 0))
3271         goto parse_bracket_exp_espace;
3272       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3273         if (sbcset[sbc_idx])
3274           break;
3275       /* If there are no bits set in sbcset, there is no point
3276          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3277       if (sbc_idx < BITSET_WORDS)
3278         {
3279           /* Build a tree for simple bracket.  */
3280           br_token.type = SIMPLE_BRACKET;
3281           br_token.opr.sbcset = sbcset;
3282           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3283           if (BE (work_tree == NULL, 0))
3284             goto parse_bracket_exp_espace;
3285
3286           /* Then join them by ALT node.  */
3287           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3288           if (BE (work_tree == NULL, 0))
3289             goto parse_bracket_exp_espace;
3290         }
3291       else
3292         {
3293           re_free (sbcset);
3294           work_tree = mbc_tree;
3295         }
3296     }
3297   else
3298 #endif /* not RE_ENABLE_I18N */
3299     {
3300 #ifdef RE_ENABLE_I18N
3301       free_charset (mbcset);
3302 #endif
3303       /* Build a tree for simple bracket.  */
3304       br_token.type = SIMPLE_BRACKET;
3305       br_token.opr.sbcset = sbcset;
3306       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3307       if (BE (work_tree == NULL, 0))
3308         goto parse_bracket_exp_espace;
3309     }
3310   return work_tree;
3311
3312  parse_bracket_exp_espace:
3313   *err = REG_ESPACE;
3314  parse_bracket_exp_free_return:
3315   re_free (sbcset);
3316 #ifdef RE_ENABLE_I18N
3317   free_charset (mbcset);
3318 #endif /* RE_ENABLE_I18N */
3319   return NULL;
3320 }
3321
3322 /* Parse an element in the bracket expression.  */
3323
3324 static reg_errcode_t
3325 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3326                        re_token_t *token, int token_len, re_dfa_t *dfa,
3327                        reg_syntax_t syntax, int accept_hyphen)
3328 {
3329 #ifdef RE_ENABLE_I18N
3330   int cur_char_size;
3331   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3332   if (cur_char_size > 1)
3333     {
3334       elem->type = MB_CHAR;
3335       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3336       re_string_skip_bytes (regexp, cur_char_size);
3337       return REG_NOERROR;
3338     }
3339 #endif /* RE_ENABLE_I18N */
3340   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3341   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3342       || token->type == OP_OPEN_EQUIV_CLASS)
3343     return parse_bracket_symbol (elem, regexp, token);
3344   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3345     {
3346       /* A '-' must only appear as anything but a range indicator before
3347          the closing bracket.  Everything else is an error.  */
3348       re_token_t token2;
3349       (void) peek_token_bracket (&token2, regexp, syntax);
3350       if (token2.type != OP_CLOSE_BRACKET)
3351         /* The actual error value is not standardized since this whole
3352            case is undefined.  But ERANGE makes good sense.  */
3353         return REG_ERANGE;
3354     }
3355   elem->type = SB_CHAR;
3356   elem->opr.ch = token->opr.c;
3357   return REG_NOERROR;
3358 }
3359
3360 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3361    such as [:<character_class>:], [.<collating_element>.], and
3362    [=<equivalent_class>=].  */
3363
3364 static reg_errcode_t
3365 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3366                       re_token_t *token)
3367 {
3368   unsigned char ch, delim = token->opr.c;
3369   int i = 0;
3370   if (re_string_eoi(regexp))
3371     return REG_EBRACK;
3372   for (;; ++i)
3373     {
3374       if (i >= BRACKET_NAME_BUF_SIZE)
3375         return REG_EBRACK;
3376       if (token->type == OP_OPEN_CHAR_CLASS)
3377         ch = re_string_fetch_byte_case (regexp);
3378       else
3379         ch = re_string_fetch_byte (regexp);
3380       if (re_string_eoi(regexp))
3381         return REG_EBRACK;
3382       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3383         break;
3384       elem->opr.name[i] = ch;
3385     }
3386   re_string_skip_bytes (regexp, 1);
3387   elem->opr.name[i] = '\0';
3388   switch (token->type)
3389     {
3390     case OP_OPEN_COLL_ELEM:
3391       elem->type = COLL_SYM;
3392       break;
3393     case OP_OPEN_EQUIV_CLASS:
3394       elem->type = EQUIV_CLASS;
3395       break;
3396     case OP_OPEN_CHAR_CLASS:
3397       elem->type = CHAR_CLASS;
3398       break;
3399     default:
3400       break;
3401     }
3402   return REG_NOERROR;
3403 }
3404
3405   /* Helper function for parse_bracket_exp.
3406      Build the equivalence class which is represented by NAME.
3407      The result are written to MBCSET and SBCSET.
3408      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3409      is a pointer argument sinse we may update it.  */
3410
3411 static reg_errcode_t
3412 #ifdef RE_ENABLE_I18N
3413 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3414                    int *equiv_class_alloc, const unsigned char *name)
3415 #else /* not RE_ENABLE_I18N */
3416 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3417 #endif /* not RE_ENABLE_I18N */
3418 {
3419 #ifdef _LIBC
3420   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3421   if (nrules != 0)
3422     {
3423       const int32_t *table, *indirect;
3424       const unsigned char *weights, *extra, *cp;
3425       unsigned char char_buf[2];
3426       int32_t idx1, idx2;
3427       unsigned int ch;
3428       size_t len;
3429       /* This #include defines a local function!  */
3430 # include <locale/weight.h>
3431       /* Calculate the index for equivalence class.  */
3432       cp = name;
3433       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3434       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3435                                                _NL_COLLATE_WEIGHTMB);
3436       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3437                                                    _NL_COLLATE_EXTRAMB);
3438       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3439                                                 _NL_COLLATE_INDIRECTMB);
3440       idx1 = findidx (&cp, -1);
3441       if (BE (idx1 == 0 || *cp != '\0', 0))
3442         /* This isn't a valid character.  */
3443         return REG_ECOLLATE;
3444
3445       /* Build single byte matcing table for this equivalence class.  */
3446       len = weights[idx1 & 0xffffff];
3447       for (ch = 0; ch < SBC_MAX; ++ch)
3448         {
3449           char_buf[0] = ch;
3450           cp = char_buf;
3451           idx2 = findidx (&cp, 1);
3452 /*
3453           idx2 = table[ch];
3454 */
3455           if (idx2 == 0)
3456             /* This isn't a valid character.  */
3457             continue;
3458           /* Compare only if the length matches and the collation rule
3459              index is the same.  */
3460           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3461             {
3462               int cnt = 0;
3463
3464               while (cnt <= len &&
3465                      weights[(idx1 & 0xffffff) + 1 + cnt]
3466                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3467                 ++cnt;
3468
3469               if (cnt > len)
3470                 bitset_set (sbcset, ch);
3471             }
3472         }
3473       /* Check whether the array has enough space.  */
3474       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3475         {
3476           /* Not enough, realloc it.  */
3477           /* +1 in case of mbcset->nequiv_classes is 0.  */
3478           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3479           /* Use realloc since the array is NULL if *alloc == 0.  */
3480           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3481                                                    int32_t,
3482                                                    new_equiv_class_alloc);
3483           if (BE (new_equiv_classes == NULL, 0))
3484             return REG_ESPACE;
3485           mbcset->equiv_classes = new_equiv_classes;
3486           *equiv_class_alloc = new_equiv_class_alloc;
3487         }
3488       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3489     }
3490   else
3491 #endif /* _LIBC */
3492     {
3493       if (BE (strlen ((const char *) name) != 1, 0))
3494         return REG_ECOLLATE;
3495       bitset_set (sbcset, *name);
3496     }
3497   return REG_NOERROR;
3498 }
3499
3500   /* Helper function for parse_bracket_exp.
3501      Build the character class which is represented by NAME.
3502      The result are written to MBCSET and SBCSET.
3503      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3504      is a pointer argument sinse we may update it.  */
3505
3506 static reg_errcode_t
3507 #ifdef RE_ENABLE_I18N
3508 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3509                  re_charset_t *mbcset, int *char_class_alloc,
3510                  const unsigned char *class_name, reg_syntax_t syntax)
3511 #else /* not RE_ENABLE_I18N */
3512 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3513                  const unsigned char *class_name, reg_syntax_t syntax)
3514 #endif /* not RE_ENABLE_I18N */
3515 {
3516   int i;
3517   const char *name = (const char *) class_name;
3518
3519   /* In case of REG_ICASE "upper" and "lower" match the both of
3520      upper and lower cases.  */
3521   if ((syntax & RE_ICASE)
3522       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3523     name = "alpha";
3524
3525 #ifdef RE_ENABLE_I18N
3526   /* Check the space of the arrays.  */
3527   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3528     {
3529       /* Not enough, realloc it.  */
3530       /* +1 in case of mbcset->nchar_classes is 0.  */
3531       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3532       /* Use realloc since array is NULL if *alloc == 0.  */
3533       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3534                                                new_char_class_alloc);
3535       if (BE (new_char_classes == NULL, 0))
3536         return REG_ESPACE;
3537       mbcset->char_classes = new_char_classes;
3538       *char_class_alloc = new_char_class_alloc;
3539     }
3540   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3541 #endif /* RE_ENABLE_I18N */
3542
3543 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3544   do {                                          \
3545     if (BE (trans != NULL, 0))                  \
3546       {                                         \
3547         for (i = 0; i < SBC_MAX; ++i)           \
3548           if (ctype_func (i))                   \
3549             bitset_set (sbcset, trans[i]);      \
3550       }                                         \
3551     else                                        \
3552       {                                         \
3553         for (i = 0; i < SBC_MAX; ++i)           \
3554           if (ctype_func (i))                   \
3555             bitset_set (sbcset, i);             \
3556       }                                         \
3557   } while (0)
3558
3559   if (strcmp (name, "alnum") == 0)
3560     BUILD_CHARCLASS_LOOP (isalnum);
3561   else if (strcmp (name, "cntrl") == 0)
3562     BUILD_CHARCLASS_LOOP (iscntrl);
3563   else if (strcmp (name, "lower") == 0)
3564     BUILD_CHARCLASS_LOOP (islower);
3565   else if (strcmp (name, "space") == 0)
3566     BUILD_CHARCLASS_LOOP (isspace);
3567   else if (strcmp (name, "alpha") == 0)
3568     BUILD_CHARCLASS_LOOP (isalpha);
3569   else if (strcmp (name, "digit") == 0)
3570     BUILD_CHARCLASS_LOOP (isdigit);
3571   else if (strcmp (name, "print") == 0)
3572     BUILD_CHARCLASS_LOOP (isprint);
3573   else if (strcmp (name, "upper") == 0)
3574     BUILD_CHARCLASS_LOOP (isupper);
3575   else if (strcmp (name, "blank") == 0)
3576     BUILD_CHARCLASS_LOOP (isblank);
3577   else if (strcmp (name, "graph") == 0)
3578     BUILD_CHARCLASS_LOOP (isgraph);
3579   else if (strcmp (name, "punct") == 0)
3580     BUILD_CHARCLASS_LOOP (ispunct);
3581   else if (strcmp (name, "xdigit") == 0)
3582     BUILD_CHARCLASS_LOOP (isxdigit);
3583   else
3584     return REG_ECTYPE;
3585
3586   return REG_NOERROR;
3587 }
3588
3589 static bin_tree_t *
3590 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3591                     const unsigned char *class_name,
3592                     const unsigned char *extra, int non_match,
3593                     reg_errcode_t *err)
3594 {
3595   re_bitset_ptr_t sbcset;
3596 #ifdef RE_ENABLE_I18N
3597   re_charset_t *mbcset;
3598   int alloc = 0;
3599 #endif /* not RE_ENABLE_I18N */
3600   reg_errcode_t ret;
3601   re_token_t br_token;
3602   bin_tree_t *tree;
3603
3604   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3605 #ifdef RE_ENABLE_I18N
3606   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3607 #endif /* RE_ENABLE_I18N */
3608
3609 #ifdef RE_ENABLE_I18N
3610   if (BE (sbcset == NULL || mbcset == NULL, 0))
3611 #else /* not RE_ENABLE_I18N */
3612   if (BE (sbcset == NULL, 0))
3613 #endif /* not RE_ENABLE_I18N */
3614     {
3615       *err = REG_ESPACE;
3616       return NULL;
3617     }
3618
3619   if (non_match)
3620     {
3621 #ifdef RE_ENABLE_I18N
3622       mbcset->non_match = 1;
3623 #endif /* not RE_ENABLE_I18N */
3624     }
3625
3626   /* We don't care the syntax in this case.  */
3627   ret = build_charclass (trans, sbcset,
3628 #ifdef RE_ENABLE_I18N
3629                          mbcset, &alloc,
3630 #endif /* RE_ENABLE_I18N */
3631                          class_name, 0);
3632
3633   if (BE (ret != REG_NOERROR, 0))
3634     {
3635       re_free (sbcset);
3636 #ifdef RE_ENABLE_I18N
3637       free_charset (mbcset);
3638 #endif /* RE_ENABLE_I18N */
3639       *err = ret;
3640       return NULL;
3641     }
3642   /* \w match '_' also.  */
3643   for (; *extra; extra++)
3644     bitset_set (sbcset, *extra);
3645
3646   /* If it is non-matching list.  */
3647   if (non_match)
3648     bitset_not (sbcset);
3649
3650 #ifdef RE_ENABLE_I18N
3651   /* Ensure only single byte characters are set.  */
3652   if (dfa->mb_cur_max > 1)
3653     bitset_mask (sbcset, dfa->sb_char);
3654 #endif
3655
3656   /* Build a tree for simple bracket.  */
3657   br_token.type = SIMPLE_BRACKET;
3658   br_token.opr.sbcset = sbcset;
3659   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3660   if (BE (tree == NULL, 0))
3661     goto build_word_op_espace;
3662
3663 #ifdef RE_ENABLE_I18N
3664   if (dfa->mb_cur_max > 1)
3665     {
3666       bin_tree_t *mbc_tree;
3667       /* Build a tree for complex bracket.  */
3668       br_token.type = COMPLEX_BRACKET;
3669       br_token.opr.mbcset = mbcset;
3670       dfa->has_mb_node = 1;
3671       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3672       if (BE (mbc_tree == NULL, 0))
3673         goto build_word_op_espace;
3674       /* Then join them by ALT node.  */
3675       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3676       if (BE (mbc_tree != NULL, 1))
3677         return tree;
3678     }
3679   else
3680     {
3681       free_charset (mbcset);
3682       return tree;
3683     }
3684 #else /* not RE_ENABLE_I18N */
3685   return tree;
3686 #endif /* not RE_ENABLE_I18N */
3687
3688  build_word_op_espace:
3689   re_free (sbcset);
3690 #ifdef RE_ENABLE_I18N
3691   free_charset (mbcset);
3692 #endif /* RE_ENABLE_I18N */
3693   *err = REG_ESPACE;
3694   return NULL;
3695 }
3696
3697 /* This is intended for the expressions like "a{1,3}".
3698    Fetch a number from `input', and return the number.
3699    Return -1, if the number field is empty like "{,1}".
3700    Return -2, If an error is occured.  */
3701
3702 static int
3703 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3704 {
3705   int num = -1;
3706   unsigned char c;
3707   while (1)
3708     {
3709       fetch_token (token, input, syntax);
3710       c = token->opr.c;
3711       if (BE (token->type == END_OF_RE, 0))
3712         return -2;
3713       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3714         break;
3715       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3716              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3717       num = (num > RE_DUP_MAX) ? -2 : num;
3718     }
3719   return num;
3720 }
3721 \f
3722 #ifdef RE_ENABLE_I18N
3723 static void
3724 free_charset (re_charset_t *cset)
3725 {
3726   re_free (cset->mbchars);
3727 # ifdef _LIBC
3728   re_free (cset->coll_syms);
3729   re_free (cset->equiv_classes);
3730   re_free (cset->range_starts);
3731   re_free (cset->range_ends);
3732 # endif
3733   re_free (cset->char_classes);
3734   re_free (cset);
3735 }
3736 #endif /* RE_ENABLE_I18N */
3737 \f
3738 /* Functions for binary tree operation.  */
3739
3740 /* Create a tree node.  */
3741
3742 static bin_tree_t *
3743 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3744              re_token_type_t type)
3745 {
3746   re_token_t t;
3747   t.type = type;
3748   return create_token_tree (dfa, left, right, &t);
3749 }
3750
3751 static bin_tree_t *
3752 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3753                    const re_token_t *token)
3754 {
3755   bin_tree_t *tree;
3756   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3757     {
3758       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3759
3760       if (storage == NULL)
3761         return NULL;
3762       storage->next = dfa->str_tree_storage;
3763       dfa->str_tree_storage = storage;
3764       dfa->str_tree_storage_idx = 0;
3765     }
3766   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3767
3768   tree->parent = NULL;
3769   tree->left = left;
3770   tree->right = right;
3771   tree->token = *token;
3772   tree->token.duplicated = 0;
3773   tree->token.opt_subexp = 0;
3774   tree->first = NULL;
3775   tree->next = NULL;
3776   tree->node_idx = -1;
3777
3778   if (left != NULL)
3779     left->parent = tree;
3780   if (right != NULL)
3781     right->parent = tree;
3782   return tree;
3783 }
3784
3785 /* Mark the tree SRC as an optional subexpression.
3786    To be called from preorder or postorder.  */
3787
3788 static reg_errcode_t
3789 mark_opt_subexp (void *extra, bin_tree_t *node)
3790 {
3791   int idx = (int) (long) extra;
3792   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3793     node->token.opt_subexp = 1;
3794
3795   return REG_NOERROR;
3796 }
3797
3798 /* Free the allocated memory inside NODE. */
3799
3800 static void
3801 free_token (re_token_t *node)
3802 {
3803 #ifdef RE_ENABLE_I18N
3804   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3805     free_charset (node->opr.mbcset);
3806   else
3807 #endif /* RE_ENABLE_I18N */
3808     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3809       re_free (node->opr.sbcset);
3810 }
3811
3812 /* Worker function for tree walking.  Free the allocated memory inside NODE
3813    and its children. */
3814
3815 static reg_errcode_t
3816 free_tree (void *extra, bin_tree_t *node)
3817 {
3818   free_token (&node->token);
3819   return REG_NOERROR;
3820 }
3821
3822
3823 /* Duplicate the node SRC, and return new node.  This is a preorder
3824    visit similar to the one implemented by the generic visitor, but
3825    we need more infrastructure to maintain two parallel trees --- so,
3826    it's easier to duplicate.  */
3827
3828 static bin_tree_t *
3829 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3830 {
3831   const bin_tree_t *node;
3832   bin_tree_t *dup_root;
3833   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3834
3835   for (node = root; ; )
3836     {
3837       /* Create a new tree and link it back to the current parent.  */
3838       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3839       if (*p_new == NULL)
3840         return NULL;
3841       (*p_new)->parent = dup_node;
3842       (*p_new)->token.duplicated = 1;
3843       dup_node = *p_new;
3844
3845       /* Go to the left node, or up and to the right.  */
3846       if (node->left)
3847         {
3848           node = node->left;
3849           p_new = &dup_node->left;
3850         }
3851       else
3852         {
3853           const bin_tree_t *prev = NULL;
3854           while (node->right == prev || node->right == NULL)
3855             {
3856               prev = node;
3857               node = node->parent;
3858               dup_node = dup_node->parent;
3859               if (!node)
3860                 return dup_root;
3861             }
3862           node = node->right;
3863           p_new = &dup_node->right;
3864         }
3865     }
3866 }