posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  51                                              int top_clone_node, int root_node,
  52                                              unsigned int constraint);
  53 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  54 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  55                                    unsigned int constraint);
  56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  58                                          int node, int root);
  59 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  60 static int fetch_number (re_string_t *input, re_token_t *token,
  61                          reg_syntax_t syntax);
  62 static void fetch_token (re_token_t *result, re_string_t *input,
  63                          reg_syntax_t syntax);
  64 static int peek_token (re_token_t *token, re_string_t *input,
  65                         reg_syntax_t syntax);
  66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  67                                reg_syntax_t syntax);
  68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  69                           reg_syntax_t syntax, reg_errcode_t *err);
  70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  74                                  re_token_t *token, reg_syntax_t syntax,
  75                                  int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  77                                      re_token_t *token, reg_syntax_t syntax,
  78                                      int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  80                                   re_token_t *token, reg_syntax_t syntax,
  81                                   int nest, reg_errcode_t *err);
  82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  83                                  re_dfa_t *dfa, re_token_t *token,
  84                                  reg_syntax_t syntax, reg_errcode_t *err);
  85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  86                                       re_token_t *token, reg_syntax_t syntax,
  87                                       reg_errcode_t *err);
  88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  89                                             re_string_t *regexp,
  90                                             re_token_t *token, int token_len,
  91                                             re_dfa_t *dfa,
  92                                             reg_syntax_t syntax,
  93                                             int accept_hyphen);
  94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  95                                           re_string_t *regexp,
  96                                           re_token_t *token);
  97 #ifndef _LIBC
  98 # ifdef RE_ENABLE_I18N
  99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 100                                       re_charset_t *mbcset, int *range_alloc,
 101                                       bracket_elem_t *start_elem,
 102                                       bracket_elem_t *end_elem);
 103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 104                                              re_charset_t *mbcset,
 105                                              int *coll_sym_alloc,
 106                                              const unsigned char *name);
 107 # else /* not RE_ENABLE_I18N */
 108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 109                                       bracket_elem_t *start_elem,
 110                                       bracket_elem_t *end_elem);
 111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 112                                              const unsigned char *name);
 113 # endif /* not RE_ENABLE_I18N */
 114 #endif /* not _LIBC */
 115 #ifdef RE_ENABLE_I18N
 116 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 117                                         re_charset_t *mbcset,
 118                                         int *equiv_class_alloc,
 119                                         const unsigned char *name);
 120 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 121                                       re_bitset_ptr_t sbcset,
 122                                       re_charset_t *mbcset,
 123                                       int *char_class_alloc,
 124                                       const unsigned char *class_name,
 125                                       reg_syntax_t syntax);
 126 #else  /* not RE_ENABLE_I18N */
 127 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 128                                         const unsigned char *name);
 129 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 130                                       re_bitset_ptr_t sbcset,
 131                                       const unsigned char *class_name,
 132                                       reg_syntax_t syntax);
 133 #endif /* not RE_ENABLE_I18N */
 134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 135                                        RE_TRANSLATE_TYPE trans,
 136                                        const unsigned char *class_name,
 137                                        const unsigned char *extra,
 138                                        int non_match, reg_errcode_t *err);
 139 static bin_tree_t *create_tree (re_dfa_t *dfa,
 140                                 bin_tree_t *left, bin_tree_t *right,
 141                                 re_token_type_t type);
 142 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 143                                       bin_tree_t *left, bin_tree_t *right,
 144                                       const re_token_t *token);
 145 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 146 static void free_token (re_token_t *node);
 147 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 148 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 149 \f
 150 /* This table gives an error message for each of the error codes listed
 151    in regex.h.  Obviously the order here has to be same as there.
 152    POSIX doesn't require that we do anything for REG_NOERROR,
 153    but why not be nice?  */
 154
 155 const char __re_error_msgid[] attribute_hidden =
 156   {
 157 #define REG_NOERROR_IDX 0
 158     gettext_noop ("Success")    /* REG_NOERROR */
 159     "\0"
 160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 161     gettext_noop ("No match")   /* REG_NOMATCH */
 162     "\0"
 163 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 164     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 165     "\0"
 166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 167     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 168     "\0"
 169 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 170     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 171     "\0"
 172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 173     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 174     "\0"
 175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 176     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 177     "\0"
 178 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 179     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 180     "\0"
 181 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 182     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 183     "\0"
 184 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 185     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 186     "\0"
 187 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 188     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 189     "\0"
 190 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 191     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 192     "\0"
 193 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 194     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 195     "\0"
 196 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 197     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 198     "\0"
 199 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 200     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 201     "\0"
 202 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 203     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 204     "\0"
 205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 206     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 207   };
 208
 209 const size_t __re_error_msgid_idx[] attribute_hidden =
 210   {
 211     REG_NOERROR_IDX,
 212     REG_NOMATCH_IDX,
 213     REG_BADPAT_IDX,
 214     REG_ECOLLATE_IDX,
 215     REG_ECTYPE_IDX,
 216     REG_EESCAPE_IDX,
 217     REG_ESUBREG_IDX,
 218     REG_EBRACK_IDX,
 219     REG_EPAREN_IDX,
 220     REG_EBRACE_IDX,
 221     REG_BADBR_IDX,
 222     REG_ERANGE_IDX,
 223     REG_ESPACE_IDX,
 224     REG_BADRPT_IDX,
 225     REG_EEND_IDX,
 226     REG_ESIZE_IDX,
 227     REG_ERPAREN_IDX
 228   };
 229 \f
 230 /* Entry points for GNU code.  */
 231
 232 /* re_compile_pattern is the GNU regular expression compiler: it
 233    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 234    Returns 0 if the pattern was valid, otherwise an error string.
 235
 236    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 237    are set in BUFP on entry.  */
 238
 239 const char *
 240 re_compile_pattern (pattern, length, bufp)
 241     const char *pattern;
 242     size_t length;
 243     struct re_pattern_buffer *bufp;
 244 {
 245   reg_errcode_t ret;
 246
 247   /* And GNU code determines whether or not to get register information
 248      by passing null for the REGS argument to re_match, etc., not by
 249      setting no_sub, unless RE_NO_SUB is set.  */
 250   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 251
 252   /* Match anchors at newline.  */
 253   bufp->newline_anchor = 1;
 254
 255   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 256
 257   if (!ret)
 258     return NULL;
 259   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 260 }
 261 #ifdef _LIBC
 262 weak_alias (__re_compile_pattern, re_compile_pattern)
 263 #endif
 264
 265 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 266    also be assigned to arbitrarily: each pattern buffer stores its own
 267    syntax, so it can be changed between regex compilations.  */
 268 /* This has no initializer because initialized variables in Emacs
 269    become read-only after dumping.  */
 270 reg_syntax_t re_syntax_options;
 271
 272
 273 /* Specify the precise syntax of regexps for compilation.  This provides
 274    for compatibility for various utilities which historically have
 275    different, incompatible syntaxes.
 276
 277    The argument SYNTAX is a bit mask comprised of the various bits
 278    defined in regex.h.  We return the old syntax.  */
 279
 280 reg_syntax_t
 281 re_set_syntax (syntax)
 282     reg_syntax_t syntax;
 283 {
 284   reg_syntax_t ret = re_syntax_options;
 285
 286   re_syntax_options = syntax;
 287   return ret;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_set_syntax, re_set_syntax)
 291 #endif
 292
 293 int
 294 re_compile_fastmap (bufp)
 295     struct re_pattern_buffer *bufp;
 296 {
 297   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 298   char *fastmap = bufp->fastmap;
 299
 300   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 301   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 302   if (dfa->init_state != dfa->init_state_word)
 303     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 304   if (dfa->init_state != dfa->init_state_nl)
 305     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 306   if (dfa->init_state != dfa->init_state_begbuf)
 307     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 308   bufp->fastmap_accurate = 1;
 309   return 0;
 310 }
 311 #ifdef _LIBC
 312 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 313 #endif
 314
 315 static inline void
 316 __attribute ((always_inline))
 317 re_set_fastmap (char *fastmap, int icase, int ch)
 318 {
 319   fastmap[ch] = 1;
 320   if (icase)
 321     fastmap[tolower (ch)] = 1;
 322 }
 323
 324 /* Helper function for re_compile_fastmap.
 325    Compile fastmap for the initial_state INIT_STATE.  */
 326
 327 static void
 328 re_compile_fastmap_iter (bufp, init_state, fastmap)
 329      regex_t *bufp;
 330      const re_dfastate_t *init_state;
 331      char *fastmap;
 332 {
 333   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 334   int node_cnt;
 335   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 336   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 337     {
 338       int node = init_state->nodes.elems[node_cnt];
 339       re_token_type_t type = dfa->nodes[node].type;
 340
 341       if (type == CHARACTER)
 342         {
 343           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 344 #ifdef RE_ENABLE_I18N
 345           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 346             {
 347               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 348               wchar_t wc;
 349               mbstate_t state;
 350
 351               p = buf;
 352               *p++ = dfa->nodes[node].opr.c;
 353               while (++node < dfa->nodes_len
 354                      && dfa->nodes[node].type == CHARACTER
 355                      && dfa->nodes[node].mb_partial)
 356                 *p++ = dfa->nodes[node].opr.c;
 357               memset (&state, 0, sizeof (state));
 358               if (mbrtowc (&wc, (const char *) buf, p - buf,
 359                            &state) == p - buf
 360                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 361                       != (size_t) -1))
 362                 re_set_fastmap (fastmap, 0, buf[0]);
 363             }
 364 #endif
 365         }
 366       else if (type == SIMPLE_BRACKET)
 367         {
 368           int i, j, ch;
 369           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 370             for (j = 0; j < UINT_BITS; ++j, ++ch)
 371               if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
 372                 re_set_fastmap (fastmap, icase, ch);
 373         }
 374 #ifdef RE_ENABLE_I18N
 375       else if (type == COMPLEX_BRACKET)
 376         {
 377           int i;
 378           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 379           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 380               || cset->nranges || cset->nchar_classes)
 381             {
 382 # ifdef _LIBC
 383               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 384                 {
 385                   /* In this case we want to catch the bytes which are
 386                      the first byte of any collation elements.
 387                      e.g. In da_DK, we want to catch 'a' since "aa"
 388                           is a valid collation element, and don't catch
 389                           'b' since 'b' is the only collation element
 390                           which starts from 'b'.  */
 391                   int j, ch;
 392                   const int32_t *table = (const int32_t *)
 393                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 394                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 395                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 396                       if (table[ch] < 0)
 397                         re_set_fastmap (fastmap, icase, ch);
 398                 }
 399 # else
 400               if (dfa->mb_cur_max > 1)
 401                 for (i = 0; i < SBC_MAX; ++i)
 402                   if (__btowc (i) == WEOF)
 403                     re_set_fastmap (fastmap, icase, i);
 404 # endif /* not _LIBC */
 405             }
 406           for (i = 0; i < cset->nmbchars; ++i)
 407             {
 408               char buf[256];
 409               mbstate_t state;
 410               memset (&state, '\0', sizeof (state));
 411               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 412                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 413               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 414                 {
 415                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 416                       != (size_t) -1)
 417                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      `buffer' to the compiled pattern;
 443      `used' to the length of the compiled pattern;
 444      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      `fastmap' to an allocated space for the fastmap;
 449      `fastmap_accurate' to zero;
 450      `re_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *__restrict preg;
 475     const char *__restrict pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *__restrict preg;
 538     char *__restrict errbuf;
 539     size_t errbuf_size;
 540 {
 541   const char *msg;
 542   size_t msg_size;
 543
 544   if (BE (errcode < 0
 545           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 546                                / sizeof (__re_error_msgid_idx[0])), 0))
 547     /* Only error codes returned by the rest of the code should be passed
 548        to this routine.  If we are given anything else, or if other regex
 549        code generates an invalid error code, then the program has a bug.
 550        Dump core so we can fix it.  */
 551     abort ();
 552
 553   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 554
 555   msg_size = strlen (msg) + 1; /* Includes the null.  */
 556
 557   if (BE (errbuf_size != 0, 1))
 558     {
 559       if (BE (msg_size > errbuf_size, 0))
 560         {
 561 #if defined HAVE_MEMPCPY || defined _LIBC
 562           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 563 #else
 564           memcpy (errbuf, msg, errbuf_size - 1);
 565           errbuf[errbuf_size - 1] = 0;
 566 #endif
 567         }
 568       else
 569         memcpy (errbuf, msg, msg_size);
 570     }
 571
 572   return msg_size;
 573 }
 574 #ifdef _LIBC
 575 weak_alias (__regerror, regerror)
 576 #endif
 577
 578
 579 #ifdef RE_ENABLE_I18N
 580 /* This static array is used for the map to single-byte characters when
 581    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 582    it the same all the time.  UTF-8 is the preferred encoding so this is
 583    a worthwhile optimization.  */
 584 static const bitset utf8_sb_map =
 585 {
 586   /* Set the first 128 bits.  */
 587 # if UINT_MAX == 0xffffffff
 588   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 589 # else
 590 #  error "Add case for new unsigned int size"
 591 # endif
 592 };
 593 #endif
 594
 595
 596 static void
 597 free_dfa_content (re_dfa_t *dfa)
 598 {
 599   int i, j;
 600
 601   if (dfa->nodes)
 602     for (i = 0; i < dfa->nodes_len; ++i)
 603       free_token (dfa->nodes + i);
 604   re_free (dfa->nexts);
 605   for (i = 0; i < dfa->nodes_len; ++i)
 606     {
 607       if (dfa->eclosures != NULL)
 608         re_node_set_free (dfa->eclosures + i);
 609       if (dfa->inveclosures != NULL)
 610         re_node_set_free (dfa->inveclosures + i);
 611       if (dfa->edests != NULL)
 612         re_node_set_free (dfa->edests + i);
 613     }
 614   re_free (dfa->edests);
 615   re_free (dfa->eclosures);
 616   re_free (dfa->inveclosures);
 617   re_free (dfa->nodes);
 618
 619   if (dfa->state_table)
 620     for (i = 0; i <= dfa->state_hash_mask; ++i)
 621       {
 622         struct re_state_table_entry *entry = dfa->state_table + i;
 623         for (j = 0; j < entry->num; ++j)
 624           {
 625             re_dfastate_t *state = entry->array[j];
 626             free_state (state);
 627           }
 628         re_free (entry->array);
 629       }
 630   re_free (dfa->state_table);
 631 #ifdef RE_ENABLE_I18N
 632   if (dfa->sb_char != utf8_sb_map)
 633     re_free (dfa->sb_char);
 634 #endif
 635   re_free (dfa->subexp_map);
 636 #ifdef DEBUG
 637   re_free (dfa->re_str);
 638 #endif
 639
 640   re_free (dfa);
 641 }
 642
 643
 644 /* Free dynamically allocated space used by PREG.  */
 645
 646 void
 647 regfree (preg)
 648     regex_t *preg;
 649 {
 650   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 651   if (BE (dfa != NULL, 1))
 652     free_dfa_content (dfa);
 653   preg->buffer = NULL;
 654   preg->allocated = 0;
 655
 656   re_free (preg->fastmap);
 657   preg->fastmap = NULL;
 658
 659   re_free (preg->translate);
 660   preg->translate = NULL;
 661 }
 662 #ifdef _LIBC
 663 weak_alias (__regfree, regfree)
 664 #endif
 665 \f
 666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 667    them unless specifically requested.  */
 668
 669 #if defined _REGEX_RE_COMP || defined _LIBC
 670
 671 /* BSD has one and only one pattern buffer.  */
 672 static struct re_pattern_buffer re_comp_buf;
 673
 674 char *
 675 # ifdef _LIBC
 676 /* Make these definitions weak in libc, so POSIX programs can redefine
 677    these names if they don't use our functions, and still use
 678    regcomp/regexec above without link errors.  */
 679 weak_function
 680 # endif
 681 re_comp (s)
 682      const char *s;
 683 {
 684   reg_errcode_t ret;
 685   char *fastmap;
 686
 687   if (!s)
 688     {
 689       if (!re_comp_buf.buffer)
 690         return gettext ("No previous regular expression");
 691       return 0;
 692     }
 693
 694   if (re_comp_buf.buffer)
 695     {
 696       fastmap = re_comp_buf.fastmap;
 697       re_comp_buf.fastmap = NULL;
 698       __regfree (&re_comp_buf);
 699       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 700       re_comp_buf.fastmap = fastmap;
 701     }
 702
 703   if (re_comp_buf.fastmap == NULL)
 704     {
 705       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 706       if (re_comp_buf.fastmap == NULL)
 707         return (char *) gettext (__re_error_msgid
 708                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 709     }
 710
 711   /* Since `re_exec' always passes NULL for the `regs' argument, we
 712      don't need to initialize the pattern buffer fields which affect it.  */
 713
 714   /* Match anchors at newlines.  */
 715   re_comp_buf.newline_anchor = 1;
 716
 717   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 718
 719   if (!ret)
 720     return NULL;
 721
 722   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 723   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 724 }
 725
 726 #ifdef _LIBC
 727 libc_freeres_fn (free_mem)
 728 {
 729   __regfree (&re_comp_buf);
 730 }
 731 #endif
 732
 733 #endif /* _REGEX_RE_COMP */
 734 \f
 735 /* Internal entry point.
 736    Compile the regular expression PATTERN, whose length is LENGTH.
 737    SYNTAX indicate regular expression's syntax.  */
 738
 739 static reg_errcode_t
 740 re_compile_internal (preg, pattern, length, syntax)
 741      regex_t *preg;
 742      const char * pattern;
 743      size_t length;
 744      reg_syntax_t syntax;
 745 {
 746   reg_errcode_t err = REG_NOERROR;
 747   re_dfa_t *dfa;
 748   re_string_t regexp;
 749
 750   /* Initialize the pattern buffer.  */
 751   preg->fastmap_accurate = 0;
 752   preg->syntax = syntax;
 753   preg->not_bol = preg->not_eol = 0;
 754   preg->used = 0;
 755   preg->re_nsub = 0;
 756   preg->can_be_null = 0;
 757   preg->regs_allocated = REGS_UNALLOCATED;
 758
 759   /* Initialize the dfa.  */
 760   dfa = (re_dfa_t *) preg->buffer;
 761   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 762     {
 763       /* If zero allocated, but buffer is non-null, try to realloc
 764          enough space.  This loses if buffer's address is bogus, but
 765          that is the user's responsibility.  If ->buffer is NULL this
 766          is a simple allocation.  */
 767       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 768       if (dfa == NULL)
 769         return REG_ESPACE;
 770       preg->allocated = sizeof (re_dfa_t);
 771       preg->buffer = (unsigned char *) dfa;
 772     }
 773   preg->used = sizeof (re_dfa_t);
 774
 775   err = init_dfa (dfa, length);
 776   if (BE (err != REG_NOERROR, 0))
 777     {
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783 #ifdef DEBUG
 784   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 785   dfa->re_str = re_malloc (char, length + 1);
 786   strncpy (dfa->re_str, pattern, length + 1);
 787 #endif
 788
 789   __libc_lock_init (dfa->lock);
 790
 791   err = re_string_construct (&regexp, pattern, length, preg->translate,
 792                              syntax & RE_ICASE, dfa);
 793   if (BE (err != REG_NOERROR, 0))
 794     {
 795     re_compile_internal_free_return:
 796       free_workarea_compile (preg);
 797       re_string_destruct (&regexp);
 798       free_dfa_content (dfa);
 799       preg->buffer = NULL;
 800       preg->allocated = 0;
 801       return err;
 802     }
 803
 804   /* Parse the regular expression, and build a structure tree.  */
 805   preg->re_nsub = 0;
 806   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 807   if (BE (dfa->str_tree == NULL, 0))
 808     goto re_compile_internal_free_return;
 809
 810   /* Analyze the tree and create the nfa.  */
 811   err = analyze (preg);
 812   if (BE (err != REG_NOERROR, 0))
 813     goto re_compile_internal_free_return;
 814
 815 #ifdef RE_ENABLE_I18N
 816   /* If possible, do searching in single byte encoding to speed things up.  */
 817   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 818     optimize_utf8 (dfa);
 819 #endif
 820
 821   /* Then create the initial state of the dfa.  */
 822   err = create_initial_state (dfa);
 823
 824   /* Release work areas.  */
 825   free_workarea_compile (preg);
 826   re_string_destruct (&regexp);
 827
 828   if (BE (err != REG_NOERROR, 0))
 829     {
 830       free_dfa_content (dfa);
 831       preg->buffer = NULL;
 832       preg->allocated = 0;
 833     }
 834
 835   return err;
 836 }
 837
 838 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 839    as the initial length of some arrays.  */
 840
 841 static reg_errcode_t
 842 init_dfa (dfa, pat_len)
 843      re_dfa_t *dfa;
 844      size_t pat_len;
 845 {
 846   unsigned int table_size;
 847 #ifndef _LIBC
 848   char *codeset_name;
 849 #endif
 850
 851   memset (dfa, '\0', sizeof (re_dfa_t));
 852
 853   /* Force allocation of str_tree_storage the first time.  */
 854   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 855
 856   /* Avoid overflows.  */
 857   if (pat_len == SIZE_MAX)
 858     return REG_ESPACE;
 859
 860   dfa->nodes_alloc = pat_len + 1;
 861   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 862
 863   /*  table_size = 2 ^ ceil(log pat_len) */
 864   for (table_size = 1; ; table_size <<= 1)
 865     if (table_size > pat_len)
 866       break;
 867
 868   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 869   dfa->state_hash_mask = table_size - 1;
 870
 871   dfa->mb_cur_max = MB_CUR_MAX;
 872 #ifdef _LIBC
 873   if (dfa->mb_cur_max == 6
 874       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 875     dfa->is_utf8 = 1;
 876   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 877                        != 0);
 878 #else
 879 # ifdef HAVE_LANGINFO_CODESET
 880   codeset_name = nl_langinfo (CODESET);
 881 # else
 882   codeset_name = getenv ("LC_ALL");
 883   if (codeset_name == NULL || codeset_name[0] == '\0')
 884     codeset_name = getenv ("LC_CTYPE");
 885   if (codeset_name == NULL || codeset_name[0] == '\0')
 886     codeset_name = getenv ("LANG");
 887   if (codeset_name == NULL)
 888     codeset_name = "";
 889   else if (strchr (codeset_name, '.') !=  NULL)
 890     codeset_name = strchr (codeset_name, '.') + 1;
 891 # endif
 892
 893   if (strcasecmp (codeset_name, "UTF-8") == 0
 894       || strcasecmp (codeset_name, "UTF8") == 0)
 895     dfa->is_utf8 = 1;
 896
 897   /* We check exhaustively in the loop below if this charset is a
 898      superset of ASCII.  */
 899   dfa->map_notascii = 0;
 900 #endif
 901
 902 #ifdef RE_ENABLE_I18N
 903   if (dfa->mb_cur_max > 1)
 904     {
 905       if (dfa->is_utf8)
 906         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 907       else
 908         {
 909           int i, j, ch;
 910
 911           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 912           if (BE (dfa->sb_char == NULL, 0))
 913             return REG_ESPACE;
 914
 915           /* Clear all bits by, then set those corresponding to single
 916              byte chars.  */
 917           bitset_empty (dfa->sb_char);
 918
 919           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 920             for (j = 0; j < UINT_BITS; ++j, ++ch)
 921               {
 922                 wint_t wch = __btowc (ch);
 923                 if (wch != WEOF)
 924                   dfa->sb_char[i] |= 1u << j;
 925 # ifndef _LIBC
 926                 if (isascii (ch) && wch != ch)
 927                   dfa->map_notascii = 1;
 928 # endif
 929               }
 930         }
 931     }
 932 #endif
 933
 934   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 935     return REG_ESPACE;
 936   return REG_NOERROR;
 937 }
 938
 939 /* Initialize WORD_CHAR table, which indicate which character is
 940    "word".  In this case "word" means that it is the word construction
 941    character used by some operators like "\<", "\>", etc.  */
 942
 943 static void
 944 init_word_char (dfa)
 945      re_dfa_t *dfa;
 946 {
 947   int i, j, ch;
 948   dfa->word_ops_used = 1;
 949   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 950     for (j = 0; j < UINT_BITS; ++j, ++ch)
 951       if (isalnum (ch) || ch == '_')
 952         dfa->word_char[i] |= 1u << j;
 953 }
 954
 955 /* Free the work area which are only used while compiling.  */
 956
 957 static void
 958 free_workarea_compile (preg)
 959      regex_t *preg;
 960 {
 961   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 962   bin_tree_storage_t *storage, *next;
 963   for (storage = dfa->str_tree_storage; storage; storage = next)
 964     {
 965       next = storage->next;
 966       re_free (storage);
 967     }
 968   dfa->str_tree_storage = NULL;
 969   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 970   dfa->str_tree = NULL;
 971   re_free (dfa->org_indices);
 972   dfa->org_indices = NULL;
 973 }
 974
 975 /* Create initial states for all contexts.  */
 976
 977 static reg_errcode_t
 978 create_initial_state (dfa)
 979      re_dfa_t *dfa;
 980 {
 981   int first, i;
 982   reg_errcode_t err;
 983   re_node_set init_nodes;
 984
 985   /* Initial states have the epsilon closure of the node which is
 986      the first node of the regular expression.  */
 987   first = dfa->str_tree->first->node_idx;
 988   dfa->init_node = first;
 989   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 990   if (BE (err != REG_NOERROR, 0))
 991     return err;
 992
 993   /* The back-references which are in initial states can epsilon transit,
 994      since in this case all of the subexpressions can be null.
 995      Then we add epsilon closures of the nodes which are the next nodes of
 996      the back-references.  */
 997   if (dfa->nbackref > 0)
 998     for (i = 0; i < init_nodes.nelem; ++i)
 999       {
1000         int node_idx = init_nodes.elems[i];
1001         re_token_type_t type = dfa->nodes[node_idx].type;
1002
1003         int clexp_idx;
1004         if (type != OP_BACK_REF)
1005           continue;
1006         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1007           {
1008             re_token_t *clexp_node;
1009             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1010             if (clexp_node->type == OP_CLOSE_SUBEXP
1011                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1012               break;
1013           }
1014         if (clexp_idx == init_nodes.nelem)
1015           continue;
1016
1017         if (type == OP_BACK_REF)
1018           {
1019             int dest_idx = dfa->edests[node_idx].elems[0];
1020             if (!re_node_set_contains (&init_nodes, dest_idx))
1021               {
1022                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1023                 i = 0;
1024               }
1025           }
1026       }
1027
1028   /* It must be the first time to invoke acquire_state.  */
1029   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1030   /* We don't check ERR here, since the initial state must not be NULL.  */
1031   if (BE (dfa->init_state == NULL, 0))
1032     return err;
1033   if (dfa->init_state->has_constraint)
1034     {
1035       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1036                                                        CONTEXT_WORD);
1037       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1038                                                      CONTEXT_NEWLINE);
1039       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1040                                                          &init_nodes,
1041                                                          CONTEXT_NEWLINE
1042                                                          | CONTEXT_BEGBUF);
1043       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1044               || dfa->init_state_begbuf == NULL, 0))
1045         return err;
1046     }
1047   else
1048     dfa->init_state_word = dfa->init_state_nl
1049       = dfa->init_state_begbuf = dfa->init_state;
1050
1051   re_node_set_free (&init_nodes);
1052   return REG_NOERROR;
1053 }
1054 \f
1055 #ifdef RE_ENABLE_I18N
1056 /* If it is possible to do searching in single byte encoding instead of UTF-8
1057    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058    DFA nodes where needed.  */
1059
1060 static void
1061 optimize_utf8 (dfa)
1062      re_dfa_t *dfa;
1063 {
1064   int node, i, mb_chars = 0, has_period = 0;
1065
1066   for (node = 0; node < dfa->nodes_len; ++node)
1067     switch (dfa->nodes[node].type)
1068       {
1069       case CHARACTER:
1070         if (dfa->nodes[node].opr.c >= 0x80)
1071           mb_chars = 1;
1072         break;
1073       case ANCHOR:
1074         switch (dfa->nodes[node].opr.idx)
1075           {
1076           case LINE_FIRST:
1077           case LINE_LAST:
1078           case BUF_FIRST:
1079           case BUF_LAST:
1080             break;
1081           default:
1082             /* Word anchors etc. cannot be handled.  */
1083             return;
1084           }
1085         break;
1086       case OP_PERIOD:
1087         has_period = 1;
1088         break;
1089       case OP_BACK_REF:
1090       case OP_ALT:
1091       case END_OF_RE:
1092       case OP_DUP_ASTERISK:
1093       case OP_OPEN_SUBEXP:
1094       case OP_CLOSE_SUBEXP:
1095         break;
1096       case COMPLEX_BRACKET:
1097         return;
1098       case SIMPLE_BRACKET:
1099         /* Just double check.  */
1100         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1101           if (dfa->nodes[node].opr.sbcset[i])
1102             return;
1103         break;
1104       default:
1105         abort ();
1106       }
1107
1108   if (mb_chars || has_period)
1109     for (node = 0; node < dfa->nodes_len; ++node)
1110       {
1111         if (dfa->nodes[node].type == CHARACTER
1112             && dfa->nodes[node].opr.c >= 0x80)
1113           dfa->nodes[node].mb_partial = 0;
1114         else if (dfa->nodes[node].type == OP_PERIOD)
1115           dfa->nodes[node].type = OP_UTF8_PERIOD;
1116       }
1117
1118   /* The search can be in single byte locale.  */
1119   dfa->mb_cur_max = 1;
1120   dfa->is_utf8 = 0;
1121   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1122 }
1123 #endif
1124 \f
1125 /* Analyze the structure tree, and calculate "first", "next", "edest",
1126    "eclosure", and "inveclosure".  */
1127
1128 static reg_errcode_t
1129 analyze (preg)
1130      regex_t *preg;
1131 {
1132   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1133   reg_errcode_t ret;
1134
1135   /* Allocate arrays.  */
1136   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1137   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1138   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1139   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1140   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1141           || dfa->eclosures == NULL, 0))
1142     return REG_ESPACE;
1143
1144   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1145   if (dfa->subexp_map != NULL)
1146     {
1147       int i;
1148       for (i = 0; i < preg->re_nsub; i++)
1149         dfa->subexp_map[i] = i;
1150       preorder (dfa->str_tree, optimize_subexps, dfa);
1151       for (i = 0; i < preg->re_nsub; i++)
1152         if (dfa->subexp_map[i] != i)
1153           break;
1154       if (i == preg->re_nsub)
1155         {
1156           free (dfa->subexp_map);
1157           dfa->subexp_map = NULL;
1158         }
1159     }
1160
1161   ret = postorder (dfa->str_tree, lower_subexps, preg);
1162   if (BE (ret != REG_NOERROR, 0))
1163     return ret;
1164   ret = postorder (dfa->str_tree, calc_first, dfa);
1165   if (BE (ret != REG_NOERROR, 0))
1166     return ret;
1167   preorder (dfa->str_tree, calc_next, dfa);
1168   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1169   if (BE (ret != REG_NOERROR, 0))
1170     return ret;
1171   ret = calc_eclosure (dfa);
1172   if (BE (ret != REG_NOERROR, 0))
1173     return ret;
1174
1175   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1176      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1177   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1178       || dfa->nbackref)
1179     {
1180       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1181       if (BE (dfa->inveclosures == NULL, 0))
1182         return REG_ESPACE;
1183       ret = calc_inveclosure (dfa);
1184     }
1185
1186   return ret;
1187 }
1188
1189 /* Our parse trees are very unbalanced, so we cannot use a stack to
1190    implement parse tree visits.  Instead, we use parent pointers and
1191    some hairy code in these two functions.  */
1192 static reg_errcode_t
1193 postorder (root, fn, extra)
1194      bin_tree_t *root;
1195      reg_errcode_t (fn (void *, bin_tree_t *));
1196      void *extra;
1197 {
1198   bin_tree_t *node, *prev;
1199
1200   for (node = root; ; )
1201     {
1202       /* Descend down the tree, preferably to the left (or to the right
1203          if that's the only child).  */
1204       while (node->left || node->right)
1205         if (node->left)
1206           node = node->left;
1207         else
1208           node = node->right;
1209
1210       do
1211         {
1212           reg_errcode_t err = fn (extra, node);
1213           if (BE (err != REG_NOERROR, 0))
1214             return err;
1215           if (node->parent == NULL)
1216             return REG_NOERROR;
1217           prev = node;
1218           node = node->parent;
1219         }
1220       /* Go up while we have a node that is reached from the right.  */
1221       while (node->right == prev || node->right == NULL);
1222       node = node->right;
1223     }
1224 }
1225
1226 static reg_errcode_t
1227 preorder (root, fn, extra)
1228      bin_tree_t *root;
1229      reg_errcode_t (fn (void *, bin_tree_t *));
1230      void *extra;
1231 {
1232   bin_tree_t *node;
1233
1234   for (node = root; ; )
1235     {
1236       reg_errcode_t err = fn (extra, node);
1237       if (BE (err != REG_NOERROR, 0))
1238         return err;
1239
1240       /* Go to the left node, or up and to the right.  */
1241       if (node->left)
1242         node = node->left;
1243       else
1244         {
1245           bin_tree_t *prev = NULL;
1246           while (node->right == prev || node->right == NULL)
1247             {
1248               prev = node;
1249               node = node->parent;
1250               if (!node)
1251                 return REG_NOERROR;
1252             }
1253           node = node->right;
1254         }
1255     }
1256 }
1257
1258 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1259    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1260    backreferences as well.  Requires a preorder visit.  */
1261 static reg_errcode_t
1262 optimize_subexps (extra, node)
1263      void *extra;
1264      bin_tree_t *node;
1265 {
1266   re_dfa_t *dfa = (re_dfa_t *) extra;
1267
1268   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1269     {
1270       int idx = node->token.opr.idx;
1271       node->token.opr.idx = dfa->subexp_map[idx];
1272       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1273     }
1274
1275   else if (node->token.type == SUBEXP
1276            && node->left && node->left->token.type == SUBEXP)
1277     {
1278       int other_idx = node->left->token.opr.idx;
1279
1280       node->left = node->left->left;
1281       if (node->left)
1282         node->left->parent = node;
1283
1284       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1285       if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1286         dfa->used_bkref_map &= ~(1u << other_idx);
1287     }
1288
1289   return REG_NOERROR;
1290 }
1291
1292 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1293    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1294 static reg_errcode_t
1295 lower_subexps (extra, node)
1296      void *extra;
1297      bin_tree_t *node;
1298 {
1299   regex_t *preg = (regex_t *) extra;
1300   reg_errcode_t err = REG_NOERROR;
1301
1302   if (node->left && node->left->token.type == SUBEXP)
1303     {
1304       node->left = lower_subexp (&err, preg, node->left);
1305       if (node->left)
1306         node->left->parent = node;
1307     }
1308   if (node->right && node->right->token.type == SUBEXP)
1309     {
1310       node->right = lower_subexp (&err, preg, node->right);
1311       if (node->right)
1312         node->right->parent = node;
1313     }
1314
1315   return err;
1316 }
1317
1318 static bin_tree_t *
1319 lower_subexp (err, preg, node)
1320      reg_errcode_t *err;
1321      regex_t *preg;
1322      bin_tree_t *node;
1323 {
1324   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1325   bin_tree_t *body = node->left;
1326   bin_tree_t *op, *cls, *tree1, *tree;
1327
1328   if (preg->no_sub
1329       /* We do not optimize empty subexpressions, because otherwise we may
1330          have bad CONCAT nodes with NULL children.  This is obviously not
1331          very common, so we do not lose much.  An example that triggers
1332          this case is the sed "script" /\(\)/x.  */
1333       && node->left != NULL
1334       && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1335           || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1336     return node->left;
1337
1338   /* Convert the SUBEXP node to the concatenation of an
1339      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1340   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1341   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1342   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1343   tree = create_tree (dfa, op, tree1, CONCAT);
1344   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1345     {
1346       *err = REG_ESPACE;
1347       return NULL;
1348     }
1349
1350   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1351   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1352   return tree;
1353 }
1354
1355 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1356    nodes.  Requires a postorder visit.  */
1357 static reg_errcode_t
1358 calc_first (extra, node)
1359      void *extra;
1360      bin_tree_t *node;
1361 {
1362   re_dfa_t *dfa = (re_dfa_t *) extra;
1363   if (node->token.type == CONCAT)
1364     {
1365       node->first = node->left->first;
1366       node->node_idx = node->left->node_idx;
1367     }
1368   else
1369     {
1370       node->first = node;
1371       node->node_idx = re_dfa_add_node (dfa, node->token);
1372       if (BE (node->node_idx == -1, 0))
1373         return REG_ESPACE;
1374     }
1375   return REG_NOERROR;
1376 }
1377
1378 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1379 static reg_errcode_t
1380 calc_next (extra, node)
1381      void *extra;
1382      bin_tree_t *node;
1383 {
1384   switch (node->token.type)
1385     {
1386     case OP_DUP_ASTERISK:
1387       node->left->next = node;
1388       break;
1389     case CONCAT:
1390       node->left->next = node->right->first;
1391       node->right->next = node->next;
1392       break;
1393     default:
1394       if (node->left)
1395         node->left->next = node->next;
1396       if (node->right)
1397         node->right->next = node->next;
1398       break;
1399     }
1400   return REG_NOERROR;
1401 }
1402
1403 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1404 static reg_errcode_t
1405 link_nfa_nodes (extra, node)
1406      void *extra;
1407      bin_tree_t *node;
1408 {
1409   re_dfa_t *dfa = (re_dfa_t *) extra;
1410   int idx = node->node_idx;
1411   reg_errcode_t err = REG_NOERROR;
1412
1413   switch (node->token.type)
1414     {
1415     case CONCAT:
1416       break;
1417
1418     case END_OF_RE:
1419       assert (node->next == NULL);
1420       break;
1421
1422     case OP_DUP_ASTERISK:
1423     case OP_ALT:
1424       {
1425         int left, right;
1426         dfa->has_plural_match = 1;
1427         if (node->left != NULL)
1428           left = node->left->first->node_idx;
1429         else
1430           left = node->next->node_idx;
1431         if (node->right != NULL)
1432           right = node->right->first->node_idx;
1433         else
1434           right = node->next->node_idx;
1435         assert (left > -1);
1436         assert (right > -1);
1437         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1438       }
1439       break;
1440
1441     case ANCHOR:
1442     case OP_OPEN_SUBEXP:
1443     case OP_CLOSE_SUBEXP:
1444       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1445       break;
1446
1447     case OP_BACK_REF:
1448       dfa->nexts[idx] = node->next->node_idx;
1449       if (node->token.type == OP_BACK_REF)
1450         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1451       break;
1452
1453     default:
1454       assert (!IS_EPSILON_NODE (node->token.type));
1455       dfa->nexts[idx] = node->next->node_idx;
1456       break;
1457     }
1458
1459   return err;
1460 }
1461
1462 /* Duplicate the epsilon closure of the node ROOT_NODE.
1463    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1464    to their own constraint.  */
1465
1466 static reg_errcode_t
1467 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1468                         init_constraint)
1469      re_dfa_t *dfa;
1470      int top_org_node, top_clone_node, root_node;
1471      unsigned int init_constraint;
1472 {
1473   int org_node, clone_node, ret;
1474   unsigned int constraint = init_constraint;
1475   for (org_node = top_org_node, clone_node = top_clone_node;;)
1476     {
1477       int org_dest, clone_dest;
1478       if (dfa->nodes[org_node].type == OP_BACK_REF)
1479         {
1480           /* If the back reference epsilon-transit, its destination must
1481              also have the constraint.  Then duplicate the epsilon closure
1482              of the destination of the back reference, and store it in
1483              edests of the back reference.  */
1484           org_dest = dfa->nexts[org_node];
1485           re_node_set_empty (dfa->edests + clone_node);
1486           clone_dest = duplicate_node (dfa, org_dest, constraint);
1487           if (BE (clone_dest == -1, 0))
1488             return REG_ESPACE;
1489           dfa->nexts[clone_node] = dfa->nexts[org_node];
1490           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491           if (BE (ret < 0, 0))
1492             return REG_ESPACE;
1493         }
1494       else if (dfa->edests[org_node].nelem == 0)
1495         {
1496           /* In case of the node can't epsilon-transit, don't duplicate the
1497              destination and store the original destination as the
1498              destination of the node.  */
1499           dfa->nexts[clone_node] = dfa->nexts[org_node];
1500           break;
1501         }
1502       else if (dfa->edests[org_node].nelem == 1)
1503         {
1504           /* In case of the node can epsilon-transit, and it has only one
1505              destination.  */
1506           org_dest = dfa->edests[org_node].elems[0];
1507           re_node_set_empty (dfa->edests + clone_node);
1508           if (dfa->nodes[org_node].type == ANCHOR)
1509             {
1510               /* In case of the node has another constraint, append it.  */
1511               if (org_node == root_node && clone_node != org_node)
1512                 {
1513                   /* ...but if the node is root_node itself, it means the
1514                      epsilon closure have a loop, then tie it to the
1515                      destination of the root_node.  */
1516                   ret = re_node_set_insert (dfa->edests + clone_node,
1517                                             org_dest);
1518                   if (BE (ret < 0, 0))
1519                     return REG_ESPACE;
1520                   break;
1521                 }
1522               constraint |= dfa->nodes[org_node].opr.ctx_type;
1523             }
1524           clone_dest = duplicate_node (dfa, org_dest, constraint);
1525           if (BE (clone_dest == -1, 0))
1526             return REG_ESPACE;
1527           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528           if (BE (ret < 0, 0))
1529             return REG_ESPACE;
1530         }
1531       else /* dfa->edests[org_node].nelem == 2 */
1532         {
1533           /* In case of the node can epsilon-transit, and it has two
1534              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1535           org_dest = dfa->edests[org_node].elems[0];
1536           re_node_set_empty (dfa->edests + clone_node);
1537           /* Search for a duplicated node which satisfies the constraint.  */
1538           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539           if (clone_dest == -1)
1540             {
1541               /* There are no such a duplicated node, create a new one.  */
1542               reg_errcode_t err;
1543               clone_dest = duplicate_node (dfa, org_dest, constraint);
1544               if (BE (clone_dest == -1, 0))
1545                 return REG_ESPACE;
1546               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547               if (BE (ret < 0, 0))
1548                 return REG_ESPACE;
1549               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1550                                             root_node, constraint);
1551               if (BE (err != REG_NOERROR, 0))
1552                 return err;
1553             }
1554           else
1555             {
1556               /* There are a duplicated node which satisfy the constraint,
1557                  use it to avoid infinite loop.  */
1558               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559               if (BE (ret < 0, 0))
1560                 return REG_ESPACE;
1561             }
1562
1563           org_dest = dfa->edests[org_node].elems[1];
1564           clone_dest = duplicate_node (dfa, org_dest, constraint);
1565           if (BE (clone_dest == -1, 0))
1566             return REG_ESPACE;
1567           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568           if (BE (ret < 0, 0))
1569             return REG_ESPACE;
1570         }
1571       org_node = org_dest;
1572       clone_node = clone_dest;
1573     }
1574   return REG_NOERROR;
1575 }
1576
1577 /* Search for a node which is duplicated from the node ORG_NODE, and
1578    satisfies the constraint CONSTRAINT.  */
1579
1580 static int
1581 search_duplicated_node (dfa, org_node, constraint)
1582      const re_dfa_t *dfa;
1583      int org_node;
1584      unsigned int constraint;
1585 {
1586   int idx;
1587   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1588     {
1589       if (org_node == dfa->org_indices[idx]
1590           && constraint == dfa->nodes[idx].constraint)
1591         return idx; /* Found.  */
1592     }
1593   return -1; /* Not found.  */
1594 }
1595
1596 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1597    Return the index of the new node, or -1 if insufficient storage is
1598    available.  */
1599
1600 static int
1601 duplicate_node (dfa, org_idx, constraint)
1602      re_dfa_t *dfa;
1603      int org_idx;
1604      unsigned int constraint;
1605 {
1606   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1607   if (BE (dup_idx != -1, 1))
1608     {
1609       dfa->nodes[dup_idx].constraint = constraint;
1610       if (dfa->nodes[org_idx].type == ANCHOR)
1611         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1612       dfa->nodes[dup_idx].duplicated = 1;
1613
1614       /* Store the index of the original node.  */
1615       dfa->org_indices[dup_idx] = org_idx;
1616     }
1617   return dup_idx;
1618 }
1619
1620 static reg_errcode_t
1621 calc_inveclosure (dfa)
1622      re_dfa_t *dfa;
1623 {
1624   int src, idx, ret;
1625   for (idx = 0; idx < dfa->nodes_len; ++idx)
1626     re_node_set_init_empty (dfa->inveclosures + idx);
1627
1628   for (src = 0; src < dfa->nodes_len; ++src)
1629     {
1630       int *elems = dfa->eclosures[src].elems;
1631       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1632         {
1633           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1634           if (BE (ret == -1, 0))
1635             return REG_ESPACE;
1636         }
1637     }
1638
1639   return REG_NOERROR;
1640 }
1641
1642 /* Calculate "eclosure" for all the node in DFA.  */
1643
1644 static reg_errcode_t
1645 calc_eclosure (dfa)
1646      re_dfa_t *dfa;
1647 {
1648   int node_idx, incomplete;
1649 #ifdef DEBUG
1650   assert (dfa->nodes_len > 0);
1651 #endif
1652   incomplete = 0;
1653   /* For each nodes, calculate epsilon closure.  */
1654   for (node_idx = 0; ; ++node_idx)
1655     {
1656       reg_errcode_t err;
1657       re_node_set eclosure_elem;
1658       if (node_idx == dfa->nodes_len)
1659         {
1660           if (!incomplete)
1661             break;
1662           incomplete = 0;
1663           node_idx = 0;
1664         }
1665
1666 #ifdef DEBUG
1667       assert (dfa->eclosures[node_idx].nelem != -1);
1668 #endif
1669
1670       /* If we have already calculated, skip it.  */
1671       if (dfa->eclosures[node_idx].nelem != 0)
1672         continue;
1673       /* Calculate epsilon closure of `node_idx'.  */
1674       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1675       if (BE (err != REG_NOERROR, 0))
1676         return err;
1677
1678       if (dfa->eclosures[node_idx].nelem == 0)
1679         {
1680           incomplete = 1;
1681           re_node_set_free (&eclosure_elem);
1682         }
1683     }
1684   return REG_NOERROR;
1685 }
1686
1687 /* Calculate epsilon closure of NODE.  */
1688
1689 static reg_errcode_t
1690 calc_eclosure_iter (new_set, dfa, node, root)
1691      re_node_set *new_set;
1692      re_dfa_t *dfa;
1693      int node, root;
1694 {
1695   reg_errcode_t err;
1696   unsigned int constraint;
1697   int i, incomplete;
1698   re_node_set eclosure;
1699   incomplete = 0;
1700   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1701   if (BE (err != REG_NOERROR, 0))
1702     return err;
1703
1704   /* This indicates that we are calculating this node now.
1705      We reference this value to avoid infinite loop.  */
1706   dfa->eclosures[node].nelem = -1;
1707
1708   constraint = ((dfa->nodes[node].type == ANCHOR)
1709                 ? dfa->nodes[node].opr.ctx_type : 0);
1710   /* If the current node has constraints, duplicate all nodes.
1711      Since they must inherit the constraints.  */
1712   if (constraint
1713       && dfa->edests[node].nelem
1714       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1715     {
1716       int org_node, cur_node;
1717       org_node = cur_node = node;
1718       err = duplicate_node_closure (dfa, node, node, node, constraint);
1719       if (BE (err != REG_NOERROR, 0))
1720         return err;
1721     }
1722
1723   /* Expand each epsilon destination nodes.  */
1724   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1725     for (i = 0; i < dfa->edests[node].nelem; ++i)
1726       {
1727         re_node_set eclosure_elem;
1728         int edest = dfa->edests[node].elems[i];
1729         /* If calculating the epsilon closure of `edest' is in progress,
1730            return intermediate result.  */
1731         if (dfa->eclosures[edest].nelem == -1)
1732           {
1733             incomplete = 1;
1734             continue;
1735           }
1736         /* If we haven't calculated the epsilon closure of `edest' yet,
1737            calculate now. Otherwise use calculated epsilon closure.  */
1738         if (dfa->eclosures[edest].nelem == 0)
1739           {
1740             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1741             if (BE (err != REG_NOERROR, 0))
1742               return err;
1743           }
1744         else
1745           eclosure_elem = dfa->eclosures[edest];
1746         /* Merge the epsilon closure of `edest'.  */
1747         re_node_set_merge (&eclosure, &eclosure_elem);
1748         /* If the epsilon closure of `edest' is incomplete,
1749            the epsilon closure of this node is also incomplete.  */
1750         if (dfa->eclosures[edest].nelem == 0)
1751           {
1752             incomplete = 1;
1753             re_node_set_free (&eclosure_elem);
1754           }
1755       }
1756
1757   /* Epsilon closures include itself.  */
1758   re_node_set_insert (&eclosure, node);
1759   if (incomplete && !root)
1760     dfa->eclosures[node].nelem = 0;
1761   else
1762     dfa->eclosures[node] = eclosure;
1763   *new_set = eclosure;
1764   return REG_NOERROR;
1765 }
1766 \f
1767 /* Functions for token which are used in the parser.  */
1768
1769 /* Fetch a token from INPUT.
1770    We must not use this function inside bracket expressions.  */
1771
1772 static void
1773 fetch_token (result, input, syntax)
1774      re_token_t *result;
1775      re_string_t *input;
1776      reg_syntax_t syntax;
1777 {
1778   re_string_skip_bytes (input, peek_token (result, input, syntax));
1779 }
1780
1781 /* Peek a token from INPUT, and return the length of the token.
1782    We must not use this function inside bracket expressions.  */
1783
1784 static int
1785 peek_token (token, input, syntax)
1786      re_token_t *token;
1787      re_string_t *input;
1788      reg_syntax_t syntax;
1789 {
1790   unsigned char c;
1791
1792   if (re_string_eoi (input))
1793     {
1794       token->type = END_OF_RE;
1795       return 0;
1796     }
1797
1798   c = re_string_peek_byte (input, 0);
1799   token->opr.c = c;
1800
1801   token->word_char = 0;
1802 #ifdef RE_ENABLE_I18N
1803   token->mb_partial = 0;
1804   if (input->mb_cur_max > 1 &&
1805       !re_string_first_byte (input, re_string_cur_idx (input)))
1806     {
1807       token->type = CHARACTER;
1808       token->mb_partial = 1;
1809       return 1;
1810     }
1811 #endif
1812   if (c == '\\')
1813     {
1814       unsigned char c2;
1815       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1816         {
1817           token->type = BACK_SLASH;
1818           return 1;
1819         }
1820
1821       c2 = re_string_peek_byte_case (input, 1);
1822       token->opr.c = c2;
1823       token->type = CHARACTER;
1824 #ifdef RE_ENABLE_I18N
1825       if (input->mb_cur_max > 1)
1826         {
1827           wint_t wc = re_string_wchar_at (input,
1828                                           re_string_cur_idx (input) + 1);
1829           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1830         }
1831       else
1832 #endif
1833         token->word_char = IS_WORD_CHAR (c2) != 0;
1834
1835       switch (c2)
1836         {
1837         case '|':
1838           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1839             token->type = OP_ALT;
1840           break;
1841         case '1': case '2': case '3': case '4': case '5':
1842         case '6': case '7': case '8': case '9':
1843           if (!(syntax & RE_NO_BK_REFS))
1844             {
1845               token->type = OP_BACK_REF;
1846               token->opr.idx = c2 - '1';
1847             }
1848           break;
1849         case '<':
1850           if (!(syntax & RE_NO_GNU_OPS))
1851             {
1852               token->type = ANCHOR;
1853               token->opr.ctx_type = WORD_FIRST;
1854             }
1855           break;
1856         case '>':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             {
1859               token->type = ANCHOR;
1860               token->opr.ctx_type = WORD_LAST;
1861             }
1862           break;
1863         case 'b':
1864           if (!(syntax & RE_NO_GNU_OPS))
1865             {
1866               token->type = ANCHOR;
1867               token->opr.ctx_type = WORD_DELIM;
1868             }
1869           break;
1870         case 'B':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             {
1873               token->type = ANCHOR;
1874               token->opr.ctx_type = NOT_WORD_DELIM;
1875             }
1876           break;
1877         case 'w':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             token->type = OP_WORD;
1880           break;
1881         case 'W':
1882           if (!(syntax & RE_NO_GNU_OPS))
1883             token->type = OP_NOTWORD;
1884           break;
1885         case 's':
1886           if (!(syntax & RE_NO_GNU_OPS))
1887             token->type = OP_SPACE;
1888           break;
1889         case 'S':
1890           if (!(syntax & RE_NO_GNU_OPS))
1891             token->type = OP_NOTSPACE;
1892           break;
1893         case '`':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             {
1896               token->type = ANCHOR;
1897               token->opr.ctx_type = BUF_FIRST;
1898             }
1899           break;
1900         case '\'':
1901           if (!(syntax & RE_NO_GNU_OPS))
1902             {
1903               token->type = ANCHOR;
1904               token->opr.ctx_type = BUF_LAST;
1905             }
1906           break;
1907         case '(':
1908           if (!(syntax & RE_NO_BK_PARENS))
1909             token->type = OP_OPEN_SUBEXP;
1910           break;
1911         case ')':
1912           if (!(syntax & RE_NO_BK_PARENS))
1913             token->type = OP_CLOSE_SUBEXP;
1914           break;
1915         case '+':
1916           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1917             token->type = OP_DUP_PLUS;
1918           break;
1919         case '?':
1920           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1921             token->type = OP_DUP_QUESTION;
1922           break;
1923         case '{':
1924           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1925             token->type = OP_OPEN_DUP_NUM;
1926           break;
1927         case '}':
1928           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1929             token->type = OP_CLOSE_DUP_NUM;
1930           break;
1931         default:
1932           break;
1933         }
1934       return 2;
1935     }
1936
1937   token->type = CHARACTER;
1938 #ifdef RE_ENABLE_I18N
1939   if (input->mb_cur_max > 1)
1940     {
1941       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1942       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1943     }
1944   else
1945 #endif
1946     token->word_char = IS_WORD_CHAR (token->opr.c);
1947
1948   switch (c)
1949     {
1950     case '\n':
1951       if (syntax & RE_NEWLINE_ALT)
1952         token->type = OP_ALT;
1953       break;
1954     case '|':
1955       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1956         token->type = OP_ALT;
1957       break;
1958     case '*':
1959       token->type = OP_DUP_ASTERISK;
1960       break;
1961     case '+':
1962       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1963         token->type = OP_DUP_PLUS;
1964       break;
1965     case '?':
1966       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1967         token->type = OP_DUP_QUESTION;
1968       break;
1969     case '{':
1970       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1971         token->type = OP_OPEN_DUP_NUM;
1972       break;
1973     case '}':
1974       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1975         token->type = OP_CLOSE_DUP_NUM;
1976       break;
1977     case '(':
1978       if (syntax & RE_NO_BK_PARENS)
1979         token->type = OP_OPEN_SUBEXP;
1980       break;
1981     case ')':
1982       if (syntax & RE_NO_BK_PARENS)
1983         token->type = OP_CLOSE_SUBEXP;
1984       break;
1985     case '[':
1986       token->type = OP_OPEN_BRACKET;
1987       break;
1988     case '.':
1989       token->type = OP_PERIOD;
1990       break;
1991     case '^':
1992       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1993           re_string_cur_idx (input) != 0)
1994         {
1995           char prev = re_string_peek_byte (input, -1);
1996           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1997             break;
1998         }
1999       token->type = ANCHOR;
2000       token->opr.ctx_type = LINE_FIRST;
2001       break;
2002     case '$':
2003       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2004           re_string_cur_idx (input) + 1 != re_string_length (input))
2005         {
2006           re_token_t next;
2007           re_string_skip_bytes (input, 1);
2008           peek_token (&next, input, syntax);
2009           re_string_skip_bytes (input, -1);
2010           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2011             break;
2012         }
2013       token->type = ANCHOR;
2014       token->opr.ctx_type = LINE_LAST;
2015       break;
2016     default:
2017       break;
2018     }
2019   return 1;
2020 }
2021
2022 /* Peek a token from INPUT, and return the length of the token.
2023    We must not use this function out of bracket expressions.  */
2024
2025 static int
2026 peek_token_bracket (token, input, syntax)
2027      re_token_t *token;
2028      re_string_t *input;
2029      reg_syntax_t syntax;
2030 {
2031   unsigned char c;
2032   if (re_string_eoi (input))
2033     {
2034       token->type = END_OF_RE;
2035       return 0;
2036     }
2037   c = re_string_peek_byte (input, 0);
2038   token->opr.c = c;
2039
2040 #ifdef RE_ENABLE_I18N
2041   if (input->mb_cur_max > 1 &&
2042       !re_string_first_byte (input, re_string_cur_idx (input)))
2043     {
2044       token->type = CHARACTER;
2045       return 1;
2046     }
2047 #endif /* RE_ENABLE_I18N */
2048
2049   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2050       && re_string_cur_idx (input) + 1 < re_string_length (input))
2051     {
2052       /* In this case, '\' escape a character.  */
2053       unsigned char c2;
2054       re_string_skip_bytes (input, 1);
2055       c2 = re_string_peek_byte (input, 0);
2056       token->opr.c = c2;
2057       token->type = CHARACTER;
2058       return 1;
2059     }
2060   if (c == '[') /* '[' is a special char in a bracket exps.  */
2061     {
2062       unsigned char c2;
2063       int token_len;
2064       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2065         c2 = re_string_peek_byte (input, 1);
2066       else
2067         c2 = 0;
2068       token->opr.c = c2;
2069       token_len = 2;
2070       switch (c2)
2071         {
2072         case '.':
2073           token->type = OP_OPEN_COLL_ELEM;
2074           break;
2075         case '=':
2076           token->type = OP_OPEN_EQUIV_CLASS;
2077           break;
2078         case ':':
2079           if (syntax & RE_CHAR_CLASSES)
2080             {
2081               token->type = OP_OPEN_CHAR_CLASS;
2082               break;
2083             }
2084           /* else fall through.  */
2085         default:
2086           token->type = CHARACTER;
2087           token->opr.c = c;
2088           token_len = 1;
2089           break;
2090         }
2091       return token_len;
2092     }
2093   switch (c)
2094     {
2095     case '-':
2096       token->type = OP_CHARSET_RANGE;
2097       break;
2098     case ']':
2099       token->type = OP_CLOSE_BRACKET;
2100       break;
2101     case '^':
2102       token->type = OP_NON_MATCH_LIST;
2103       break;
2104     default:
2105       token->type = CHARACTER;
2106     }
2107   return 1;
2108 }
2109 \f
2110 /* Functions for parser.  */
2111
2112 /* Entry point of the parser.
2113    Parse the regular expression REGEXP and return the structure tree.
2114    If an error is occured, ERR is set by error code, and return NULL.
2115    This function build the following tree, from regular expression <reg_exp>:
2116            CAT
2117            / \
2118           /   \
2119    <reg_exp>  EOR
2120
2121    CAT means concatenation.
2122    EOR means end of regular expression.  */
2123
2124 static bin_tree_t *
2125 parse (regexp, preg, syntax, err)
2126      re_string_t *regexp;
2127      regex_t *preg;
2128      reg_syntax_t syntax;
2129      reg_errcode_t *err;
2130 {
2131   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2132   bin_tree_t *tree, *eor, *root;
2133   re_token_t current_token;
2134   dfa->syntax = syntax;
2135   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2136   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2137   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2138     return NULL;
2139   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2140   if (tree != NULL)
2141     root = create_tree (dfa, tree, eor, CONCAT);
2142   else
2143     root = eor;
2144   if (BE (eor == NULL || root == NULL, 0))
2145     {
2146       *err = REG_ESPACE;
2147       return NULL;
2148     }
2149   return root;
2150 }
2151
2152 /* This function build the following tree, from regular expression
2153    <branch1>|<branch2>:
2154            ALT
2155            / \
2156           /   \
2157    <branch1> <branch2>
2158
2159    ALT means alternative, which represents the operator `|'.  */
2160
2161 static bin_tree_t *
2162 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2163      re_string_t *regexp;
2164      regex_t *preg;
2165      re_token_t *token;
2166      reg_syntax_t syntax;
2167      int nest;
2168      reg_errcode_t *err;
2169 {
2170   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2171   bin_tree_t *tree, *branch = NULL;
2172   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2173   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2174     return NULL;
2175
2176   while (token->type == OP_ALT)
2177     {
2178       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2179       if (token->type != OP_ALT && token->type != END_OF_RE
2180           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2181         {
2182           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2183           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2184             return NULL;
2185         }
2186       else
2187         branch = NULL;
2188       tree = create_tree (dfa, tree, branch, OP_ALT);
2189       if (BE (tree == NULL, 0))
2190         {
2191           *err = REG_ESPACE;
2192           return NULL;
2193         }
2194     }
2195   return tree;
2196 }
2197
2198 /* This function build the following tree, from regular expression
2199    <exp1><exp2>:
2200         CAT
2201         / \
2202        /   \
2203    <exp1> <exp2>
2204
2205    CAT means concatenation.  */
2206
2207 static bin_tree_t *
2208 parse_branch (regexp, preg, token, syntax, nest, err)
2209      re_string_t *regexp;
2210      regex_t *preg;
2211      re_token_t *token;
2212      reg_syntax_t syntax;
2213      int nest;
2214      reg_errcode_t *err;
2215 {
2216   bin_tree_t *tree, *exp;
2217   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2218   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2219   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2220     return NULL;
2221
2222   while (token->type != OP_ALT && token->type != END_OF_RE
2223          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2224     {
2225       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2226       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2227         {
2228           return NULL;
2229         }
2230       if (tree != NULL && exp != NULL)
2231         {
2232           tree = create_tree (dfa, tree, exp, CONCAT);
2233           if (tree == NULL)
2234             {
2235               *err = REG_ESPACE;
2236               return NULL;
2237             }
2238         }
2239       else if (tree == NULL)
2240         tree = exp;
2241       /* Otherwise exp == NULL, we don't need to create new tree.  */
2242     }
2243   return tree;
2244 }
2245
2246 /* This function build the following tree, from regular expression a*:
2247          *
2248          |
2249          a
2250 */
2251
2252 static bin_tree_t *
2253 parse_expression (regexp, preg, token, syntax, nest, err)
2254      re_string_t *regexp;
2255      regex_t *preg;
2256      re_token_t *token;
2257      reg_syntax_t syntax;
2258      int nest;
2259      reg_errcode_t *err;
2260 {
2261   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2262   bin_tree_t *tree;
2263   switch (token->type)
2264     {
2265     case CHARACTER:
2266       tree = create_token_tree (dfa, NULL, NULL, token);
2267       if (BE (tree == NULL, 0))
2268         {
2269           *err = REG_ESPACE;
2270           return NULL;
2271         }
2272 #ifdef RE_ENABLE_I18N
2273       if (dfa->mb_cur_max > 1)
2274         {
2275           while (!re_string_eoi (regexp)
2276                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2277             {
2278               bin_tree_t *mbc_remain;
2279               fetch_token (token, regexp, syntax);
2280               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2281               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2282               if (BE (mbc_remain == NULL || tree == NULL, 0))
2283                 {
2284                   *err = REG_ESPACE;
2285                   return NULL;
2286                 }
2287             }
2288         }
2289 #endif
2290       break;
2291     case OP_OPEN_SUBEXP:
2292       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2293       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2294         return NULL;
2295       break;
2296     case OP_OPEN_BRACKET:
2297       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2298       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2299         return NULL;
2300       break;
2301     case OP_BACK_REF:
2302       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2303         {
2304           *err = REG_ESUBREG;
2305           return NULL;
2306         }
2307       dfa->used_bkref_map |= 1 << token->opr.idx;
2308       tree = create_token_tree (dfa, NULL, NULL, token);
2309       if (BE (tree == NULL, 0))
2310         {
2311           *err = REG_ESPACE;
2312           return NULL;
2313         }
2314       ++dfa->nbackref;
2315       dfa->has_mb_node = 1;
2316       break;
2317     case OP_OPEN_DUP_NUM:
2318       if (syntax & RE_CONTEXT_INVALID_DUP)
2319         {
2320           *err = REG_BADRPT;
2321           return NULL;
2322         }
2323       /* FALLTHROUGH */
2324     case OP_DUP_ASTERISK:
2325     case OP_DUP_PLUS:
2326     case OP_DUP_QUESTION:
2327       if (syntax & RE_CONTEXT_INVALID_OPS)
2328         {
2329           *err = REG_BADRPT;
2330           return NULL;
2331         }
2332       else if (syntax & RE_CONTEXT_INDEP_OPS)
2333         {
2334           fetch_token (token, regexp, syntax);
2335           return parse_expression (regexp, preg, token, syntax, nest, err);
2336         }
2337       /* else fall through  */
2338     case OP_CLOSE_SUBEXP:
2339       if ((token->type == OP_CLOSE_SUBEXP) &&
2340           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2341         {
2342           *err = REG_ERPAREN;
2343           return NULL;
2344         }
2345       /* else fall through  */
2346     case OP_CLOSE_DUP_NUM:
2347       /* We treat it as a normal character.  */
2348
2349       /* Then we can these characters as normal characters.  */
2350       token->type = CHARACTER;
2351       /* mb_partial and word_char bits should be initialized already
2352          by peek_token.  */
2353       tree = create_token_tree (dfa, NULL, NULL, token);
2354       if (BE (tree == NULL, 0))
2355         {
2356           *err = REG_ESPACE;
2357           return NULL;
2358         }
2359       break;
2360     case ANCHOR:
2361       if ((token->opr.ctx_type
2362            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2363           && dfa->word_ops_used == 0)
2364         init_word_char (dfa);
2365       if (token->opr.ctx_type == WORD_DELIM
2366           || token->opr.ctx_type == NOT_WORD_DELIM)
2367         {
2368           bin_tree_t *tree_first, *tree_last;
2369           if (token->opr.ctx_type == WORD_DELIM)
2370             {
2371               token->opr.ctx_type = WORD_FIRST;
2372               tree_first = create_token_tree (dfa, NULL, NULL, token);
2373               token->opr.ctx_type = WORD_LAST;
2374             }
2375           else
2376             {
2377               token->opr.ctx_type = INSIDE_WORD;
2378               tree_first = create_token_tree (dfa, NULL, NULL, token);
2379               token->opr.ctx_type = INSIDE_NOTWORD;
2380             }
2381           tree_last = create_token_tree (dfa, NULL, NULL, token);
2382           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2383           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2384             {
2385               *err = REG_ESPACE;
2386               return NULL;
2387             }
2388         }
2389       else
2390         {
2391           tree = create_token_tree (dfa, NULL, NULL, token);
2392           if (BE (tree == NULL, 0))
2393             {
2394               *err = REG_ESPACE;
2395               return NULL;
2396             }
2397         }
2398       /* We must return here, since ANCHORs can't be followed
2399          by repetition operators.
2400          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2401              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2402       fetch_token (token, regexp, syntax);
2403       return tree;
2404     case OP_PERIOD:
2405       tree = create_token_tree (dfa, NULL, NULL, token);
2406       if (BE (tree == NULL, 0))
2407         {
2408           *err = REG_ESPACE;
2409           return NULL;
2410         }
2411       if (dfa->mb_cur_max > 1)
2412         dfa->has_mb_node = 1;
2413       break;
2414     case OP_WORD:
2415     case OP_NOTWORD:
2416       tree = build_charclass_op (dfa, regexp->trans,
2417                                  (const unsigned char *) "alnum",
2418                                  (const unsigned char *) "_",
2419                                  token->type == OP_NOTWORD, err);
2420       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2421         return NULL;
2422       break;
2423     case OP_SPACE:
2424     case OP_NOTSPACE:
2425       tree = build_charclass_op (dfa, regexp->trans,
2426                                  (const unsigned char *) "space",
2427                                  (const unsigned char *) "",
2428                                  token->type == OP_NOTSPACE, err);
2429       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2430         return NULL;
2431       break;
2432     case OP_ALT:
2433     case END_OF_RE:
2434       return NULL;
2435     case BACK_SLASH:
2436       *err = REG_EESCAPE;
2437       return NULL;
2438     default:
2439       /* Must not happen?  */
2440 #ifdef DEBUG
2441       assert (0);
2442 #endif
2443       return NULL;
2444     }
2445   fetch_token (token, regexp, syntax);
2446
2447   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2448          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2449     {
2450       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2451       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2452         return NULL;
2453       /* In BRE consecutive duplications are not allowed.  */
2454       if ((syntax & RE_CONTEXT_INVALID_DUP)
2455           && (token->type == OP_DUP_ASTERISK
2456               || token->type == OP_OPEN_DUP_NUM))
2457         {
2458           *err = REG_BADRPT;
2459           return NULL;
2460         }
2461     }
2462
2463   return tree;
2464 }
2465
2466 /* This function build the following tree, from regular expression
2467    (<reg_exp>):
2468          SUBEXP
2469             |
2470         <reg_exp>
2471 */
2472
2473 static bin_tree_t *
2474 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2475      re_string_t *regexp;
2476      regex_t *preg;
2477      re_token_t *token;
2478      reg_syntax_t syntax;
2479      int nest;
2480      reg_errcode_t *err;
2481 {
2482   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2483   bin_tree_t *tree;
2484   size_t cur_nsub;
2485   cur_nsub = preg->re_nsub++;
2486
2487   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2488
2489   /* The subexpression may be a null string.  */
2490   if (token->type == OP_CLOSE_SUBEXP)
2491     tree = NULL;
2492   else
2493     {
2494       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2496         *err = REG_EPAREN;
2497       if (BE (*err != REG_NOERROR, 0))
2498         return NULL;
2499     }
2500
2501   if (cur_nsub <= '9' - '1')
2502     dfa->completed_bkref_map |= 1 << cur_nsub;
2503
2504   tree = create_tree (dfa, tree, NULL, SUBEXP);
2505   if (BE (tree == NULL, 0))
2506     {
2507       *err = REG_ESPACE;
2508       return NULL;
2509     }
2510   tree->token.opr.idx = cur_nsub;
2511   return tree;
2512 }
2513
2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2515
2516 static bin_tree_t *
2517 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2518      bin_tree_t *elem;
2519      re_string_t *regexp;
2520      re_dfa_t *dfa;
2521      re_token_t *token;
2522      reg_syntax_t syntax;
2523      reg_errcode_t *err;
2524 {
2525   bin_tree_t *tree = NULL, *old_tree = NULL;
2526   int i, start, end, start_idx = re_string_cur_idx (regexp);
2527   re_token_t start_token = *token;
2528
2529   if (token->type == OP_OPEN_DUP_NUM)
2530     {
2531       end = 0;
2532       start = fetch_number (regexp, token, syntax);
2533       if (start == -1)
2534         {
2535           if (token->type == CHARACTER && token->opr.c == ',')
2536             start = 0; /* We treat "{,m}" as "{0,m}".  */
2537           else
2538             {
2539               *err = REG_BADBR; /* <re>{} is invalid.  */
2540               return NULL;
2541             }
2542         }
2543       if (BE (start != -2, 1))
2544         {
2545           /* We treat "{n}" as "{n,n}".  */
2546           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2547                  : ((token->type == CHARACTER && token->opr.c == ',')
2548                     ? fetch_number (regexp, token, syntax) : -2));
2549         }
2550       if (BE (start == -2 || end == -2, 0))
2551         {
2552           /* Invalid sequence.  */
2553           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2554             {
2555               if (token->type == END_OF_RE)
2556                 *err = REG_EBRACE;
2557               else
2558                 *err = REG_BADBR;
2559
2560               return NULL;
2561             }
2562
2563           /* If the syntax bit is set, rollback.  */
2564           re_string_set_index (regexp, start_idx);
2565           *token = start_token;
2566           token->type = CHARACTER;
2567           /* mb_partial and word_char bits should be already initialized by
2568              peek_token.  */
2569           return elem;
2570         }
2571
2572       if (BE (end != -1 && start > end, 0))
2573         {
2574           /* First number greater than second.  */
2575           *err = REG_BADBR;
2576           return NULL;
2577         }
2578     }
2579   else
2580     {
2581       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2582       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2583     }
2584
2585   fetch_token (token, regexp, syntax);
2586
2587   if (BE (elem == NULL, 0))
2588     return NULL;
2589   if (BE (start == 0 && end == 0, 0))
2590     {
2591       postorder (elem, free_tree, NULL);
2592       return NULL;
2593     }
2594
2595   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2596   if (BE (start > 0, 0))
2597     {
2598       tree = elem;
2599       for (i = 2; i <= start; ++i)
2600         {
2601           elem = duplicate_tree (elem, dfa);
2602           tree = create_tree (dfa, tree, elem, CONCAT);
2603           if (BE (elem == NULL || tree == NULL, 0))
2604             goto parse_dup_op_espace;
2605         }
2606
2607       if (start == end)
2608         return tree;
2609
2610       /* Duplicate ELEM before it is marked optional.  */
2611       elem = duplicate_tree (elem, dfa);
2612       old_tree = tree;
2613     }
2614   else
2615     old_tree = NULL;
2616
2617   if (elem->token.type == SUBEXP)
2618     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2619
2620   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2621   if (BE (tree == NULL, 0))
2622     goto parse_dup_op_espace;
2623
2624   /* This loop is actually executed only when end != -1,
2625      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2626      already created the start+1-th copy.  */
2627   for (i = start + 2; i <= end; ++i)
2628     {
2629       elem = duplicate_tree (elem, dfa);
2630       tree = create_tree (dfa, tree, elem, CONCAT);
2631       if (BE (elem == NULL || tree == NULL, 0))
2632         goto parse_dup_op_espace;
2633
2634       tree = create_tree (dfa, tree, NULL, OP_ALT);
2635       if (BE (tree == NULL, 0))
2636         goto parse_dup_op_espace;
2637     }
2638
2639   if (old_tree)
2640     tree = create_tree (dfa, old_tree, tree, CONCAT);
2641
2642   return tree;
2643
2644  parse_dup_op_espace:
2645   *err = REG_ESPACE;
2646   return NULL;
2647 }
2648
2649 /* Size of the names for collating symbol/equivalence_class/character_class.
2650    I'm not sure, but maybe enough.  */
2651 #define BRACKET_NAME_BUF_SIZE 32
2652
2653 #ifndef _LIBC
2654   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2655      Build the range expression which starts from START_ELEM, and ends
2656      at END_ELEM.  The result are written to MBCSET and SBCSET.
2657      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2658      mbcset->range_ends, is a pointer argument sinse we may
2659      update it.  */
2660
2661 static reg_errcode_t
2662 # ifdef RE_ENABLE_I18N
2663 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2664      re_charset_t *mbcset;
2665      int *range_alloc;
2666 # else /* not RE_ENABLE_I18N */
2667 build_range_exp (sbcset, start_elem, end_elem)
2668 # endif /* not RE_ENABLE_I18N */
2669      re_bitset_ptr_t sbcset;
2670      bracket_elem_t *start_elem, *end_elem;
2671 {
2672   unsigned int start_ch, end_ch;
2673   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2674   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2675           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2676           0))
2677     return REG_ERANGE;
2678
2679   /* We can handle no multi character collating elements without libc
2680      support.  */
2681   if (BE ((start_elem->type == COLL_SYM
2682            && strlen ((char *) start_elem->opr.name) > 1)
2683           || (end_elem->type == COLL_SYM
2684               && strlen ((char *) end_elem->opr.name) > 1), 0))
2685     return REG_ECOLLATE;
2686
2687 # ifdef RE_ENABLE_I18N
2688   {
2689     wchar_t wc;
2690     wint_t start_wc;
2691     wint_t end_wc;
2692     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2693
2694     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2695                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2696                    : 0));
2697     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2698               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2699                  : 0));
2700     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2701                 ? __btowc (start_ch) : start_elem->opr.wch);
2702     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2703               ? __btowc (end_ch) : end_elem->opr.wch);
2704     if (start_wc == WEOF || end_wc == WEOF)
2705       return REG_ECOLLATE;
2706     cmp_buf[0] = start_wc;
2707     cmp_buf[4] = end_wc;
2708     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2709       return REG_ERANGE;
2710
2711     /* Got valid collation sequence values, add them as a new entry.
2712        However, for !_LIBC we have no collation elements: if the
2713        character set is single byte, the single byte character set
2714        that we build below suffices.  parse_bracket_exp passes
2715        no MBCSET if dfa->mb_cur_max == 1.  */
2716     if (mbcset)
2717       {
2718         /* Check the space of the arrays.  */
2719         if (BE (*range_alloc == mbcset->nranges, 0))
2720           {
2721             /* There is not enough space, need realloc.  */
2722             wchar_t *new_array_start, *new_array_end;
2723             int new_nranges;
2724
2725             /* +1 in case of mbcset->nranges is 0.  */
2726             new_nranges = 2 * mbcset->nranges + 1;
2727             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2728                are NULL if *range_alloc == 0.  */
2729             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2730                                           new_nranges);
2731             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2732                                         new_nranges);
2733
2734             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2735               return REG_ESPACE;
2736
2737             mbcset->range_starts = new_array_start;
2738             mbcset->range_ends = new_array_end;
2739             *range_alloc = new_nranges;
2740           }
2741
2742         mbcset->range_starts[mbcset->nranges] = start_wc;
2743         mbcset->range_ends[mbcset->nranges++] = end_wc;
2744       }
2745
2746     /* Build the table for single byte characters.  */
2747     for (wc = 0; wc < SBC_MAX; ++wc)
2748       {
2749         cmp_buf[2] = wc;
2750         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2751             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2752           bitset_set (sbcset, wc);
2753       }
2754   }
2755 # else /* not RE_ENABLE_I18N */
2756   {
2757     unsigned int ch;
2758     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2759                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2760                    : 0));
2761     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2762               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2763                  : 0));
2764     if (start_ch > end_ch)
2765       return REG_ERANGE;
2766     /* Build the table for single byte characters.  */
2767     for (ch = 0; ch < SBC_MAX; ++ch)
2768       if (start_ch <= ch  && ch <= end_ch)
2769         bitset_set (sbcset, ch);
2770   }
2771 # endif /* not RE_ENABLE_I18N */
2772   return REG_NOERROR;
2773 }
2774 #endif /* not _LIBC */
2775
2776 #ifndef _LIBC
2777 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2778    Build the collating element which is represented by NAME.
2779    The result are written to MBCSET and SBCSET.
2780    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2781    pointer argument since we may update it.  */
2782
2783 static reg_errcode_t
2784 # ifdef RE_ENABLE_I18N
2785 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2786      re_charset_t *mbcset;
2787      int *coll_sym_alloc;
2788 # else /* not RE_ENABLE_I18N */
2789 build_collating_symbol (sbcset, name)
2790 # endif /* not RE_ENABLE_I18N */
2791      re_bitset_ptr_t sbcset;
2792      const unsigned char *name;
2793 {
2794   size_t name_len = strlen ((const char *) name);
2795   if (BE (name_len != 1, 0))
2796     return REG_ECOLLATE;
2797   else
2798     {
2799       bitset_set (sbcset, name[0]);
2800       return REG_NOERROR;
2801     }
2802 }
2803 #endif /* not _LIBC */
2804
2805 /* This function parse bracket expression like "[abc]", "[a-c]",
2806    "[[.a-a.]]" etc.  */
2807
2808 static bin_tree_t *
2809 parse_bracket_exp (regexp, dfa, token, syntax, err)
2810      re_string_t *regexp;
2811      re_dfa_t *dfa;
2812      re_token_t *token;
2813      reg_syntax_t syntax;
2814      reg_errcode_t *err;
2815 {
2816 #ifdef _LIBC
2817   const unsigned char *collseqmb;
2818   const char *collseqwc;
2819   uint32_t nrules;
2820   int32_t table_size;
2821   const int32_t *symb_table;
2822   const unsigned char *extra;
2823
2824   /* Local function for parse_bracket_exp used in _LIBC environement.
2825      Seek the collating symbol entry correspondings to NAME.
2826      Return the index of the symbol in the SYMB_TABLE.  */
2827
2828   auto inline int32_t
2829   __attribute ((always_inline))
2830   seek_collating_symbol_entry (name, name_len)
2831          const unsigned char *name;
2832          size_t name_len;
2833     {
2834       int32_t hash = elem_hash ((const char *) name, name_len);
2835       int32_t elem = hash % table_size;
2836       int32_t second = hash % (table_size - 2);
2837       while (symb_table[2 * elem] != 0)
2838         {
2839           /* First compare the hashing value.  */
2840           if (symb_table[2 * elem] == hash
2841               /* Compare the length of the name.  */
2842               && name_len == extra[symb_table[2 * elem + 1]]
2843               /* Compare the name.  */
2844               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2845                          name_len) == 0)
2846             {
2847               /* Yep, this is the entry.  */
2848               break;
2849             }
2850
2851           /* Next entry.  */
2852           elem += second;
2853         }
2854       return elem;
2855     }
2856
2857   /* Local function for parse_bracket_exp used in _LIBC environement.
2858      Look up the collation sequence value of BR_ELEM.
2859      Return the value if succeeded, UINT_MAX otherwise.  */
2860
2861   auto inline unsigned int
2862   __attribute ((always_inline))
2863   lookup_collation_sequence_value (br_elem)
2864          bracket_elem_t *br_elem;
2865     {
2866       if (br_elem->type == SB_CHAR)
2867         {
2868           /*
2869           if (MB_CUR_MAX == 1)
2870           */
2871           if (nrules == 0)
2872             return collseqmb[br_elem->opr.ch];
2873           else
2874             {
2875               wint_t wc = __btowc (br_elem->opr.ch);
2876               return __collseq_table_lookup (collseqwc, wc);
2877             }
2878         }
2879       else if (br_elem->type == MB_CHAR)
2880         {
2881           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2882         }
2883       else if (br_elem->type == COLL_SYM)
2884         {
2885           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2886           if (nrules != 0)
2887             {
2888               int32_t elem, idx;
2889               elem = seek_collating_symbol_entry (br_elem->opr.name,
2890                                                   sym_name_len);
2891               if (symb_table[2 * elem] != 0)
2892                 {
2893                   /* We found the entry.  */
2894                   idx = symb_table[2 * elem + 1];
2895                   /* Skip the name of collating element name.  */
2896                   idx += 1 + extra[idx];
2897                   /* Skip the byte sequence of the collating element.  */
2898                   idx += 1 + extra[idx];
2899                   /* Adjust for the alignment.  */
2900                   idx = (idx + 3) & ~3;
2901                   /* Skip the multibyte collation sequence value.  */
2902                   idx += sizeof (unsigned int);
2903                   /* Skip the wide char sequence of the collating element.  */
2904                   idx += sizeof (unsigned int) *
2905                     (1 + *(unsigned int *) (extra + idx));
2906                   /* Return the collation sequence value.  */
2907                   return *(unsigned int *) (extra + idx);
2908                 }
2909               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2910                 {
2911                   /* No valid character.  Match it as a single byte
2912                      character.  */
2913                   return collseqmb[br_elem->opr.name[0]];
2914                 }
2915             }
2916           else if (sym_name_len == 1)
2917             return collseqmb[br_elem->opr.name[0]];
2918         }
2919       return UINT_MAX;
2920     }
2921
2922   /* Local function for parse_bracket_exp used in _LIBC environement.
2923      Build the range expression which starts from START_ELEM, and ends
2924      at END_ELEM.  The result are written to MBCSET and SBCSET.
2925      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2926      mbcset->range_ends, is a pointer argument sinse we may
2927      update it.  */
2928
2929   auto inline reg_errcode_t
2930   __attribute ((always_inline))
2931   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2932          re_charset_t *mbcset;
2933          int *range_alloc;
2934          re_bitset_ptr_t sbcset;
2935          bracket_elem_t *start_elem, *end_elem;
2936     {
2937       unsigned int ch;
2938       uint32_t start_collseq;
2939       uint32_t end_collseq;
2940
2941       /* Equivalence Classes and Character Classes can't be a range
2942          start/end.  */
2943       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2944               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2945               0))
2946         return REG_ERANGE;
2947
2948       start_collseq = lookup_collation_sequence_value (start_elem);
2949       end_collseq = lookup_collation_sequence_value (end_elem);
2950       /* Check start/end collation sequence values.  */
2951       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2952         return REG_ECOLLATE;
2953       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2954         return REG_ERANGE;
2955
2956       /* Got valid collation sequence values, add them as a new entry.
2957          However, if we have no collation elements, and the character set
2958          is single byte, the single byte character set that we
2959          build below suffices. */
2960       if (nrules > 0 || dfa->mb_cur_max > 1)
2961         {
2962           /* Check the space of the arrays.  */
2963           if (BE (*range_alloc == mbcset->nranges, 0))
2964             {
2965               /* There is not enough space, need realloc.  */
2966               uint32_t *new_array_start;
2967               uint32_t *new_array_end;
2968               int new_nranges;
2969
2970               /* +1 in case of mbcset->nranges is 0.  */
2971               new_nranges = 2 * mbcset->nranges + 1;
2972               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2973                                             new_nranges);
2974               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2975                                           new_nranges);
2976
2977               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2978                 return REG_ESPACE;
2979
2980               mbcset->range_starts = new_array_start;
2981               mbcset->range_ends = new_array_end;
2982               *range_alloc = new_nranges;
2983             }
2984
2985           mbcset->range_starts[mbcset->nranges] = start_collseq;
2986           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2987         }
2988
2989       /* Build the table for single byte characters.  */
2990       for (ch = 0; ch < SBC_MAX; ch++)
2991         {
2992           uint32_t ch_collseq;
2993           /*
2994           if (MB_CUR_MAX == 1)
2995           */
2996           if (nrules == 0)
2997             ch_collseq = collseqmb[ch];
2998           else
2999             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3000           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3001             bitset_set (sbcset, ch);
3002         }
3003       return REG_NOERROR;
3004     }
3005
3006   /* Local function for parse_bracket_exp used in _LIBC environement.
3007      Build the collating element which is represented by NAME.
3008      The result are written to MBCSET and SBCSET.
3009      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3010      pointer argument sinse we may update it.  */
3011
3012   auto inline reg_errcode_t
3013   __attribute ((always_inline))
3014   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3015          re_charset_t *mbcset;
3016          int *coll_sym_alloc;
3017          re_bitset_ptr_t sbcset;
3018          const unsigned char *name;
3019     {
3020       int32_t elem, idx;
3021       size_t name_len = strlen ((const char *) name);
3022       if (nrules != 0)
3023         {
3024           elem = seek_collating_symbol_entry (name, name_len);
3025           if (symb_table[2 * elem] != 0)
3026             {
3027               /* We found the entry.  */
3028               idx = symb_table[2 * elem + 1];
3029               /* Skip the name of collating element name.  */
3030               idx += 1 + extra[idx];
3031             }
3032           else if (symb_table[2 * elem] == 0 && name_len == 1)
3033             {
3034               /* No valid character, treat it as a normal
3035                  character.  */
3036               bitset_set (sbcset, name[0]);
3037               return REG_NOERROR;
3038             }
3039           else
3040             return REG_ECOLLATE;
3041
3042           /* Got valid collation sequence, add it as a new entry.  */
3043           /* Check the space of the arrays.  */
3044           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3045             {
3046               /* Not enough, realloc it.  */
3047               /* +1 in case of mbcset->ncoll_syms is 0.  */
3048               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3049               /* Use realloc since mbcset->coll_syms is NULL
3050                  if *alloc == 0.  */
3051               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3052                                                    new_coll_sym_alloc);
3053               if (BE (new_coll_syms == NULL, 0))
3054                 return REG_ESPACE;
3055               mbcset->coll_syms = new_coll_syms;
3056               *coll_sym_alloc = new_coll_sym_alloc;
3057             }
3058           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3059           return REG_NOERROR;
3060         }
3061       else
3062         {
3063           if (BE (name_len != 1, 0))
3064             return REG_ECOLLATE;
3065           else
3066             {
3067               bitset_set (sbcset, name[0]);
3068               return REG_NOERROR;
3069             }
3070         }
3071     }
3072 #endif
3073
3074   re_token_t br_token;
3075   re_bitset_ptr_t sbcset;
3076 #ifdef RE_ENABLE_I18N
3077   re_charset_t *mbcset;
3078   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3079   int equiv_class_alloc = 0, char_class_alloc = 0;
3080 #endif /* not RE_ENABLE_I18N */
3081   int non_match = 0;
3082   bin_tree_t *work_tree;
3083   int token_len;
3084   int first_round = 1;
3085 #ifdef _LIBC
3086   collseqmb = (const unsigned char *)
3087     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3088   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3089   if (nrules)
3090     {
3091       /*
3092       if (MB_CUR_MAX > 1)
3093       */
3094         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3095       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3096       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3097                                                   _NL_COLLATE_SYMB_TABLEMB);
3098       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3099                                                    _NL_COLLATE_SYMB_EXTRAMB);
3100     }
3101 #endif
3102   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3103 #ifdef RE_ENABLE_I18N
3104   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3105 #endif /* RE_ENABLE_I18N */
3106 #ifdef RE_ENABLE_I18N
3107   if (BE (sbcset == NULL || mbcset == NULL, 0))
3108 #else
3109   if (BE (sbcset == NULL, 0))
3110 #endif /* RE_ENABLE_I18N */
3111     {
3112       *err = REG_ESPACE;
3113       return NULL;
3114     }
3115
3116   token_len = peek_token_bracket (token, regexp, syntax);
3117   if (BE (token->type == END_OF_RE, 0))
3118     {
3119       *err = REG_BADPAT;
3120       goto parse_bracket_exp_free_return;
3121     }
3122   if (token->type == OP_NON_MATCH_LIST)
3123     {
3124 #ifdef RE_ENABLE_I18N
3125       mbcset->non_match = 1;
3126 #endif /* not RE_ENABLE_I18N */
3127       non_match = 1;
3128       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3129         bitset_set (sbcset, '\0');
3130       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3131       token_len = peek_token_bracket (token, regexp, syntax);
3132       if (BE (token->type == END_OF_RE, 0))
3133         {
3134           *err = REG_BADPAT;
3135           goto parse_bracket_exp_free_return;
3136         }
3137     }
3138
3139   /* We treat the first ']' as a normal character.  */
3140   if (token->type == OP_CLOSE_BRACKET)
3141     token->type = CHARACTER;
3142
3143   while (1)
3144     {
3145       bracket_elem_t start_elem, end_elem;
3146       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3147       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3148       reg_errcode_t ret;
3149       int token_len2 = 0, is_range_exp = 0;
3150       re_token_t token2;
3151
3152       start_elem.opr.name = start_name_buf;
3153       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3154                                    syntax, first_round);
3155       if (BE (ret != REG_NOERROR, 0))
3156         {
3157           *err = ret;
3158           goto parse_bracket_exp_free_return;
3159         }
3160       first_round = 0;
3161
3162       /* Get information about the next token.  We need it in any case.  */
3163       token_len = peek_token_bracket (token, regexp, syntax);
3164
3165       /* Do not check for ranges if we know they are not allowed.  */
3166       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3167         {
3168           if (BE (token->type == END_OF_RE, 0))
3169             {
3170               *err = REG_EBRACK;
3171               goto parse_bracket_exp_free_return;
3172             }
3173           if (token->type == OP_CHARSET_RANGE)
3174             {
3175               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3176               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3177               if (BE (token2.type == END_OF_RE, 0))
3178                 {
3179                   *err = REG_EBRACK;
3180                   goto parse_bracket_exp_free_return;
3181                 }
3182               if (token2.type == OP_CLOSE_BRACKET)
3183                 {
3184                   /* We treat the last '-' as a normal character.  */
3185                   re_string_skip_bytes (regexp, -token_len);
3186                   token->type = CHARACTER;
3187                 }
3188               else
3189                 is_range_exp = 1;
3190             }
3191         }
3192
3193       if (is_range_exp == 1)
3194         {
3195           end_elem.opr.name = end_name_buf;
3196           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3197                                        dfa, syntax, 1);
3198           if (BE (ret != REG_NOERROR, 0))
3199             {
3200               *err = ret;
3201               goto parse_bracket_exp_free_return;
3202             }
3203
3204           token_len = peek_token_bracket (token, regexp, syntax);
3205
3206 #ifdef _LIBC
3207           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3208                                   &start_elem, &end_elem);
3209 #else
3210 # ifdef RE_ENABLE_I18N
3211           *err = build_range_exp (sbcset,
3212                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3213                                   &range_alloc, &start_elem, &end_elem);
3214 # else
3215           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3216 # endif
3217 #endif /* RE_ENABLE_I18N */
3218           if (BE (*err != REG_NOERROR, 0))
3219             goto parse_bracket_exp_free_return;
3220         }
3221       else
3222         {
3223           switch (start_elem.type)
3224             {
3225             case SB_CHAR:
3226               bitset_set (sbcset, start_elem.opr.ch);
3227               break;
3228 #ifdef RE_ENABLE_I18N
3229             case MB_CHAR:
3230               /* Check whether the array has enough space.  */
3231               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3232                 {
3233                   wchar_t *new_mbchars;
3234                   /* Not enough, realloc it.  */
3235                   /* +1 in case of mbcset->nmbchars is 0.  */
3236                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3237                   /* Use realloc since array is NULL if *alloc == 0.  */
3238                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3239                                             mbchar_alloc);
3240                   if (BE (new_mbchars == NULL, 0))
3241                     goto parse_bracket_exp_espace;
3242                   mbcset->mbchars = new_mbchars;
3243                 }
3244               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3245               break;
3246 #endif /* RE_ENABLE_I18N */
3247             case EQUIV_CLASS:
3248               *err = build_equiv_class (sbcset,
3249 #ifdef RE_ENABLE_I18N
3250                                         mbcset, &equiv_class_alloc,
3251 #endif /* RE_ENABLE_I18N */
3252                                         start_elem.opr.name);
3253               if (BE (*err != REG_NOERROR, 0))
3254                 goto parse_bracket_exp_free_return;
3255               break;
3256             case COLL_SYM:
3257               *err = build_collating_symbol (sbcset,
3258 #ifdef RE_ENABLE_I18N
3259                                              mbcset, &coll_sym_alloc,
3260 #endif /* RE_ENABLE_I18N */
3261                                              start_elem.opr.name);
3262               if (BE (*err != REG_NOERROR, 0))
3263                 goto parse_bracket_exp_free_return;
3264               break;
3265             case CHAR_CLASS:
3266               *err = build_charclass (regexp->trans, sbcset,
3267 #ifdef RE_ENABLE_I18N
3268                                       mbcset, &char_class_alloc,
3269 #endif /* RE_ENABLE_I18N */
3270                                       start_elem.opr.name, syntax);
3271               if (BE (*err != REG_NOERROR, 0))
3272                goto parse_bracket_exp_free_return;
3273               break;
3274             default:
3275               assert (0);
3276               break;
3277             }
3278         }
3279       if (BE (token->type == END_OF_RE, 0))
3280         {
3281           *err = REG_EBRACK;
3282           goto parse_bracket_exp_free_return;
3283         }
3284       if (token->type == OP_CLOSE_BRACKET)
3285         break;
3286     }
3287
3288   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3289
3290   /* If it is non-matching list.  */
3291   if (non_match)
3292     bitset_not (sbcset);
3293
3294 #ifdef RE_ENABLE_I18N
3295   /* Ensure only single byte characters are set.  */
3296   if (dfa->mb_cur_max > 1)
3297     bitset_mask (sbcset, dfa->sb_char);
3298
3299   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3300       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3301                                                      || mbcset->non_match)))
3302     {
3303       bin_tree_t *mbc_tree;
3304       int sbc_idx;
3305       /* Build a tree for complex bracket.  */
3306       dfa->has_mb_node = 1;
3307       br_token.type = COMPLEX_BRACKET;
3308       br_token.opr.mbcset = mbcset;
3309       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3310       if (BE (mbc_tree == NULL, 0))
3311         goto parse_bracket_exp_espace;
3312       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3313         if (sbcset[sbc_idx])
3314           break;
3315       /* If there are no bits set in sbcset, there is no point
3316          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3317       if (sbc_idx < BITSET_UINTS)
3318         {
3319           /* Build a tree for simple bracket.  */
3320           br_token.type = SIMPLE_BRACKET;
3321           br_token.opr.sbcset = sbcset;
3322           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3323           if (BE (work_tree == NULL, 0))
3324             goto parse_bracket_exp_espace;
3325
3326           /* Then join them by ALT node.  */
3327           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3328           if (BE (work_tree == NULL, 0))
3329             goto parse_bracket_exp_espace;
3330         }
3331       else
3332         {
3333           re_free (sbcset);
3334           work_tree = mbc_tree;
3335         }
3336     }
3337   else
3338 #endif /* not RE_ENABLE_I18N */
3339     {
3340 #ifdef RE_ENABLE_I18N
3341       free_charset (mbcset);
3342 #endif
3343       /* Build a tree for simple bracket.  */
3344       br_token.type = SIMPLE_BRACKET;
3345       br_token.opr.sbcset = sbcset;
3346       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3347       if (BE (work_tree == NULL, 0))
3348         goto parse_bracket_exp_espace;
3349     }
3350   return work_tree;
3351
3352  parse_bracket_exp_espace:
3353   *err = REG_ESPACE;
3354  parse_bracket_exp_free_return:
3355   re_free (sbcset);
3356 #ifdef RE_ENABLE_I18N
3357   free_charset (mbcset);
3358 #endif /* RE_ENABLE_I18N */
3359   return NULL;
3360 }
3361
3362 /* Parse an element in the bracket expression.  */
3363
3364 static reg_errcode_t
3365 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3366                        accept_hyphen)
3367      bracket_elem_t *elem;
3368      re_string_t *regexp;
3369      re_token_t *token;
3370      int token_len;
3371      re_dfa_t *dfa;
3372      reg_syntax_t syntax;
3373      int accept_hyphen;
3374 {
3375 #ifdef RE_ENABLE_I18N
3376   int cur_char_size;
3377   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3378   if (cur_char_size > 1)
3379     {
3380       elem->type = MB_CHAR;
3381       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3382       re_string_skip_bytes (regexp, cur_char_size);
3383       return REG_NOERROR;
3384     }
3385 #endif /* RE_ENABLE_I18N */
3386   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3387   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3388       || token->type == OP_OPEN_EQUIV_CLASS)
3389     return parse_bracket_symbol (elem, regexp, token);
3390   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3391     {
3392       /* A '-' must only appear as anything but a range indicator before
3393          the closing bracket.  Everything else is an error.  */
3394       re_token_t token2;
3395       (void) peek_token_bracket (&token2, regexp, syntax);
3396       if (token2.type != OP_CLOSE_BRACKET)
3397         /* The actual error value is not standardized since this whole
3398            case is undefined.  But ERANGE makes good sense.  */
3399         return REG_ERANGE;
3400     }
3401   elem->type = SB_CHAR;
3402   elem->opr.ch = token->opr.c;
3403   return REG_NOERROR;
3404 }
3405
3406 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3407    such as [:<character_class>:], [.<collating_element>.], and
3408    [=<equivalent_class>=].  */
3409
3410 static reg_errcode_t
3411 parse_bracket_symbol (elem, regexp, token)
3412      bracket_elem_t *elem;
3413      re_string_t *regexp;
3414      re_token_t *token;
3415 {
3416   unsigned char ch, delim = token->opr.c;
3417   int i = 0;
3418   if (re_string_eoi(regexp))
3419     return REG_EBRACK;
3420   for (;; ++i)
3421     {
3422       if (i >= BRACKET_NAME_BUF_SIZE)
3423         return REG_EBRACK;
3424       if (token->type == OP_OPEN_CHAR_CLASS)
3425         ch = re_string_fetch_byte_case (regexp);
3426       else
3427         ch = re_string_fetch_byte (regexp);
3428       if (re_string_eoi(regexp))
3429         return REG_EBRACK;
3430       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3431         break;
3432       elem->opr.name[i] = ch;
3433     }
3434   re_string_skip_bytes (regexp, 1);
3435   elem->opr.name[i] = '\0';
3436   switch (token->type)
3437     {
3438     case OP_OPEN_COLL_ELEM:
3439       elem->type = COLL_SYM;
3440       break;
3441     case OP_OPEN_EQUIV_CLASS:
3442       elem->type = EQUIV_CLASS;
3443       break;
3444     case OP_OPEN_CHAR_CLASS:
3445       elem->type = CHAR_CLASS;
3446       break;
3447     default:
3448       break;
3449     }
3450   return REG_NOERROR;
3451 }
3452
3453   /* Helper function for parse_bracket_exp.
3454      Build the equivalence class which is represented by NAME.
3455      The result are written to MBCSET and SBCSET.
3456      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3457      is a pointer argument sinse we may update it.  */
3458
3459 static reg_errcode_t
3460 #ifdef RE_ENABLE_I18N
3461 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3462      re_charset_t *mbcset;
3463      int *equiv_class_alloc;
3464 #else /* not RE_ENABLE_I18N */
3465 build_equiv_class (sbcset, name)
3466 #endif /* not RE_ENABLE_I18N */
3467      re_bitset_ptr_t sbcset;
3468      const unsigned char *name;
3469 {
3470 #if defined _LIBC
3471   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3472   if (nrules != 0)
3473     {
3474       const int32_t *table, *indirect;
3475       const unsigned char *weights, *extra, *cp;
3476       unsigned char char_buf[2];
3477       int32_t idx1, idx2;
3478       unsigned int ch;
3479       size_t len;
3480       /* This #include defines a local function!  */
3481 # include <locale/weight.h>
3482       /* Calculate the index for equivalence class.  */
3483       cp = name;
3484       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3485       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3486                                                _NL_COLLATE_WEIGHTMB);
3487       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3488                                                    _NL_COLLATE_EXTRAMB);
3489       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3490                                                 _NL_COLLATE_INDIRECTMB);
3491       idx1 = findidx (&cp);
3492       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3493         /* This isn't a valid character.  */
3494         return REG_ECOLLATE;
3495
3496       /* Build single byte matcing table for this equivalence class.  */
3497       char_buf[1] = (unsigned char) '\0';
3498       len = weights[idx1];
3499       for (ch = 0; ch < SBC_MAX; ++ch)
3500         {
3501           char_buf[0] = ch;
3502           cp = char_buf;
3503           idx2 = findidx (&cp);
3504 /*
3505           idx2 = table[ch];
3506 */
3507           if (idx2 == 0)
3508             /* This isn't a valid character.  */
3509             continue;
3510           if (len == weights[idx2])
3511             {
3512               int cnt = 0;
3513               while (cnt <= len &&
3514                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3515                 ++cnt;
3516
3517               if (cnt > len)
3518                 bitset_set (sbcset, ch);
3519             }
3520         }
3521       /* Check whether the array has enough space.  */
3522       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3523         {
3524           /* Not enough, realloc it.  */
3525           /* +1 in case of mbcset->nequiv_classes is 0.  */
3526           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3527           /* Use realloc since the array is NULL if *alloc == 0.  */
3528           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3529                                                    int32_t,
3530                                                    new_equiv_class_alloc);
3531           if (BE (new_equiv_classes == NULL, 0))
3532             return REG_ESPACE;
3533           mbcset->equiv_classes = new_equiv_classes;
3534           *equiv_class_alloc = new_equiv_class_alloc;
3535         }
3536       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3537     }
3538   else
3539 #endif /* _LIBC */
3540     {
3541       if (BE (strlen ((const char *) name) != 1, 0))
3542         return REG_ECOLLATE;
3543       bitset_set (sbcset, *name);
3544     }
3545   return REG_NOERROR;
3546 }
3547
3548   /* Helper function for parse_bracket_exp.
3549      Build the character class which is represented by NAME.
3550      The result are written to MBCSET and SBCSET.
3551      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3552      is a pointer argument sinse we may update it.  */
3553
3554 static reg_errcode_t
3555 #ifdef RE_ENABLE_I18N
3556 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3557      re_charset_t *mbcset;
3558      int *char_class_alloc;
3559 #else /* not RE_ENABLE_I18N */
3560 build_charclass (trans, sbcset, class_name, syntax)
3561 #endif /* not RE_ENABLE_I18N */
3562      RE_TRANSLATE_TYPE trans;
3563      re_bitset_ptr_t sbcset;
3564      const unsigned char *class_name;
3565      reg_syntax_t syntax;
3566 {
3567   int i;
3568   const char *name = (const char *) class_name;
3569
3570   /* In case of REG_ICASE "upper" and "lower" match the both of
3571      upper and lower cases.  */
3572   if ((syntax & RE_ICASE)
3573       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3574     name = "alpha";
3575
3576 #ifdef RE_ENABLE_I18N
3577   /* Check the space of the arrays.  */
3578   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3579     {
3580       /* Not enough, realloc it.  */
3581       /* +1 in case of mbcset->nchar_classes is 0.  */
3582       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3583       /* Use realloc since array is NULL if *alloc == 0.  */
3584       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3585                                                new_char_class_alloc);
3586       if (BE (new_char_classes == NULL, 0))
3587         return REG_ESPACE;
3588       mbcset->char_classes = new_char_classes;
3589       *char_class_alloc = new_char_class_alloc;
3590     }
3591   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3592 #endif /* RE_ENABLE_I18N */
3593
3594 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3595     for (i = 0; i < SBC_MAX; ++i)               \
3596       {                                         \
3597         if (ctype_func (i))                     \
3598           {                                     \
3599             int ch = trans ? trans[i] : i;      \
3600             bitset_set (sbcset, ch);            \
3601           }                                     \
3602       }
3603
3604   if (strcmp (name, "alnum") == 0)
3605     BUILD_CHARCLASS_LOOP (isalnum)
3606   else if (strcmp (name, "cntrl") == 0)
3607     BUILD_CHARCLASS_LOOP (iscntrl)
3608   else if (strcmp (name, "lower") == 0)
3609     BUILD_CHARCLASS_LOOP (islower)
3610   else if (strcmp (name, "space") == 0)
3611     BUILD_CHARCLASS_LOOP (isspace)
3612   else if (strcmp (name, "alpha") == 0)
3613     BUILD_CHARCLASS_LOOP (isalpha)
3614   else if (strcmp (name, "digit") == 0)
3615     BUILD_CHARCLASS_LOOP (isdigit)
3616   else if (strcmp (name, "print") == 0)
3617     BUILD_CHARCLASS_LOOP (isprint)
3618   else if (strcmp (name, "upper") == 0)
3619     BUILD_CHARCLASS_LOOP (isupper)
3620   else if (strcmp (name, "blank") == 0)
3621     BUILD_CHARCLASS_LOOP (isblank)
3622   else if (strcmp (name, "graph") == 0)
3623     BUILD_CHARCLASS_LOOP (isgraph)
3624   else if (strcmp (name, "punct") == 0)
3625     BUILD_CHARCLASS_LOOP (ispunct)
3626   else if (strcmp (name, "xdigit") == 0)
3627     BUILD_CHARCLASS_LOOP (isxdigit)
3628   else
3629     return REG_ECTYPE;
3630
3631   return REG_NOERROR;
3632 }
3633
3634 static bin_tree_t *
3635 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3636      re_dfa_t *dfa;
3637      RE_TRANSLATE_TYPE trans;
3638      const unsigned char *class_name;
3639      const unsigned char *extra;
3640      int non_match;
3641      reg_errcode_t *err;
3642 {
3643   re_bitset_ptr_t sbcset;
3644 #ifdef RE_ENABLE_I18N
3645   re_charset_t *mbcset;
3646   int alloc = 0;
3647 #endif /* not RE_ENABLE_I18N */
3648   reg_errcode_t ret;
3649   re_token_t br_token;
3650   bin_tree_t *tree;
3651
3652   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3653 #ifdef RE_ENABLE_I18N
3654   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3655 #endif /* RE_ENABLE_I18N */
3656
3657 #ifdef RE_ENABLE_I18N
3658   if (BE (sbcset == NULL || mbcset == NULL, 0))
3659 #else /* not RE_ENABLE_I18N */
3660   if (BE (sbcset == NULL, 0))
3661 #endif /* not RE_ENABLE_I18N */
3662     {
3663       *err = REG_ESPACE;
3664       return NULL;
3665     }
3666
3667   if (non_match)
3668     {
3669 #ifdef RE_ENABLE_I18N
3670       /*
3671       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3672         bitset_set(cset->sbcset, '\0');
3673       */
3674       mbcset->non_match = 1;
3675 #endif /* not RE_ENABLE_I18N */
3676     }
3677
3678   /* We don't care the syntax in this case.  */
3679   ret = build_charclass (trans, sbcset,
3680 #ifdef RE_ENABLE_I18N
3681                          mbcset, &alloc,
3682 #endif /* RE_ENABLE_I18N */
3683                          class_name, 0);
3684
3685   if (BE (ret != REG_NOERROR, 0))
3686     {
3687       re_free (sbcset);
3688 #ifdef RE_ENABLE_I18N
3689       free_charset (mbcset);
3690 #endif /* RE_ENABLE_I18N */
3691       *err = ret;
3692       return NULL;
3693     }
3694   /* \w match '_' also.  */
3695   for (; *extra; extra++)
3696     bitset_set (sbcset, *extra);
3697
3698   /* If it is non-matching list.  */
3699   if (non_match)
3700     bitset_not (sbcset);
3701
3702 #ifdef RE_ENABLE_I18N
3703   /* Ensure only single byte characters are set.  */
3704   if (dfa->mb_cur_max > 1)
3705     bitset_mask (sbcset, dfa->sb_char);
3706 #endif
3707
3708   /* Build a tree for simple bracket.  */
3709   br_token.type = SIMPLE_BRACKET;
3710   br_token.opr.sbcset = sbcset;
3711   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3712   if (BE (tree == NULL, 0))
3713     goto build_word_op_espace;
3714
3715 #ifdef RE_ENABLE_I18N
3716   if (dfa->mb_cur_max > 1)
3717     {
3718       bin_tree_t *mbc_tree;
3719       /* Build a tree for complex bracket.  */
3720       br_token.type = COMPLEX_BRACKET;
3721       br_token.opr.mbcset = mbcset;
3722       dfa->has_mb_node = 1;
3723       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724       if (BE (mbc_tree == NULL, 0))
3725         goto build_word_op_espace;
3726       /* Then join them by ALT node.  */
3727       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3728       if (BE (mbc_tree != NULL, 1))
3729         return tree;
3730     }
3731   else
3732     {
3733       free_charset (mbcset);
3734       return tree;
3735     }
3736 #else /* not RE_ENABLE_I18N */
3737   return tree;
3738 #endif /* not RE_ENABLE_I18N */
3739
3740  build_word_op_espace:
3741   re_free (sbcset);
3742 #ifdef RE_ENABLE_I18N
3743   free_charset (mbcset);
3744 #endif /* RE_ENABLE_I18N */
3745   *err = REG_ESPACE;
3746   return NULL;
3747 }
3748
3749 /* This is intended for the expressions like "a{1,3}".
3750    Fetch a number from `input', and return the number.
3751    Return -1, if the number field is empty like "{,1}".
3752    Return -2, If an error is occured.  */
3753
3754 static int
3755 fetch_number (input, token, syntax)
3756      re_string_t *input;
3757      re_token_t *token;
3758      reg_syntax_t syntax;
3759 {
3760   int num = -1;
3761   unsigned char c;
3762   while (1)
3763     {
3764       fetch_token (token, input, syntax);
3765       c = token->opr.c;
3766       if (BE (token->type == END_OF_RE, 0))
3767         return -2;
3768       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3769         break;
3770       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3771              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3772       num = (num > RE_DUP_MAX) ? -2 : num;
3773     }
3774   return num;
3775 }
3776 \f
3777 #ifdef RE_ENABLE_I18N
3778 static void
3779 free_charset (re_charset_t *cset)
3780 {
3781   re_free (cset->mbchars);
3782 # ifdef _LIBC
3783   re_free (cset->coll_syms);
3784   re_free (cset->equiv_classes);
3785   re_free (cset->range_starts);
3786   re_free (cset->range_ends);
3787 # endif
3788   re_free (cset->char_classes);
3789   re_free (cset);
3790 }
3791 #endif /* RE_ENABLE_I18N */
3792 \f
3793 /* Functions for binary tree operation.  */
3794
3795 /* Create a tree node.  */
3796
3797 static bin_tree_t *
3798 create_tree (dfa, left, right, type)
3799      re_dfa_t *dfa;
3800      bin_tree_t *left;
3801      bin_tree_t *right;
3802      re_token_type_t type;
3803 {
3804   re_token_t t;
3805   t.type = type;
3806   return create_token_tree (dfa, left, right, &t);
3807 }
3808
3809 static bin_tree_t *
3810 create_token_tree (dfa, left, right, token)
3811      re_dfa_t *dfa;
3812      bin_tree_t *left;
3813      bin_tree_t *right;
3814      const re_token_t *token;
3815 {
3816   bin_tree_t *tree;
3817   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3818     {
3819       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3820
3821       if (storage == NULL)
3822         return NULL;
3823       storage->next = dfa->str_tree_storage;
3824       dfa->str_tree_storage = storage;
3825       dfa->str_tree_storage_idx = 0;
3826     }
3827   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3828
3829   tree->parent = NULL;
3830   tree->left = left;
3831   tree->right = right;
3832   tree->token = *token;
3833   tree->token.duplicated = 0;
3834   tree->token.opt_subexp = 0;
3835   tree->first = NULL;
3836   tree->next = NULL;
3837   tree->node_idx = -1;
3838
3839   if (left != NULL)
3840     left->parent = tree;
3841   if (right != NULL)
3842     right->parent = tree;
3843   return tree;
3844 }
3845
3846 /* Mark the tree SRC as an optional subexpression.
3847    To be called from preorder or postorder.  */
3848
3849 static reg_errcode_t
3850 mark_opt_subexp (extra, node)
3851      void *extra;
3852      bin_tree_t *node;
3853 {
3854   int idx = (int) (long) extra;
3855   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3856     node->token.opt_subexp = 1;
3857
3858   return REG_NOERROR;
3859 }
3860
3861 /* Free the allocated memory inside NODE. */
3862
3863 static void
3864 free_token (re_token_t *node)
3865 {
3866 #ifdef RE_ENABLE_I18N
3867   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3868     free_charset (node->opr.mbcset);
3869   else
3870 #endif /* RE_ENABLE_I18N */
3871     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3872       re_free (node->opr.sbcset);
3873 }
3874
3875 /* Worker function for tree walking.  Free the allocated memory inside NODE
3876    and its children. */
3877
3878 static reg_errcode_t
3879 free_tree (void *extra, bin_tree_t *node)
3880 {
3881   free_token (&node->token);
3882   return REG_NOERROR;
3883 }
3884
3885
3886 /* Duplicate the node SRC, and return new node.  This is a preorder
3887    visit similar to the one implemented by the generic visitor, but
3888    we need more infrastructure to maintain two parallel trees --- so,
3889    it's easier to duplicate.  */
3890
3891 static bin_tree_t *
3892 duplicate_tree (root, dfa)
3893      const bin_tree_t *root;
3894      re_dfa_t *dfa;
3895 {
3896   const bin_tree_t *node;
3897   bin_tree_t *dup_root;
3898   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3899
3900   for (node = root; ; )
3901     {
3902       /* Create a new tree and link it back to the current parent.  */
3903       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3904       if (*p_new == NULL)
3905         return NULL;
3906       (*p_new)->parent = dup_node;
3907       (*p_new)->token.duplicated = 1;
3908       dup_node = *p_new;
3909
3910       /* Go to the left node, or up and to the right.  */
3911       if (node->left)
3912         {
3913           node = node->left;
3914           p_new = &dup_node->left;
3915         }
3916       else
3917         {
3918           const bin_tree_t *prev = NULL;
3919           while (node->right == prev || node->right == NULL)
3920             {
3921               prev = node;
3922               node = node->parent;
3923               dup_node = dup_node->parent;
3924               if (!node)
3925                 return dup_root;
3926             }
3927           node = node->right;
3928           p_new = &dup_node->right;
3929         }
3930     }
3931 }