posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static reg_errcode_t init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (re_dfa_t *dfa);
  37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  42                                              int top_clone_node, int root_node,
  43                                              unsigned int constraint);
  44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  45                                      unsigned int constraint);
  46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  47                                    unsigned int constraint);
  48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  50                                          int node, int root);
  51 static void calc_inveclosure (re_dfa_t *dfa);
  52 static int fetch_number (re_string_t *input, re_token_t *token,
  53                          reg_syntax_t syntax);
  54 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
  55 static int peek_token (re_token_t *token, re_string_t *input,
  56                         reg_syntax_t syntax);
  57 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  58                                reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifndef _LIBC
  89 # ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
  91                                       re_charset_t *mbcset, int *range_alloc,
  92                                       bracket_elem_t *start_elem,
  93                                       bracket_elem_t *end_elem);
  94 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
  95                                              re_charset_t *mbcset,
  96                                              int *coll_sym_alloc,
  97                                              const unsigned char *name);
  98 # else /* not RE_ENABLE_I18N */
  99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 100                                       bracket_elem_t *start_elem,
 101                                       bracket_elem_t *end_elem);
 102 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 103                                              const unsigned char *name);
 104 # endif /* not RE_ENABLE_I18N */
 105 #endif /* not _LIBC */
 106 #ifdef RE_ENABLE_I18N
 107 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 108                                         re_charset_t *mbcset,
 109                                         int *equiv_class_alloc,
 110                                         const unsigned char *name);
 111 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 112                                       re_bitset_ptr_t sbcset,
 113                                       re_charset_t *mbcset,
 114                                       int *char_class_alloc,
 115                                       const unsigned char *class_name,
 116                                       reg_syntax_t syntax);
 117 #else  /* not RE_ENABLE_I18N */
 118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 119                                         const unsigned char *name);
 120 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 121                                       re_bitset_ptr_t sbcset,
 122                                       const unsigned char *class_name,
 123                                       reg_syntax_t syntax);
 124 #endif /* not RE_ENABLE_I18N */
 125 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
 126                                        const unsigned char *class_name,
 127                                        const unsigned char *extra, int not,
 128                                        reg_errcode_t *err);
 129 static void free_bin_tree (bin_tree_t *tree);
 130 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
 131                                 re_token_type_t type, int index);
 132 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 133 \f
 134 /* This table gives an error message for each of the error codes listed
 135    in regex.h.  Obviously the order here has to be same as there.
 136    POSIX doesn't require that we do anything for REG_NOERROR,
 137    but why not be nice?  */
 138
 139 const char __re_error_msgid[] attribute_hidden =
 140   {
 141 #define REG_NOERROR_IDX 0
 142     gettext_noop ("Success")    /* REG_NOERROR */
 143     "\0"
 144 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 145     gettext_noop ("No match")   /* REG_NOMATCH */
 146     "\0"
 147 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 148     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 149     "\0"
 150 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 151     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 152     "\0"
 153 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 154     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 155     "\0"
 156 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 157     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 158     "\0"
 159 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 160     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 161     "\0"
 162 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 163     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 164     "\0"
 165 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 166     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 167     "\0"
 168 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 169     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 170     "\0"
 171 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 172     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 173     "\0"
 174 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 175     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 176     "\0"
 177 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 178     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 179     "\0"
 180 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 181     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 182     "\0"
 183 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 184     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 185     "\0"
 186 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 187     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 188     "\0"
 189 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 190     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 191   };
 192
 193 const size_t __re_error_msgid_idx[] attribute_hidden =
 194   {
 195     REG_NOERROR_IDX,
 196     REG_NOMATCH_IDX,
 197     REG_BADPAT_IDX,
 198     REG_ECOLLATE_IDX,
 199     REG_ECTYPE_IDX,
 200     REG_EESCAPE_IDX,
 201     REG_ESUBREG_IDX,
 202     REG_EBRACK_IDX,
 203     REG_EPAREN_IDX,
 204     REG_EBRACE_IDX,
 205     REG_BADBR_IDX,
 206     REG_ERANGE_IDX,
 207     REG_ESPACE_IDX,
 208     REG_BADRPT_IDX,
 209     REG_EEND_IDX,
 210     REG_ESIZE_IDX,
 211     REG_ERPAREN_IDX
 212   };
 213 \f
 214 /* Entry points for GNU code.  */
 215
 216 /* re_compile_pattern is the GNU regular expression compiler: it
 217    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 218    Returns 0 if the pattern was valid, otherwise an error string.
 219
 220    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 221    are set in BUFP on entry.  */
 222
 223 const char *
 224 re_compile_pattern (pattern, length, bufp)
 225     const char *pattern;
 226     size_t length;
 227     struct re_pattern_buffer *bufp;
 228 {
 229   reg_errcode_t ret;
 230
 231   /* And GNU code determines whether or not to get register information
 232      by passing null for the REGS argument to re_match, etc., not by
 233      setting no_sub.  */
 234   bufp->no_sub = 0;
 235
 236   /* Match anchors at newline.  */
 237   bufp->newline_anchor = 1;
 238
 239   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 240
 241   if (!ret)
 242     return NULL;
 243   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 244 }
 245 #ifdef _LIBC
 246 weak_alias (__re_compile_pattern, re_compile_pattern)
 247 #endif
 248
 249 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 250    also be assigned to arbitrarily: each pattern buffer stores its own
 251    syntax, so it can be changed between regex compilations.  */
 252 /* This has no initializer because initialized variables in Emacs
 253    become read-only after dumping.  */
 254 reg_syntax_t re_syntax_options;
 255
 256
 257 /* Specify the precise syntax of regexps for compilation.  This provides
 258    for compatibility for various utilities which historically have
 259    different, incompatible syntaxes.
 260
 261    The argument SYNTAX is a bit mask comprised of the various bits
 262    defined in regex.h.  We return the old syntax.  */
 263
 264 reg_syntax_t
 265 re_set_syntax (syntax)
 266     reg_syntax_t syntax;
 267 {
 268   reg_syntax_t ret = re_syntax_options;
 269
 270   re_syntax_options = syntax;
 271   return ret;
 272 }
 273 #ifdef _LIBC
 274 weak_alias (__re_set_syntax, re_set_syntax)
 275 #endif
 276
 277 int
 278 re_compile_fastmap (bufp)
 279     struct re_pattern_buffer *bufp;
 280 {
 281   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 282   char *fastmap = bufp->fastmap;
 283
 284   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 285   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 286   if (dfa->init_state != dfa->init_state_word)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 288   if (dfa->init_state != dfa->init_state_nl)
 289     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 290   if (dfa->init_state != dfa->init_state_begbuf)
 291     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 292   bufp->fastmap_accurate = 1;
 293   return 0;
 294 }
 295 #ifdef _LIBC
 296 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 297 #endif
 298
 299 static inline void
 300 __attribute ((always_inline))
 301 re_set_fastmap (char *fastmap, int icase, int ch)
 302 {
 303   fastmap[ch] = 1;
 304   if (icase)
 305     fastmap[tolower (ch)] = 1;
 306 }
 307
 308 /* Helper function for re_compile_fastmap.
 309    Compile fastmap for the initial_state INIT_STATE.  */
 310
 311 static void
 312 re_compile_fastmap_iter (bufp, init_state, fastmap)
 313      regex_t *bufp;
 314      const re_dfastate_t *init_state;
 315      char *fastmap;
 316 {
 317   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 318   int node_cnt;
 319   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 320   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 321     {
 322       int node = init_state->nodes.elems[node_cnt];
 323       re_token_type_t type = dfa->nodes[node].type;
 324
 325       if (type == CHARACTER)
 326         {
 327           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 328 #ifdef RE_ENABLE_I18N
 329           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 330             {
 331               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 332               wchar_t wc;
 333               mbstate_t state;
 334
 335               p = buf;
 336               *p++ = dfa->nodes[node].opr.c;
 337               while (++node < dfa->nodes_len
 338                      && dfa->nodes[node].type == CHARACTER
 339                      && dfa->nodes[node].mb_partial)
 340                 *p++ = dfa->nodes[node].opr.c;
 341               memset (&state, 0, sizeof (state));
 342               if (mbrtowc (&wc, (const char *) buf, p - buf,
 343                            &state) == p - buf
 344                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 345                 re_set_fastmap (fastmap, 0, buf[0]);
 346             }
 347 #endif
 348         }
 349       else if (type == SIMPLE_BRACKET)
 350         {
 351           int i, j, ch;
 352           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 353             for (j = 0; j < UINT_BITS; ++j, ++ch)
 354               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 355                 re_set_fastmap (fastmap, icase, ch);
 356         }
 357 #ifdef RE_ENABLE_I18N
 358       else if (type == COMPLEX_BRACKET)
 359         {
 360           int i;
 361           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 362           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 363               || cset->nranges || cset->nchar_classes)
 364             {
 365 # ifdef _LIBC
 366               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 367                 {
 368                   /* In this case we want to catch the bytes which are
 369                      the first byte of any collation elements.
 370                      e.g. In da_DK, we want to catch 'a' since "aa"
 371                           is a valid collation element, and don't catch
 372                           'b' since 'b' is the only collation element
 373                           which starts from 'b'.  */
 374                   int j, ch;
 375                   const int32_t *table = (const int32_t *)
 376                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 377                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 378                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 379                       if (table[ch] < 0)
 380                         re_set_fastmap (fastmap, icase, ch);
 381                 }
 382 # else
 383               if (dfa->mb_cur_max > 1)
 384                 for (i = 0; i < SBC_MAX; ++i)
 385                   if (__btowc (i) == WEOF)
 386                     re_set_fastmap (fastmap, icase, i);
 387 # endif /* not _LIBC */
 388             }
 389           for (i = 0; i < cset->nmbchars; ++i)
 390             {
 391               char buf[256];
 392               mbstate_t state;
 393               memset (&state, '\0', sizeof (state));
 394               __wcrtomb (buf, cset->mbchars[i], &state);
 395               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 396               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 397                 {
 398                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 399                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 400                 }
 401             }
 402         }
 403 #endif /* RE_ENABLE_I18N */
 404       else if (type == END_OF_RE || type == OP_PERIOD
 405                || type == OP_UTF8_PERIOD)
 406         {
 407           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 408           if (type == END_OF_RE)
 409             bufp->can_be_null = 1;
 410           return;
 411         }
 412     }
 413 }
 414 \f
 415 /* Entry point for POSIX code.  */
 416 /* regcomp takes a regular expression as a string and compiles it.
 417
 418    PREG is a regex_t *.  We do not expect any fields to be initialized,
 419    since POSIX says we shouldn't.  Thus, we set
 420
 421      `buffer' to the compiled pattern;
 422      `used' to the length of the compiled pattern;
 423      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 424        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 425        RE_SYNTAX_POSIX_BASIC;
 426      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 427      `fastmap' to an allocated space for the fastmap;
 428      `fastmap_accurate' to zero;
 429      `re_nsub' to the number of subexpressions in PATTERN.
 430
 431    PATTERN is the address of the pattern string.
 432
 433    CFLAGS is a series of bits which affect compilation.
 434
 435      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 436      use POSIX basic syntax.
 437
 438      If REG_NEWLINE is set, then . and [^...] don't match newline.
 439      Also, regexec will try a match beginning after every newline.
 440
 441      If REG_ICASE is set, then we considers upper- and lowercase
 442      versions of letters to be equivalent when matching.
 443
 444      If REG_NOSUB is set, then when PREG is passed to regexec, that
 445      routine will report only success or failure, and nothing about the
 446      registers.
 447
 448    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 449    the return codes and their meanings.)  */
 450
 451 int
 452 regcomp (preg, pattern, cflags)
 453     regex_t *__restrict preg;
 454     const char *__restrict pattern;
 455     int cflags;
 456 {
 457   reg_errcode_t ret;
 458   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 459                          : RE_SYNTAX_POSIX_BASIC);
 460
 461   preg->buffer = NULL;
 462   preg->allocated = 0;
 463   preg->used = 0;
 464
 465   /* Try to allocate space for the fastmap.  */
 466   preg->fastmap = re_malloc (char, SBC_MAX);
 467   if (BE (preg->fastmap == NULL, 0))
 468     return REG_ESPACE;
 469
 470   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 471
 472   /* If REG_NEWLINE is set, newlines are treated differently.  */
 473   if (cflags & REG_NEWLINE)
 474     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 475       syntax &= ~RE_DOT_NEWLINE;
 476       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 477       /* It also changes the matching behavior.  */
 478       preg->newline_anchor = 1;
 479     }
 480   else
 481     preg->newline_anchor = 0;
 482   preg->no_sub = !!(cflags & REG_NOSUB);
 483   preg->translate = NULL;
 484
 485   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 486
 487   /* POSIX doesn't distinguish between an unmatched open-group and an
 488      unmatched close-group: both are REG_EPAREN.  */
 489   if (ret == REG_ERPAREN)
 490     ret = REG_EPAREN;
 491
 492   /* We have already checked preg->fastmap != NULL.  */
 493   if (BE (ret == REG_NOERROR, 1))
 494     /* Compute the fastmap now, since regexec cannot modify the pattern
 495        buffer.  This function nevers fails in this implementation.  */
 496     (void) re_compile_fastmap (preg);
 497   else
 498     {
 499       /* Some error occurred while compiling the expression.  */
 500       re_free (preg->fastmap);
 501       preg->fastmap = NULL;
 502     }
 503
 504   return (int) ret;
 505 }
 506 #ifdef _LIBC
 507 weak_alias (__regcomp, regcomp)
 508 #endif
 509
 510 /* Returns a message corresponding to an error code, ERRCODE, returned
 511    from either regcomp or regexec.   We don't use PREG here.  */
 512
 513 size_t
 514 regerror (errcode, preg, errbuf, errbuf_size)
 515     int errcode;
 516     const regex_t *preg;
 517     char *errbuf;
 518     size_t errbuf_size;
 519 {
 520   const char *msg;
 521   size_t msg_size;
 522
 523   if (BE (errcode < 0
 524           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 525                                / sizeof (__re_error_msgid_idx[0])), 0))
 526     /* Only error codes returned by the rest of the code should be passed
 527        to this routine.  If we are given anything else, or if other regex
 528        code generates an invalid error code, then the program has a bug.
 529        Dump core so we can fix it.  */
 530     abort ();
 531
 532   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 533
 534   msg_size = strlen (msg) + 1; /* Includes the null.  */
 535
 536   if (BE (errbuf_size != 0, 1))
 537     {
 538       if (BE (msg_size > errbuf_size, 0))
 539         {
 540 #if defined HAVE_MEMPCPY || defined _LIBC
 541           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 542 #else
 543           memcpy (errbuf, msg, errbuf_size - 1);
 544           errbuf[errbuf_size - 1] = 0;
 545 #endif
 546         }
 547       else
 548         memcpy (errbuf, msg, msg_size);
 549     }
 550
 551   return msg_size;
 552 }
 553 #ifdef _LIBC
 554 weak_alias (__regerror, regerror)
 555 #endif
 556
 557
 558 static void
 559 free_dfa_content (re_dfa_t *dfa)
 560 {
 561   int i, j;
 562
 563   re_free (dfa->subexps);
 564
 565   for (i = 0; i < dfa->nodes_len; ++i)
 566     {
 567       re_token_t *node = dfa->nodes + i;
 568 #ifdef RE_ENABLE_I18N
 569       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 570         free_charset (node->opr.mbcset);
 571       else
 572 #endif /* RE_ENABLE_I18N */
 573         if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 574           re_free (node->opr.sbcset);
 575     }
 576   re_free (dfa->nexts);
 577   for (i = 0; i < dfa->nodes_len; ++i)
 578     {
 579       if (dfa->eclosures != NULL)
 580         re_node_set_free (dfa->eclosures + i);
 581       if (dfa->inveclosures != NULL)
 582         re_node_set_free (dfa->inveclosures + i);
 583       if (dfa->edests != NULL)
 584         re_node_set_free (dfa->edests + i);
 585     }
 586   re_free (dfa->edests);
 587   re_free (dfa->eclosures);
 588   re_free (dfa->inveclosures);
 589   re_free (dfa->nodes);
 590
 591   for (i = 0; i <= dfa->state_hash_mask; ++i)
 592     {
 593       struct re_state_table_entry *entry = dfa->state_table + i;
 594       for (j = 0; j < entry->num; ++j)
 595         {
 596           re_dfastate_t *state = entry->array[j];
 597           free_state (state);
 598         }
 599       re_free (entry->array);
 600     }
 601   re_free (dfa->state_table);
 602
 603   if (dfa->word_char != NULL)
 604     re_free (dfa->word_char);
 605 #ifdef DEBUG
 606   re_free (dfa->re_str);
 607 #endif
 608
 609   re_free (dfa);
 610 }
 611
 612
 613 /* Free dynamically allocated space used by PREG.  */
 614
 615 void
 616 regfree (preg)
 617     regex_t *preg;
 618 {
 619   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 620   if (BE (dfa != NULL, 1))
 621     free_dfa_content (dfa);
 622
 623   re_free (preg->fastmap);
 624 }
 625 #ifdef _LIBC
 626 weak_alias (__regfree, regfree)
 627 #endif
 628 \f
 629 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 630    them unless specifically requested.  */
 631
 632 #if defined _REGEX_RE_COMP || defined _LIBC
 633
 634 /* BSD has one and only one pattern buffer.  */
 635 static struct re_pattern_buffer re_comp_buf;
 636
 637 char *
 638 # ifdef _LIBC
 639 /* Make these definitions weak in libc, so POSIX programs can redefine
 640    these names if they don't use our functions, and still use
 641    regcomp/regexec above without link errors.  */
 642 weak_function
 643 # endif
 644 re_comp (s)
 645      const char *s;
 646 {
 647   reg_errcode_t ret;
 648   char *fastmap;
 649
 650   if (!s)
 651     {
 652       if (!re_comp_buf.buffer)
 653         return gettext ("No previous regular expression");
 654       return 0;
 655     }
 656
 657   if (re_comp_buf.buffer)
 658     {
 659       fastmap = re_comp_buf.fastmap;
 660       re_comp_buf.fastmap = NULL;
 661       __regfree (&re_comp_buf);
 662       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 663       re_comp_buf.fastmap = fastmap;
 664     }
 665
 666   if (re_comp_buf.fastmap == NULL)
 667     {
 668       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 669       if (re_comp_buf.fastmap == NULL)
 670         return (char *) gettext (__re_error_msgid
 671                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 672     }
 673
 674   /* Since `re_exec' always passes NULL for the `regs' argument, we
 675      don't need to initialize the pattern buffer fields which affect it.  */
 676
 677   /* Match anchors at newlines.  */
 678   re_comp_buf.newline_anchor = 1;
 679
 680   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 681
 682   if (!ret)
 683     return NULL;
 684
 685   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 686   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 687 }
 688
 689 #ifdef _LIBC
 690 libc_freeres_fn (free_mem)
 691 {
 692   __regfree (&re_comp_buf);
 693 }
 694 #endif
 695
 696 #endif /* _REGEX_RE_COMP */
 697 \f
 698 /* Internal entry point.
 699    Compile the regular expression PATTERN, whose length is LENGTH.
 700    SYNTAX indicate regular expression's syntax.  */
 701
 702 static reg_errcode_t
 703 re_compile_internal (preg, pattern, length, syntax)
 704      regex_t *preg;
 705      const char * pattern;
 706      int length;
 707      reg_syntax_t syntax;
 708 {
 709   reg_errcode_t err = REG_NOERROR;
 710   re_dfa_t *dfa;
 711   re_string_t regexp;
 712
 713   /* Initialize the pattern buffer.  */
 714   preg->fastmap_accurate = 0;
 715   preg->syntax = syntax;
 716   preg->not_bol = preg->not_eol = 0;
 717   preg->used = 0;
 718   preg->re_nsub = 0;
 719   preg->can_be_null = 0;
 720   preg->regs_allocated = REGS_UNALLOCATED;
 721
 722   /* Initialize the dfa.  */
 723   dfa = (re_dfa_t *) preg->buffer;
 724   if (preg->allocated < sizeof (re_dfa_t))
 725     {
 726       /* If zero allocated, but buffer is non-null, try to realloc
 727          enough space.  This loses if buffer's address is bogus, but
 728          that is the user's responsibility.  If ->buffer is NULL this
 729          is a simple allocation.  */
 730       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 731       if (dfa == NULL)
 732         return REG_ESPACE;
 733       preg->allocated = sizeof (re_dfa_t);
 734     }
 735   preg->buffer = (unsigned char *) dfa;
 736   preg->used = sizeof (re_dfa_t);
 737
 738   err = init_dfa (dfa, length);
 739   if (BE (err != REG_NOERROR, 0))
 740     {
 741       re_free (dfa);
 742       preg->buffer = NULL;
 743       preg->allocated = 0;
 744       return err;
 745     }
 746 #ifdef DEBUG
 747   dfa->re_str = re_malloc (char, length + 1);
 748   strncpy (dfa->re_str, pattern, length + 1);
 749 #endif
 750
 751   err = re_string_construct (&regexp, pattern, length, preg->translate,
 752                              syntax & RE_ICASE, dfa);
 753   if (BE (err != REG_NOERROR, 0))
 754     {
 755       re_free (dfa);
 756       preg->buffer = NULL;
 757       preg->allocated = 0;
 758       return err;
 759     }
 760
 761   /* Parse the regular expression, and build a structure tree.  */
 762   preg->re_nsub = 0;
 763   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 764   if (BE (dfa->str_tree == NULL, 0))
 765     goto re_compile_internal_free_return;
 766
 767 #ifdef RE_ENABLE_I18N
 768   /* If possible, do searching in single byte encoding to speed things up.  */
 769   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 770     optimize_utf8 (dfa);
 771 #endif
 772
 773   /* Analyze the tree and collect information which is necessary to
 774      create the dfa.  */
 775   err = analyze (dfa);
 776   if (BE (err != REG_NOERROR, 0))
 777     goto re_compile_internal_free_return;
 778
 779   /* Then create the initial state of the dfa.  */
 780   err = create_initial_state (dfa);
 781
 782   /* Release work areas.  */
 783   free_workarea_compile (preg);
 784   re_string_destruct (&regexp);
 785
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788     re_compile_internal_free_return:
 789       free_dfa_content (dfa);
 790       preg->buffer = NULL;
 791       preg->allocated = 0;
 792     }
 793
 794   return err;
 795 }
 796
 797 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 798    as the initial length of some arrays.  */
 799
 800 static reg_errcode_t
 801 init_dfa (dfa, pat_len)
 802      re_dfa_t *dfa;
 803      int pat_len;
 804 {
 805   int table_size;
 806
 807   memset (dfa, '\0', sizeof (re_dfa_t));
 808
 809   dfa->nodes_alloc = pat_len + 1;
 810   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 811
 812   dfa->states_alloc = pat_len + 1;
 813
 814   /*  table_size = 2 ^ ceil(log pat_len) */
 815   for (table_size = 1; table_size > 0; table_size <<= 1)
 816     if (table_size > pat_len)
 817       break;
 818
 819   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 820   dfa->state_hash_mask = table_size - 1;
 821
 822   dfa->subexps_alloc = 1;
 823   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 824   dfa->word_char = NULL;
 825
 826   dfa->mb_cur_max = MB_CUR_MAX;
 827 #ifdef _LIBC
 828   if (dfa->mb_cur_max > 1
 829       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 830     dfa->is_utf8 = 1;
 831   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 832                        != 0);
 833 #endif
 834
 835   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 836           || dfa->subexps == NULL, 0))
 837     {
 838       /* We don't bother to free anything which was allocated.  Very
 839          soon the process will go down anyway.  */
 840       dfa->subexps = NULL;
 841       dfa->state_table = NULL;
 842       dfa->nodes = NULL;
 843       return REG_ESPACE;
 844     }
 845   return REG_NOERROR;
 846 }
 847
 848 /* Initialize WORD_CHAR table, which indicate which character is
 849    "word".  In this case "word" means that it is the word construction
 850    character used by some operators like "\<", "\>", etc.  */
 851
 852 static reg_errcode_t
 853 init_word_char (dfa)
 854      re_dfa_t *dfa;
 855 {
 856   int i, j, ch;
 857   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 858   if (BE (dfa->word_char == NULL, 0))
 859     return REG_ESPACE;
 860   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 861     for (j = 0; j < UINT_BITS; ++j, ++ch)
 862       if (isalnum (ch) || ch == '_')
 863         dfa->word_char[i] |= 1 << j;
 864   return REG_NOERROR;
 865 }
 866
 867 /* Free the work area which are only used while compiling.  */
 868
 869 static void
 870 free_workarea_compile (preg)
 871      regex_t *preg;
 872 {
 873   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 874   free_bin_tree (dfa->str_tree);
 875   dfa->str_tree = NULL;
 876   re_free (dfa->org_indices);
 877   dfa->org_indices = NULL;
 878 }
 879
 880 /* Create initial states for all contexts.  */
 881
 882 static reg_errcode_t
 883 create_initial_state (dfa)
 884      re_dfa_t *dfa;
 885 {
 886   int first, i;
 887   reg_errcode_t err;
 888   re_node_set init_nodes;
 889
 890   /* Initial states have the epsilon closure of the node which is
 891      the first node of the regular expression.  */
 892   first = dfa->str_tree->first;
 893   dfa->init_node = first;
 894   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 895   if (BE (err != REG_NOERROR, 0))
 896     return err;
 897
 898   /* The back-references which are in initial states can epsilon transit,
 899      since in this case all of the subexpressions can be null.
 900      Then we add epsilon closures of the nodes which are the next nodes of
 901      the back-references.  */
 902   if (dfa->nbackref > 0)
 903     for (i = 0; i < init_nodes.nelem; ++i)
 904       {
 905         int node_idx = init_nodes.elems[i];
 906         re_token_type_t type = dfa->nodes[node_idx].type;
 907
 908         int clexp_idx;
 909         if (type != OP_BACK_REF)
 910           continue;
 911         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 912           {
 913             re_token_t *clexp_node;
 914             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 915             if (clexp_node->type == OP_CLOSE_SUBEXP
 916                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
 917               break;
 918           }
 919         if (clexp_idx == init_nodes.nelem)
 920           continue;
 921
 922         if (type == OP_BACK_REF)
 923           {
 924             int dest_idx = dfa->edests[node_idx].elems[0];
 925             if (!re_node_set_contains (&init_nodes, dest_idx))
 926               {
 927                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 928                 i = 0;
 929               }
 930           }
 931       }
 932
 933   /* It must be the first time to invoke acquire_state.  */
 934   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 935   /* We don't check ERR here, since the initial state must not be NULL.  */
 936   if (BE (dfa->init_state == NULL, 0))
 937     return err;
 938   if (dfa->init_state->has_constraint)
 939     {
 940       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 941                                                        CONTEXT_WORD);
 942       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 943                                                      CONTEXT_NEWLINE);
 944       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 945                                                          &init_nodes,
 946                                                          CONTEXT_NEWLINE
 947                                                          | CONTEXT_BEGBUF);
 948       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 949               || dfa->init_state_begbuf == NULL, 0))
 950         return err;
 951     }
 952   else
 953     dfa->init_state_word = dfa->init_state_nl
 954       = dfa->init_state_begbuf = dfa->init_state;
 955
 956   re_node_set_free (&init_nodes);
 957   return REG_NOERROR;
 958 }
 959 \f
 960 #ifdef RE_ENABLE_I18N
 961 /* If it is possible to do searching in single byte encoding instead of UTF-8
 962    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
 963    DFA nodes where needed.  */
 964
 965 static void
 966 optimize_utf8 (dfa)
 967      re_dfa_t *dfa;
 968 {
 969   int node, i, mb_chars = 0, has_period = 0;
 970
 971   for (node = 0; node < dfa->nodes_len; ++node)
 972     switch (dfa->nodes[node].type)
 973       {
 974       case CHARACTER:
 975         if (dfa->nodes[node].opr.c >= 0x80)
 976           mb_chars = 1;
 977         break;
 978       case ANCHOR:
 979         switch (dfa->nodes[node].opr.idx)
 980           {
 981           case LINE_FIRST:
 982           case LINE_LAST:
 983           case BUF_FIRST:
 984           case BUF_LAST:
 985             break;
 986           default:
 987             /* Word anchors etc. cannot be handled.  */
 988             return;
 989           }
 990         break;
 991       case OP_PERIOD:
 992         has_period = 1;
 993         break;
 994       case OP_BACK_REF:
 995       case OP_ALT:
 996       case END_OF_RE:
 997       case OP_DUP_ASTERISK:
 998       case OP_DUP_QUESTION:
 999       case OP_DUP_PLUS:
1000       case OP_OPEN_SUBEXP:
1001       case OP_CLOSE_SUBEXP:
1002         break;
1003       case SIMPLE_BRACKET:
1004         /* Just double check.  */
1005         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1006           if (dfa->nodes[node].opr.sbcset[i])
1007             return;
1008         break;
1009       default:
1010         return;
1011       }
1012
1013   if (mb_chars || has_period)
1014     for (node = 0; node < dfa->nodes_len; ++node)
1015       {
1016         if (dfa->nodes[node].type == CHARACTER
1017             && dfa->nodes[node].opr.c >= 0x80)
1018           dfa->nodes[node].mb_partial = 0;
1019         else if (dfa->nodes[node].type == OP_PERIOD)
1020           dfa->nodes[node].type = OP_UTF8_PERIOD;
1021       }
1022
1023   /* The search can be in single byte locale.  */
1024   dfa->mb_cur_max = 1;
1025   dfa->is_utf8 = 0;
1026   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1027 }
1028 #endif
1029 \f
1030 /* Analyze the structure tree, and calculate "first", "next", "edest",
1031    "eclosure", and "inveclosure".  */
1032
1033 static reg_errcode_t
1034 analyze (dfa)
1035      re_dfa_t *dfa;
1036 {
1037   int i;
1038   reg_errcode_t ret;
1039
1040   /* Allocate arrays.  */
1041   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1042   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1043   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1044   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1045   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1046   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1047           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1048     return REG_ESPACE;
1049   /* Initialize them.  */
1050   for (i = 0; i < dfa->nodes_len; ++i)
1051     {
1052       dfa->nexts[i] = -1;
1053       re_node_set_init_empty (dfa->edests + i);
1054       re_node_set_init_empty (dfa->eclosures + i);
1055       re_node_set_init_empty (dfa->inveclosures + i);
1056     }
1057
1058   ret = analyze_tree (dfa, dfa->str_tree);
1059   if (BE (ret == REG_NOERROR, 1))
1060     {
1061       ret = calc_eclosure (dfa);
1062       if (ret == REG_NOERROR)
1063         calc_inveclosure (dfa);
1064     }
1065   return ret;
1066 }
1067
1068 /* Helper functions for analyze.
1069    This function calculate "first", "next", and "edest" for the subtree
1070    whose root is NODE.  */
1071
1072 static reg_errcode_t
1073 analyze_tree (dfa, node)
1074      re_dfa_t *dfa;
1075      bin_tree_t *node;
1076 {
1077   reg_errcode_t ret;
1078   if (node->first == -1)
1079     calc_first (dfa, node);
1080   if (node->next == -1)
1081     calc_next (dfa, node);
1082   if (node->eclosure.nelem == 0)
1083     calc_epsdest (dfa, node);
1084   /* Calculate "first" etc. for the left child.  */
1085   if (node->left != NULL)
1086     {
1087       ret = analyze_tree (dfa, node->left);
1088       if (BE (ret != REG_NOERROR, 0))
1089         return ret;
1090     }
1091   /* Calculate "first" etc. for the right child.  */
1092   if (node->right != NULL)
1093     {
1094       ret = analyze_tree (dfa, node->right);
1095       if (BE (ret != REG_NOERROR, 0))
1096         return ret;
1097     }
1098   return REG_NOERROR;
1099 }
1100
1101 /* Calculate "first" for the node NODE.  */
1102 static void
1103 calc_first (dfa, node)
1104      re_dfa_t *dfa;
1105      bin_tree_t *node;
1106 {
1107   int idx, type;
1108   idx = node->node_idx;
1109   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1110
1111   switch (type)
1112     {
1113 #ifdef DEBUG
1114     case OP_OPEN_BRACKET:
1115     case OP_CLOSE_BRACKET:
1116     case OP_OPEN_DUP_NUM:
1117     case OP_CLOSE_DUP_NUM:
1118     case OP_NON_MATCH_LIST:
1119     case OP_OPEN_COLL_ELEM:
1120     case OP_CLOSE_COLL_ELEM:
1121     case OP_OPEN_EQUIV_CLASS:
1122     case OP_CLOSE_EQUIV_CLASS:
1123     case OP_OPEN_CHAR_CLASS:
1124     case OP_CLOSE_CHAR_CLASS:
1125       /* These must not be appeared here.  */
1126       assert (0);
1127 #endif
1128     case END_OF_RE:
1129     case CHARACTER:
1130     case OP_PERIOD:
1131     case OP_DUP_ASTERISK:
1132     case OP_DUP_QUESTION:
1133 #ifdef RE_ENABLE_I18N
1134     case OP_UTF8_PERIOD:
1135     case COMPLEX_BRACKET:
1136 #endif /* RE_ENABLE_I18N */
1137     case SIMPLE_BRACKET:
1138     case OP_BACK_REF:
1139     case ANCHOR:
1140     case OP_OPEN_SUBEXP:
1141     case OP_CLOSE_SUBEXP:
1142       node->first = idx;
1143       break;
1144     case OP_DUP_PLUS:
1145 #ifdef DEBUG
1146       assert (node->left != NULL);
1147 #endif
1148       if (node->left->first == -1)
1149         calc_first (dfa, node->left);
1150       node->first = node->left->first;
1151       break;
1152     case OP_ALT:
1153       node->first = idx;
1154       break;
1155       /* else fall through */
1156     default:
1157 #ifdef DEBUG
1158       assert (node->left != NULL);
1159 #endif
1160       if (node->left->first == -1)
1161         calc_first (dfa, node->left);
1162       node->first = node->left->first;
1163       break;
1164     }
1165 }
1166
1167 /* Calculate "next" for the node NODE.  */
1168
1169 static void
1170 calc_next (dfa, node)
1171      re_dfa_t *dfa;
1172      bin_tree_t *node;
1173 {
1174   int idx, type;
1175   bin_tree_t *parent = node->parent;
1176   if (parent == NULL)
1177     {
1178       node->next = -1;
1179       idx = node->node_idx;
1180       if (node->type == 0)
1181         dfa->nexts[idx] = node->next;
1182       return;
1183     }
1184
1185   idx = parent->node_idx;
1186   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1187
1188   switch (type)
1189     {
1190     case OP_DUP_ASTERISK:
1191     case OP_DUP_PLUS:
1192       node->next = idx;
1193       break;
1194     case CONCAT:
1195       if (parent->left == node)
1196         {
1197           if (parent->right->first == -1)
1198             calc_first (dfa, parent->right);
1199           node->next = parent->right->first;
1200           break;
1201         }
1202       /* else fall through */
1203     default:
1204       if (parent->next == -1)
1205         calc_next (dfa, parent);
1206       node->next = parent->next;
1207       break;
1208     }
1209   idx = node->node_idx;
1210   if (node->type == 0)
1211     dfa->nexts[idx] = node->next;
1212 }
1213
1214 /* Calculate "edest" for the node NODE.  */
1215
1216 static void
1217 calc_epsdest (dfa, node)
1218      re_dfa_t *dfa;
1219      bin_tree_t *node;
1220 {
1221   int idx;
1222   idx = node->node_idx;
1223   if (node->type == 0)
1224     {
1225       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1226           || dfa->nodes[idx].type == OP_DUP_PLUS
1227           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1228         {
1229           if (node->left->first == -1)
1230             calc_first (dfa, node->left);
1231           if (node->next == -1)
1232             calc_next (dfa, node);
1233           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1234                               node->next);
1235         }
1236       else if (dfa->nodes[idx].type == OP_ALT)
1237         {
1238           int left, right;
1239           if (node->left != NULL)
1240             {
1241               if (node->left->first == -1)
1242                 calc_first (dfa, node->left);
1243               left = node->left->first;
1244             }
1245           else
1246             {
1247               if (node->next == -1)
1248                 calc_next (dfa, node);
1249               left = node->next;
1250             }
1251           if (node->right != NULL)
1252             {
1253               if (node->right->first == -1)
1254                 calc_first (dfa, node->right);
1255               right = node->right->first;
1256             }
1257           else
1258             {
1259               if (node->next == -1)
1260                 calc_next (dfa, node);
1261               right = node->next;
1262             }
1263           re_node_set_init_2 (dfa->edests + idx, left, right);
1264         }
1265       else if (dfa->nodes[idx].type == ANCHOR
1266                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1267                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1268                || dfa->nodes[idx].type == OP_BACK_REF)
1269         re_node_set_init_1 (dfa->edests + idx, node->next);
1270       else
1271         assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1272     }
1273 }
1274
1275 /* Duplicate the epsilon closure of the node ROOT_NODE.
1276    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1277    to their own constraint.  */
1278
1279 static reg_errcode_t
1280 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1281                         init_constraint)
1282      re_dfa_t *dfa;
1283      int top_org_node, top_clone_node, root_node;
1284      unsigned int init_constraint;
1285 {
1286   reg_errcode_t err;
1287   int org_node, clone_node, ret;
1288   unsigned int constraint = init_constraint;
1289   for (org_node = top_org_node, clone_node = top_clone_node;;)
1290     {
1291       int org_dest, clone_dest;
1292       if (dfa->nodes[org_node].type == OP_BACK_REF)
1293         {
1294           /* If the back reference epsilon-transit, its destination must
1295              also have the constraint.  Then duplicate the epsilon closure
1296              of the destination of the back reference, and store it in
1297              edests of the back reference.  */
1298           org_dest = dfa->nexts[org_node];
1299           re_node_set_empty (dfa->edests + clone_node);
1300           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1301           if (BE (err != REG_NOERROR, 0))
1302             return err;
1303           dfa->nexts[clone_node] = dfa->nexts[org_node];
1304           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1305           if (BE (ret < 0, 0))
1306             return REG_ESPACE;
1307         }
1308       else if (dfa->edests[org_node].nelem == 0)
1309         {
1310           /* In case of the node can't epsilon-transit, don't duplicate the
1311              destination and store the original destination as the
1312              destination of the node.  */
1313           dfa->nexts[clone_node] = dfa->nexts[org_node];
1314           break;
1315         }
1316       else if (dfa->edests[org_node].nelem == 1)
1317         {
1318           /* In case of the node can epsilon-transit, and it has only one
1319              destination.  */
1320           org_dest = dfa->edests[org_node].elems[0];
1321           re_node_set_empty (dfa->edests + clone_node);
1322           if (dfa->nodes[org_node].type == ANCHOR)
1323             {
1324               /* In case of the node has another constraint, append it.  */
1325               if (org_node == root_node && clone_node != org_node)
1326                 {
1327                   /* ...but if the node is root_node itself, it means the
1328                      epsilon closure have a loop, then tie it to the
1329                      destination of the root_node.  */
1330                   ret = re_node_set_insert (dfa->edests + clone_node,
1331                                             org_dest);
1332                   if (BE (ret < 0, 0))
1333                     return REG_ESPACE;
1334                   break;
1335                 }
1336               constraint |= dfa->nodes[org_node].opr.ctx_type;
1337             }
1338           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1339           if (BE (err != REG_NOERROR, 0))
1340             return err;
1341           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1342           if (BE (ret < 0, 0))
1343             return REG_ESPACE;
1344         }
1345       else /* dfa->edests[org_node].nelem == 2 */
1346         {
1347           /* In case of the node can epsilon-transit, and it has two
1348              destinations. E.g. '|', '*', '+', '?'.   */
1349           org_dest = dfa->edests[org_node].elems[0];
1350           re_node_set_empty (dfa->edests + clone_node);
1351           /* Search for a duplicated node which satisfies the constraint.  */
1352           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1353           if (clone_dest == -1)
1354             {
1355               /* There are no such a duplicated node, create a new one.  */
1356               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1357               if (BE (err != REG_NOERROR, 0))
1358                 return err;
1359               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1360               if (BE (ret < 0, 0))
1361                 return REG_ESPACE;
1362               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1363                                             root_node, constraint);
1364               if (BE (err != REG_NOERROR, 0))
1365                 return err;
1366             }
1367           else
1368             {
1369               /* There are a duplicated node which satisfy the constraint,
1370                  use it to avoid infinite loop.  */
1371               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1372               if (BE (ret < 0, 0))
1373                 return REG_ESPACE;
1374             }
1375
1376           org_dest = dfa->edests[org_node].elems[1];
1377           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1378           if (BE (err != REG_NOERROR, 0))
1379             return err;
1380           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1381           if (BE (ret < 0, 0))
1382             return REG_ESPACE;
1383         }
1384       org_node = org_dest;
1385       clone_node = clone_dest;
1386     }
1387   return REG_NOERROR;
1388 }
1389
1390 /* Search for a node which is duplicated from the node ORG_NODE, and
1391    satisfies the constraint CONSTRAINT.  */
1392
1393 static int
1394 search_duplicated_node (dfa, org_node, constraint)
1395      re_dfa_t *dfa;
1396      int org_node;
1397      unsigned int constraint;
1398 {
1399   int idx;
1400   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1401     {
1402       if (org_node == dfa->org_indices[idx]
1403           && constraint == dfa->nodes[idx].constraint)
1404         return idx; /* Found.  */
1405     }
1406   return -1; /* Not found.  */
1407 }
1408
1409 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1410    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1411    otherwise return the error code.  */
1412
1413 static reg_errcode_t
1414 duplicate_node (new_idx, dfa, org_idx, constraint)
1415      re_dfa_t *dfa;
1416      int *new_idx, org_idx;
1417      unsigned int constraint;
1418 {
1419   re_token_t dup;
1420   int dup_idx;
1421
1422   dup = dfa->nodes[org_idx];
1423   dup_idx = re_dfa_add_node (dfa, dup, 1);
1424   if (BE (dup_idx == -1, 0))
1425     return REG_ESPACE;
1426   dfa->nodes[dup_idx].constraint = constraint;
1427   if (dfa->nodes[org_idx].type == ANCHOR)
1428     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1429   dfa->nodes[dup_idx].duplicated = 1;
1430   re_node_set_init_empty (dfa->edests + dup_idx);
1431   re_node_set_init_empty (dfa->eclosures + dup_idx);
1432   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1433
1434   /* Store the index of the original node.  */
1435   dfa->org_indices[dup_idx] = org_idx;
1436   *new_idx = dup_idx;
1437   return REG_NOERROR;
1438 }
1439
1440 static void
1441 calc_inveclosure (dfa)
1442      re_dfa_t *dfa;
1443 {
1444   int src, idx, dest;
1445   for (src = 0; src < dfa->nodes_len; ++src)
1446     {
1447       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1448         {
1449           dest = dfa->eclosures[src].elems[idx];
1450           re_node_set_insert (dfa->inveclosures + dest, src);
1451         }
1452     }
1453 }
1454
1455 /* Calculate "eclosure" for all the node in DFA.  */
1456
1457 static reg_errcode_t
1458 calc_eclosure (dfa)
1459      re_dfa_t *dfa;
1460 {
1461   int node_idx, incomplete;
1462 #ifdef DEBUG
1463   assert (dfa->nodes_len > 0);
1464 #endif
1465   incomplete = 0;
1466   /* For each nodes, calculate epsilon closure.  */
1467   for (node_idx = 0; ; ++node_idx)
1468     {
1469       reg_errcode_t err;
1470       re_node_set eclosure_elem;
1471       if (node_idx == dfa->nodes_len)
1472         {
1473           if (!incomplete)
1474             break;
1475           incomplete = 0;
1476           node_idx = 0;
1477         }
1478
1479 #ifdef DEBUG
1480       assert (dfa->eclosures[node_idx].nelem != -1);
1481 #endif
1482       /* If we have already calculated, skip it.  */
1483       if (dfa->eclosures[node_idx].nelem != 0)
1484         continue;
1485       /* Calculate epsilon closure of `node_idx'.  */
1486       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1487       if (BE (err != REG_NOERROR, 0))
1488         return err;
1489
1490       if (dfa->eclosures[node_idx].nelem == 0)
1491         {
1492           incomplete = 1;
1493           re_node_set_free (&eclosure_elem);
1494         }
1495     }
1496   return REG_NOERROR;
1497 }
1498
1499 /* Calculate epsilon closure of NODE.  */
1500
1501 static reg_errcode_t
1502 calc_eclosure_iter (new_set, dfa, node, root)
1503      re_node_set *new_set;
1504      re_dfa_t *dfa;
1505      int node, root;
1506 {
1507   reg_errcode_t err;
1508   unsigned int constraint;
1509   int i, incomplete;
1510   re_node_set eclosure;
1511   incomplete = 0;
1512   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1513   if (BE (err != REG_NOERROR, 0))
1514     return err;
1515
1516   /* This indicates that we are calculating this node now.
1517      We reference this value to avoid infinite loop.  */
1518   dfa->eclosures[node].nelem = -1;
1519
1520   constraint = ((dfa->nodes[node].type == ANCHOR)
1521                 ? dfa->nodes[node].opr.ctx_type : 0);
1522   /* If the current node has constraints, duplicate all nodes.
1523      Since they must inherit the constraints.  */
1524   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1525     {
1526       int org_node, cur_node;
1527       org_node = cur_node = node;
1528       err = duplicate_node_closure (dfa, node, node, node, constraint);
1529       if (BE (err != REG_NOERROR, 0))
1530         return err;
1531     }
1532
1533   /* Expand each epsilon destination nodes.  */
1534   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1535     for (i = 0; i < dfa->edests[node].nelem; ++i)
1536       {
1537         re_node_set eclosure_elem;
1538         int edest = dfa->edests[node].elems[i];
1539         /* If calculating the epsilon closure of `edest' is in progress,
1540            return intermediate result.  */
1541         if (dfa->eclosures[edest].nelem == -1)
1542           {
1543             incomplete = 1;
1544             continue;
1545           }
1546         /* If we haven't calculated the epsilon closure of `edest' yet,
1547            calculate now. Otherwise use calculated epsilon closure.  */
1548         if (dfa->eclosures[edest].nelem == 0)
1549           {
1550             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1551             if (BE (err != REG_NOERROR, 0))
1552               return err;
1553           }
1554         else
1555           eclosure_elem = dfa->eclosures[edest];
1556         /* Merge the epsilon closure of `edest'.  */
1557         re_node_set_merge (&eclosure, &eclosure_elem);
1558         /* If the epsilon closure of `edest' is incomplete,
1559            the epsilon closure of this node is also incomplete.  */
1560         if (dfa->eclosures[edest].nelem == 0)
1561           {
1562             incomplete = 1;
1563             re_node_set_free (&eclosure_elem);
1564           }
1565       }
1566
1567   /* Epsilon closures include itself.  */
1568   re_node_set_insert (&eclosure, node);
1569   if (incomplete && !root)
1570     dfa->eclosures[node].nelem = 0;
1571   else
1572     dfa->eclosures[node] = eclosure;
1573   *new_set = eclosure;
1574   return REG_NOERROR;
1575 }
1576 \f
1577 /* Functions for token which are used in the parser.  */
1578
1579 /* Fetch a token from INPUT.
1580    We must not use this function inside bracket expressions.  */
1581
1582 static re_token_t
1583 fetch_token (input, syntax)
1584      re_string_t *input;
1585      reg_syntax_t syntax;
1586 {
1587   re_token_t token;
1588   int consumed_byte;
1589   consumed_byte = peek_token (&token, input, syntax);
1590   re_string_skip_bytes (input, consumed_byte);
1591   return token;
1592 }
1593
1594 /* Peek a token from INPUT, and return the length of the token.
1595    We must not use this function inside bracket expressions.  */
1596
1597 static int
1598 peek_token (token, input, syntax)
1599      re_token_t *token;
1600      re_string_t *input;
1601      reg_syntax_t syntax;
1602 {
1603   unsigned char c;
1604
1605   if (re_string_eoi (input))
1606     {
1607       token->type = END_OF_RE;
1608       return 0;
1609     }
1610
1611   c = re_string_peek_byte (input, 0);
1612   token->opr.c = c;
1613
1614 #ifdef RE_ENABLE_I18N
1615   token->mb_partial = 0;
1616   if (input->mb_cur_max > 1 &&
1617       !re_string_first_byte (input, re_string_cur_idx (input)))
1618     {
1619       token->type = CHARACTER;
1620       token->mb_partial = 1;
1621       return 1;
1622     }
1623 #endif
1624   if (c == '\\')
1625     {
1626       unsigned char c2;
1627       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1628         {
1629           token->type = BACK_SLASH;
1630           return 1;
1631         }
1632
1633       c2 = re_string_peek_byte_case (input, 1);
1634       token->opr.c = c2;
1635       token->type = CHARACTER;
1636       switch (c2)
1637         {
1638         case '|':
1639           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1640             token->type = OP_ALT;
1641           break;
1642         case '1': case '2': case '3': case '4': case '5':
1643         case '6': case '7': case '8': case '9':
1644           if (!(syntax & RE_NO_BK_REFS))
1645             {
1646               token->type = OP_BACK_REF;
1647               token->opr.idx = c2 - '0';
1648             }
1649           break;
1650         case '<':
1651           if (!(syntax & RE_NO_GNU_OPS))
1652             {
1653               token->type = ANCHOR;
1654               token->opr.idx = WORD_FIRST;
1655             }
1656           break;
1657         case '>':
1658           if (!(syntax & RE_NO_GNU_OPS))
1659             {
1660               token->type = ANCHOR;
1661               token->opr.idx = WORD_LAST;
1662             }
1663           break;
1664         case 'b':
1665           if (!(syntax & RE_NO_GNU_OPS))
1666             {
1667               token->type = ANCHOR;
1668               token->opr.idx = WORD_DELIM;
1669             }
1670           break;
1671         case 'B':
1672           if (!(syntax & RE_NO_GNU_OPS))
1673             {
1674               token->type = ANCHOR;
1675               token->opr.idx = INSIDE_WORD;
1676             }
1677           break;
1678         case 'w':
1679           if (!(syntax & RE_NO_GNU_OPS))
1680             token->type = OP_WORD;
1681           break;
1682         case 'W':
1683           if (!(syntax & RE_NO_GNU_OPS))
1684             token->type = OP_NOTWORD;
1685           break;
1686         case 's':
1687           if (!(syntax & RE_NO_GNU_OPS))
1688             token->type = OP_SPACE;
1689           break;
1690         case 'S':
1691           if (!(syntax & RE_NO_GNU_OPS))
1692             token->type = OP_NOTSPACE;
1693           break;
1694         case '`':
1695           if (!(syntax & RE_NO_GNU_OPS))
1696             {
1697               token->type = ANCHOR;
1698               token->opr.idx = BUF_FIRST;
1699             }
1700           break;
1701         case '\'':
1702           if (!(syntax & RE_NO_GNU_OPS))
1703             {
1704               token->type = ANCHOR;
1705               token->opr.idx = BUF_LAST;
1706             }
1707           break;
1708         case '(':
1709           if (!(syntax & RE_NO_BK_PARENS))
1710             token->type = OP_OPEN_SUBEXP;
1711           break;
1712         case ')':
1713           if (!(syntax & RE_NO_BK_PARENS))
1714             token->type = OP_CLOSE_SUBEXP;
1715           break;
1716         case '+':
1717           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1718             token->type = OP_DUP_PLUS;
1719           break;
1720         case '?':
1721           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1722             token->type = OP_DUP_QUESTION;
1723           break;
1724         case '{':
1725           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1726             token->type = OP_OPEN_DUP_NUM;
1727           break;
1728         case '}':
1729           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1730             token->type = OP_CLOSE_DUP_NUM;
1731           break;
1732         default:
1733           break;
1734         }
1735       return 2;
1736     }
1737
1738   token->type = CHARACTER;
1739   switch (c)
1740     {
1741     case '\n':
1742       if (syntax & RE_NEWLINE_ALT)
1743         token->type = OP_ALT;
1744       break;
1745     case '|':
1746       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1747         token->type = OP_ALT;
1748       break;
1749     case '*':
1750       token->type = OP_DUP_ASTERISK;
1751       break;
1752     case '+':
1753       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1754         token->type = OP_DUP_PLUS;
1755       break;
1756     case '?':
1757       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1758         token->type = OP_DUP_QUESTION;
1759       break;
1760     case '{':
1761       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1762         token->type = OP_OPEN_DUP_NUM;
1763       break;
1764     case '}':
1765       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1766         token->type = OP_CLOSE_DUP_NUM;
1767       break;
1768     case '(':
1769       if (syntax & RE_NO_BK_PARENS)
1770         token->type = OP_OPEN_SUBEXP;
1771       break;
1772     case ')':
1773       if (syntax & RE_NO_BK_PARENS)
1774         token->type = OP_CLOSE_SUBEXP;
1775       break;
1776     case '[':
1777       token->type = OP_OPEN_BRACKET;
1778       break;
1779     case '.':
1780       token->type = OP_PERIOD;
1781       break;
1782     case '^':
1783       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1784           re_string_cur_idx (input) != 0)
1785         {
1786           char prev = re_string_peek_byte (input, -1);
1787           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1788             break;
1789         }
1790       token->type = ANCHOR;
1791       token->opr.idx = LINE_FIRST;
1792       break;
1793     case '$':
1794       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1795           re_string_cur_idx (input) + 1 != re_string_length (input))
1796         {
1797           re_token_t next;
1798           re_string_skip_bytes (input, 1);
1799           peek_token (&next, input, syntax);
1800           re_string_skip_bytes (input, -1);
1801           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1802             break;
1803         }
1804       token->type = ANCHOR;
1805       token->opr.idx = LINE_LAST;
1806       break;
1807     default:
1808       break;
1809     }
1810   return 1;
1811 }
1812
1813 /* Peek a token from INPUT, and return the length of the token.
1814    We must not use this function out of bracket expressions.  */
1815
1816 static int
1817 peek_token_bracket (token, input, syntax)
1818      re_token_t *token;
1819      re_string_t *input;
1820      reg_syntax_t syntax;
1821 {
1822   unsigned char c;
1823   if (re_string_eoi (input))
1824     {
1825       token->type = END_OF_RE;
1826       return 0;
1827     }
1828   c = re_string_peek_byte (input, 0);
1829   token->opr.c = c;
1830
1831 #ifdef RE_ENABLE_I18N
1832   if (input->mb_cur_max > 1 &&
1833       !re_string_first_byte (input, re_string_cur_idx (input)))
1834     {
1835       token->type = CHARACTER;
1836       return 1;
1837     }
1838 #endif /* RE_ENABLE_I18N */
1839
1840   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1841     {
1842       /* In this case, '\' escape a character.  */
1843       unsigned char c2;
1844       re_string_skip_bytes (input, 1);
1845       c2 = re_string_peek_byte (input, 0);
1846       token->opr.c = c2;
1847       token->type = CHARACTER;
1848       return 1;
1849     }
1850   if (c == '[') /* '[' is a special char in a bracket exps.  */
1851     {
1852       unsigned char c2;
1853       int token_len;
1854       c2 = re_string_peek_byte (input, 1);
1855       token->opr.c = c2;
1856       token_len = 2;
1857       switch (c2)
1858         {
1859         case '.':
1860           token->type = OP_OPEN_COLL_ELEM;
1861           break;
1862         case '=':
1863           token->type = OP_OPEN_EQUIV_CLASS;
1864           break;
1865         case ':':
1866           if (syntax & RE_CHAR_CLASSES)
1867             {
1868               token->type = OP_OPEN_CHAR_CLASS;
1869               break;
1870             }
1871           /* else fall through.  */
1872         default:
1873           token->type = CHARACTER;
1874           token->opr.c = c;
1875           token_len = 1;
1876           break;
1877         }
1878       return token_len;
1879     }
1880   switch (c)
1881     {
1882     case '-':
1883       token->type = OP_CHARSET_RANGE;
1884       break;
1885     case ']':
1886       token->type = OP_CLOSE_BRACKET;
1887       break;
1888     case '^':
1889       token->type = OP_NON_MATCH_LIST;
1890       break;
1891     default:
1892       token->type = CHARACTER;
1893     }
1894   return 1;
1895 }
1896 \f
1897 /* Functions for parser.  */
1898
1899 /* Entry point of the parser.
1900    Parse the regular expression REGEXP and return the structure tree.
1901    If an error is occured, ERR is set by error code, and return NULL.
1902    This function build the following tree, from regular expression <reg_exp>:
1903            CAT
1904            / \
1905           /   \
1906    <reg_exp>  EOR
1907
1908    CAT means concatenation.
1909    EOR means end of regular expression.  */
1910
1911 static bin_tree_t *
1912 parse (regexp, preg, syntax, err)
1913      re_string_t *regexp;
1914      regex_t *preg;
1915      reg_syntax_t syntax;
1916      reg_errcode_t *err;
1917 {
1918   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1919   bin_tree_t *tree, *eor, *root;
1920   re_token_t current_token;
1921   int new_idx;
1922   current_token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1923   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1924   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1925     return NULL;
1926   new_idx = re_dfa_add_node (dfa, current_token, 0);
1927   eor = create_tree (NULL, NULL, 0, new_idx);
1928   if (tree != NULL)
1929     root = create_tree (tree, eor, CONCAT, 0);
1930   else
1931     root = eor;
1932   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1933     {
1934       *err = REG_ESPACE;
1935       return NULL;
1936     }
1937   return root;
1938 }
1939
1940 /* This function build the following tree, from regular expression
1941    <branch1>|<branch2>:
1942            ALT
1943            / \
1944           /   \
1945    <branch1> <branch2>
1946
1947    ALT means alternative, which represents the operator `|'.  */
1948
1949 static bin_tree_t *
1950 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1951      re_string_t *regexp;
1952      regex_t *preg;
1953      re_token_t *token;
1954      reg_syntax_t syntax;
1955      int nest;
1956      reg_errcode_t *err;
1957 {
1958   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1959   bin_tree_t *tree, *branch = NULL;
1960   int new_idx;
1961   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1962   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1963     return NULL;
1964
1965   while (token->type == OP_ALT)
1966     {
1967       re_token_t alt_token = *token;
1968       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1969       *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
1970       if (token->type != OP_ALT && token->type != END_OF_RE
1971           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1972         {
1973           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1974           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1975             {
1976               free_bin_tree (tree);
1977               return NULL;
1978             }
1979         }
1980       else
1981         branch = NULL;
1982       tree = create_tree (tree, branch, 0, new_idx);
1983       if (BE (new_idx == -1 || tree == NULL, 0))
1984         {
1985           *err = REG_ESPACE;
1986           return NULL;
1987         }
1988       dfa->has_plural_match = 1;
1989     }
1990   return tree;
1991 }
1992
1993 /* This function build the following tree, from regular expression
1994    <exp1><exp2>:
1995         CAT
1996         / \
1997        /   \
1998    <exp1> <exp2>
1999
2000    CAT means concatenation.  */
2001
2002 static bin_tree_t *
2003 parse_branch (regexp, preg, token, syntax, nest, err)
2004      re_string_t *regexp;
2005      regex_t *preg;
2006      re_token_t *token;
2007      reg_syntax_t syntax;
2008      int nest;
2009      reg_errcode_t *err;
2010 {
2011   bin_tree_t *tree, *exp;
2012   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2013   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2014     return NULL;
2015
2016   while (token->type != OP_ALT && token->type != END_OF_RE
2017          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2018     {
2019       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2020       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2021         {
2022           free_bin_tree (tree);
2023           return NULL;
2024         }
2025       if (tree != NULL && exp != NULL)
2026         {
2027           tree = create_tree (tree, exp, CONCAT, 0);
2028           if (tree == NULL)
2029             {
2030               *err = REG_ESPACE;
2031               return NULL;
2032             }
2033         }
2034       else if (tree == NULL)
2035         tree = exp;
2036       /* Otherwise exp == NULL, we don't need to create new tree.  */
2037     }
2038   return tree;
2039 }
2040
2041 /* This function build the following tree, from regular expression a*:
2042          *
2043          |
2044          a
2045 */
2046
2047 static bin_tree_t *
2048 parse_expression (regexp, preg, token, syntax, nest, err)
2049      re_string_t *regexp;
2050      regex_t *preg;
2051      re_token_t *token;
2052      reg_syntax_t syntax;
2053      int nest;
2054      reg_errcode_t *err;
2055 {
2056   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2057   bin_tree_t *tree;
2058   int new_idx;
2059   switch (token->type)
2060     {
2061     case CHARACTER:
2062       new_idx = re_dfa_add_node (dfa, *token, 0);
2063       tree = create_tree (NULL, NULL, 0, new_idx);
2064       if (BE (new_idx == -1 || tree == NULL, 0))
2065         {
2066           *err = REG_ESPACE;
2067           return NULL;
2068         }
2069 #ifdef RE_ENABLE_I18N
2070       if (dfa->mb_cur_max > 1)
2071         {
2072           while (!re_string_eoi (regexp)
2073                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2074             {
2075               bin_tree_t *mbc_remain;
2076               *token = fetch_token (regexp, syntax);
2077               new_idx = re_dfa_add_node (dfa, *token, 0);
2078               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
2079               tree = create_tree (tree, mbc_remain, CONCAT, 0);
2080               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
2081                 {
2082                   *err = REG_ESPACE;
2083                   return NULL;
2084                 }
2085             }
2086         }
2087 #endif
2088       break;
2089     case OP_OPEN_SUBEXP:
2090       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2091       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2092         return NULL;
2093       break;
2094     case OP_OPEN_BRACKET:
2095       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2096       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2097         return NULL;
2098       break;
2099     case OP_BACK_REF:
2100       if (BE (preg->re_nsub < token->opr.idx
2101               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2102         {
2103           *err = REG_ESUBREG;
2104           return NULL;
2105         }
2106       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2107       new_idx = re_dfa_add_node (dfa, *token, 0);
2108       tree = create_tree (NULL, NULL, 0, new_idx);
2109       if (BE (new_idx == -1 || tree == NULL, 0))
2110         {
2111           *err = REG_ESPACE;
2112           return NULL;
2113         }
2114       ++dfa->nbackref;
2115       dfa->has_mb_node = 1;
2116       break;
2117     case OP_OPEN_DUP_NUM:
2118       if (syntax & RE_CONTEXT_INVALID_DUP)
2119         {
2120           *err = REG_BADRPT;
2121           return NULL;
2122         }
2123       /* FALLTHROUGH */
2124     case OP_DUP_ASTERISK:
2125     case OP_DUP_PLUS:
2126     case OP_DUP_QUESTION:
2127       if (syntax & RE_CONTEXT_INVALID_OPS)
2128         {
2129           *err = REG_BADRPT;
2130           return NULL;
2131         }
2132       else if (syntax & RE_CONTEXT_INDEP_OPS)
2133         {
2134           *token = fetch_token (regexp, syntax);
2135           return parse_expression (regexp, preg, token, syntax, nest, err);
2136         }
2137       /* else fall through  */
2138     case OP_CLOSE_SUBEXP:
2139       if ((token->type == OP_CLOSE_SUBEXP) &&
2140           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2141         {
2142           *err = REG_ERPAREN;
2143           return NULL;
2144         }
2145       /* else fall through  */
2146     case OP_CLOSE_DUP_NUM:
2147       /* We treat it as a normal character.  */
2148
2149       /* Then we can these characters as normal characters.  */
2150       token->type = CHARACTER;
2151       new_idx = re_dfa_add_node (dfa, *token, 0);
2152       tree = create_tree (NULL, NULL, 0, new_idx);
2153       if (BE (new_idx == -1 || tree == NULL, 0))
2154         {
2155           *err = REG_ESPACE;
2156           return NULL;
2157         }
2158       break;
2159     case ANCHOR:
2160       if (dfa->word_char == NULL)
2161         {
2162           *err = init_word_char (dfa);
2163           if (BE (*err != REG_NOERROR, 0))
2164             return NULL;
2165         }
2166       if (token->opr.ctx_type == WORD_DELIM)
2167         {
2168           bin_tree_t *tree_first, *tree_last;
2169           int idx_first, idx_last;
2170           token->opr.ctx_type = WORD_FIRST;
2171           idx_first = re_dfa_add_node (dfa, *token, 0);
2172           tree_first = create_tree (NULL, NULL, 0, idx_first);
2173           token->opr.ctx_type = WORD_LAST;
2174           idx_last = re_dfa_add_node (dfa, *token, 0);
2175           tree_last = create_tree (NULL, NULL, 0, idx_last);
2176           token->type = OP_ALT;
2177           new_idx = re_dfa_add_node (dfa, *token, 0);
2178           tree = create_tree (tree_first, tree_last, 0, new_idx);
2179           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2180                   || tree_first == NULL || tree_last == NULL
2181                   || tree == NULL, 0))
2182             {
2183               *err = REG_ESPACE;
2184               return NULL;
2185             }
2186         }
2187       else
2188         {
2189           new_idx = re_dfa_add_node (dfa, *token, 0);
2190           tree = create_tree (NULL, NULL, 0, new_idx);
2191           if (BE (new_idx == -1 || tree == NULL, 0))
2192             {
2193               *err = REG_ESPACE;
2194               return NULL;
2195             }
2196         }
2197       /* We must return here, since ANCHORs can't be followed
2198          by repetition operators.
2199          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2200              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2201       *token = fetch_token (regexp, syntax);
2202       return tree;
2203     case OP_PERIOD:
2204       new_idx = re_dfa_add_node (dfa, *token, 0);
2205       tree = create_tree (NULL, NULL, 0, new_idx);
2206       if (BE (new_idx == -1 || tree == NULL, 0))
2207         {
2208           *err = REG_ESPACE;
2209           return NULL;
2210         }
2211       if (dfa->mb_cur_max > 1)
2212         dfa->has_mb_node = 1;
2213       break;
2214     case OP_WORD:
2215       tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
2216       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2217         return NULL;
2218       break;
2219     case OP_NOTWORD:
2220       tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 1, err);
2221       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2222         return NULL;
2223       break;
2224     case OP_SPACE:
2225       tree = build_charclass_op (dfa, regexp->trans, "space", "", 0, err);
2226       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2227         return NULL;
2228       break;
2229     case OP_NOTSPACE:
2230       tree = build_charclass_op (dfa, regexp->trans, "space", "", 1, err);
2231       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2232         return NULL;
2233       break;
2234     case OP_ALT:
2235     case END_OF_RE:
2236       return NULL;
2237     case BACK_SLASH:
2238       *err = REG_EESCAPE;
2239       return NULL;
2240     default:
2241       /* Must not happen?  */
2242 #ifdef DEBUG
2243       assert (0);
2244 #endif
2245       return NULL;
2246     }
2247   *token = fetch_token (regexp, syntax);
2248
2249   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2250          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2251     {
2252       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2253       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2254         return NULL;
2255       /* In BRE consecutive duplications are not allowed.  */
2256       if ((syntax & RE_CONTEXT_INVALID_DUP)
2257           && (token->type == OP_DUP_ASTERISK
2258               || token->type == OP_OPEN_DUP_NUM))
2259         {
2260           *err = REG_BADRPT;
2261           return NULL;
2262         }
2263       dfa->has_plural_match = 1;
2264     }
2265
2266   return tree;
2267 }
2268
2269 /* This function build the following tree, from regular expression
2270    (<reg_exp>):
2271          SUBEXP
2272             |
2273         <reg_exp>
2274 */
2275
2276 static bin_tree_t *
2277 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2278      re_string_t *regexp;
2279      regex_t *preg;
2280      re_token_t *token;
2281      reg_syntax_t syntax;
2282      int nest;
2283      reg_errcode_t *err;
2284 {
2285   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2286   bin_tree_t *tree, *left_par, *right_par;
2287   size_t cur_nsub;
2288   int new_idx;
2289   cur_nsub = preg->re_nsub++;
2290   if (dfa->subexps_alloc < preg->re_nsub)
2291     {
2292       re_subexp_t *new_array;
2293       dfa->subexps_alloc *= 2;
2294       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2295       if (BE (new_array == NULL, 0))
2296         {
2297           dfa->subexps_alloc /= 2;
2298           *err = REG_ESPACE;
2299           return NULL;
2300         }
2301       dfa->subexps = new_array;
2302     }
2303   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2304   dfa->subexps[cur_nsub].end = -1;
2305
2306   new_idx = re_dfa_add_node (dfa, *token, 0);
2307   left_par = create_tree (NULL, NULL, 0, new_idx);
2308   if (BE (new_idx == -1 || left_par == NULL, 0))
2309     {
2310       *err = REG_ESPACE;
2311       return NULL;
2312     }
2313   dfa->nodes[new_idx].opr.idx = cur_nsub;
2314   *token = fetch_token (regexp, syntax | RE_CARET_ANCHORS_HERE);
2315
2316   /* The subexpression may be a null string.  */
2317   if (token->type == OP_CLOSE_SUBEXP)
2318     tree = NULL;
2319   else
2320     {
2321       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2322       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2323         return NULL;
2324     }
2325   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2326     {
2327       free_bin_tree (tree);
2328       *err = REG_EPAREN;
2329       return NULL;
2330     }
2331   new_idx = re_dfa_add_node (dfa, *token, 0);
2332   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2333   right_par = create_tree (NULL, NULL, 0, new_idx);
2334   tree = ((tree == NULL) ? right_par
2335           : create_tree (tree, right_par, CONCAT, 0));
2336   tree = create_tree (left_par, tree, CONCAT, 0);
2337   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2338     {
2339       *err = REG_ESPACE;
2340       return NULL;
2341     }
2342   dfa->nodes[new_idx].opr.idx = cur_nsub;
2343
2344   return tree;
2345 }
2346
2347 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2348
2349 static bin_tree_t *
2350 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2351      bin_tree_t *dup_elem;
2352      re_string_t *regexp;
2353      re_dfa_t *dfa;
2354      re_token_t *token;
2355      reg_syntax_t syntax;
2356      reg_errcode_t *err;
2357 {
2358   re_token_t dup_token;
2359   bin_tree_t *tree = dup_elem, *work_tree;
2360   int new_idx, start_idx = re_string_cur_idx (regexp);
2361   re_token_t start_token = *token;
2362   if (token->type == OP_OPEN_DUP_NUM)
2363     {
2364       int i;
2365       int end = 0;
2366       int start = fetch_number (regexp, token, syntax);
2367       bin_tree_t *elem;
2368       if (start == -1)
2369         {
2370           if (token->type == CHARACTER && token->opr.c == ',')
2371             start = 0; /* We treat "{,m}" as "{0,m}".  */
2372           else
2373             {
2374               *err = REG_BADBR; /* <re>{} is invalid.  */
2375               return NULL;
2376             }
2377         }
2378       if (BE (start != -2, 1))
2379         {
2380           /* We treat "{n}" as "{n,n}".  */
2381           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2382                  : ((token->type == CHARACTER && token->opr.c == ',')
2383                     ? fetch_number (regexp, token, syntax) : -2));
2384         }
2385       if (BE (start == -2 || end == -2, 0))
2386         {
2387           /* Invalid sequence.  */
2388           if (token->type == OP_CLOSE_DUP_NUM)
2389             goto parse_dup_op_invalid_interval;
2390           else
2391             goto parse_dup_op_ebrace;
2392         }
2393       if (BE (start == 0 && end == 0, 0))
2394         {
2395           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2396           *token = fetch_token (regexp, syntax);
2397           free_bin_tree (dup_elem);
2398           return NULL;
2399         }
2400
2401       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2402       elem = tree;
2403       for (i = 0; i < start; ++i)
2404         if (i != 0)
2405           {
2406             work_tree = duplicate_tree (elem, dfa);
2407             tree = create_tree (tree, work_tree, CONCAT, 0);
2408             if (BE (work_tree == NULL || tree == NULL, 0))
2409               goto parse_dup_op_espace;
2410           }
2411
2412       if (end == -1)
2413         {
2414           /* We treat "<re>{0,}" as "<re>*".  */
2415           dup_token.type = OP_DUP_ASTERISK;
2416           if (start > 0)
2417             {
2418               elem = duplicate_tree (elem, dfa);
2419               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2420               work_tree = create_tree (elem, NULL, 0, new_idx);
2421               tree = create_tree (tree, work_tree, CONCAT, 0);
2422               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2423                       || tree == NULL, 0))
2424                 goto parse_dup_op_espace;
2425             }
2426           else
2427             {
2428               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2429               tree = create_tree (elem, NULL, 0, new_idx);
2430               if (BE (new_idx == -1 || tree == NULL, 0))
2431                 goto parse_dup_op_espace;
2432             }
2433         }
2434       else if (BE (start > end, 0))
2435         {
2436           /* First  number greater than first.  */
2437           *err = REG_BADBR;
2438           return NULL;
2439         }
2440       else if (end - start > 0)
2441         {
2442           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2443           dup_token.type = OP_DUP_QUESTION;
2444           if (start > 0)
2445             {
2446               elem = duplicate_tree (elem, dfa);
2447               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2448               elem = create_tree (elem, NULL, 0, new_idx);
2449               tree = create_tree (tree, elem, CONCAT, 0);
2450               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2451                 goto parse_dup_op_espace;
2452             }
2453           else
2454             {
2455               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2456               tree = elem = create_tree (elem, NULL, 0, new_idx);
2457               if (BE (new_idx == -1 || tree == NULL, 0))
2458                 goto parse_dup_op_espace;
2459             }
2460           for (i = 1; i < end - start; ++i)
2461             {
2462               work_tree = duplicate_tree (elem, dfa);
2463               tree = create_tree (tree, work_tree, CONCAT, 0);
2464               if (BE (work_tree == NULL || tree == NULL, 0))
2465                 {
2466                   *err = REG_ESPACE;
2467                   return NULL;
2468                 }
2469             }
2470         }
2471     }
2472   else
2473     {
2474       new_idx = re_dfa_add_node (dfa, *token, 0);
2475       tree = create_tree (tree, NULL, 0, new_idx);
2476       if (BE (new_idx == -1 || tree == NULL, 0))
2477         {
2478           *err = REG_ESPACE;
2479           return NULL;
2480         }
2481     }
2482   *token = fetch_token (regexp, syntax);
2483   return tree;
2484
2485  parse_dup_op_espace:
2486   free_bin_tree (tree);
2487   *err = REG_ESPACE;
2488   return NULL;
2489
2490  parse_dup_op_ebrace:
2491   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2492     {
2493       *err = REG_EBRACE;
2494       return NULL;
2495     }
2496   goto parse_dup_op_rollback;
2497  parse_dup_op_invalid_interval:
2498   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2499     {
2500       *err = REG_BADBR;
2501       return NULL;
2502     }
2503  parse_dup_op_rollback:
2504   re_string_set_index (regexp, start_idx);
2505   *token = start_token;
2506   token->type = CHARACTER;
2507   return dup_elem;
2508 }
2509
2510 /* Size of the names for collating symbol/equivalence_class/character_class.
2511    I'm not sure, but maybe enough.  */
2512 #define BRACKET_NAME_BUF_SIZE 32
2513
2514 #ifndef _LIBC
2515   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2516      Build the range expression which starts from START_ELEM, and ends
2517      at END_ELEM.  The result are written to MBCSET and SBCSET.
2518      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2519      mbcset->range_ends, is a pointer argument sinse we may
2520      update it.  */
2521
2522 static reg_errcode_t
2523 # ifdef RE_ENABLE_I18N
2524 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2525      re_charset_t *mbcset;
2526      int *range_alloc;
2527 # else /* not RE_ENABLE_I18N */
2528 build_range_exp (sbcset, start_elem, end_elem)
2529 # endif /* not RE_ENABLE_I18N */
2530      re_bitset_ptr_t sbcset;
2531      bracket_elem_t *start_elem, *end_elem;
2532 {
2533   unsigned int start_ch, end_ch;
2534   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2535   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2536           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2537           0))
2538     return REG_ERANGE;
2539
2540   /* We can handle no multi character collating elements without libc
2541      support.  */
2542   if (BE ((start_elem->type == COLL_SYM
2543            && strlen ((char *) start_elem->opr.name) > 1)
2544           || (end_elem->type == COLL_SYM
2545               && strlen ((char *) end_elem->opr.name) > 1), 0))
2546     return REG_ECOLLATE;
2547
2548 # ifdef RE_ENABLE_I18N
2549   {
2550     wchar_t wc, start_wc, end_wc;
2551     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2552
2553     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2554                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2555                    : 0));
2556     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2557               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2558                  : 0));
2559     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2560                 ? __btowc (start_ch) : start_elem->opr.wch);
2561     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2562               ? __btowc (end_ch) : end_elem->opr.wch);
2563     cmp_buf[0] = start_wc;
2564     cmp_buf[4] = end_wc;
2565     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2566       return REG_ERANGE;
2567
2568     /* Check the space of the arrays.  */
2569     if (*range_alloc == mbcset->nranges)
2570       {
2571         /* There are not enough space, need realloc.  */
2572         wchar_t *new_array_start, *new_array_end;
2573         int new_nranges;
2574
2575         /* +1 in case of mbcset->nranges is 0.  */
2576         new_nranges = 2 * mbcset->nranges + 1;
2577         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2578            are NULL if *range_alloc == 0.  */
2579         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2580                                       new_nranges);
2581         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2582                                     new_nranges);
2583
2584         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2585           return REG_ESPACE;
2586
2587         mbcset->range_starts = new_array_start;
2588         mbcset->range_ends = new_array_end;
2589         *range_alloc = new_nranges;
2590       }
2591
2592     mbcset->range_starts[mbcset->nranges] = start_wc;
2593     mbcset->range_ends[mbcset->nranges++] = end_wc;
2594
2595     /* Build the table for single byte characters.  */
2596     for (wc = 0; wc <= SBC_MAX; ++wc)
2597       {
2598         cmp_buf[2] = wc;
2599         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2600             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2601           bitset_set (sbcset, wc);
2602       }
2603   }
2604 # else /* not RE_ENABLE_I18N */
2605   {
2606     unsigned int ch;
2607     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2608                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2609                    : 0));
2610     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2611               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2612                  : 0));
2613     if (start_ch > end_ch)
2614       return REG_ERANGE;
2615     /* Build the table for single byte characters.  */
2616     for (ch = 0; ch <= SBC_MAX; ++ch)
2617       if (start_ch <= ch  && ch <= end_ch)
2618         bitset_set (sbcset, ch);
2619   }
2620 # endif /* not RE_ENABLE_I18N */
2621   return REG_NOERROR;
2622 }
2623 #endif /* not _LIBC */
2624
2625 #ifndef _LIBC
2626 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2627    Build the collating element which is represented by NAME.
2628    The result are written to MBCSET and SBCSET.
2629    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2630    pointer argument since we may update it.  */
2631
2632 static reg_errcode_t
2633 # ifdef RE_ENABLE_I18N
2634 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2635      re_charset_t *mbcset;
2636      int *coll_sym_alloc;
2637 # else /* not RE_ENABLE_I18N */
2638 build_collating_symbol (sbcset, name)
2639 # endif /* not RE_ENABLE_I18N */
2640      re_bitset_ptr_t sbcset;
2641      const unsigned char *name;
2642 {
2643   size_t name_len = strlen ((const char *) name);
2644   if (BE (name_len != 1, 0))
2645     return REG_ECOLLATE;
2646   else
2647     {
2648       bitset_set (sbcset, name[0]);
2649       return REG_NOERROR;
2650     }
2651 }
2652 #endif /* not _LIBC */
2653
2654 /* This function parse bracket expression like "[abc]", "[a-c]",
2655    "[[.a-a.]]" etc.  */
2656
2657 static bin_tree_t *
2658 parse_bracket_exp (regexp, dfa, token, syntax, err)
2659      re_string_t *regexp;
2660      re_dfa_t *dfa;
2661      re_token_t *token;
2662      reg_syntax_t syntax;
2663      reg_errcode_t *err;
2664 {
2665 #ifdef _LIBC
2666   const unsigned char *collseqmb;
2667   const char *collseqwc;
2668   uint32_t nrules;
2669   int32_t table_size;
2670   const int32_t *symb_table;
2671   const unsigned char *extra;
2672
2673   /* Local function for parse_bracket_exp used in _LIBC environement.
2674      Seek the collating symbol entry correspondings to NAME.
2675      Return the index of the symbol in the SYMB_TABLE.  */
2676
2677   static inline int32_t
2678   __attribute ((always_inline))
2679   seek_collating_symbol_entry (name, name_len)
2680          const unsigned char *name;
2681          size_t name_len;
2682     {
2683       int32_t hash = elem_hash ((const char *) name, name_len);
2684       int32_t elem = hash % table_size;
2685       int32_t second = hash % (table_size - 2);
2686       while (symb_table[2 * elem] != 0)
2687         {
2688           /* First compare the hashing value.  */
2689           if (symb_table[2 * elem] == hash
2690               /* Compare the length of the name.  */
2691               && name_len == extra[symb_table[2 * elem + 1]]
2692               /* Compare the name.  */
2693               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2694                          name_len) == 0)
2695             {
2696               /* Yep, this is the entry.  */
2697               break;
2698             }
2699
2700           /* Next entry.  */
2701           elem += second;
2702         }
2703       return elem;
2704     }
2705
2706   /* Local function for parse_bracket_exp used in _LIBC environement.
2707      Look up the collation sequence value of BR_ELEM.
2708      Return the value if succeeded, UINT_MAX otherwise.  */
2709
2710   static inline unsigned int
2711   __attribute ((always_inline))
2712   lookup_collation_sequence_value (br_elem)
2713          bracket_elem_t *br_elem;
2714     {
2715       if (br_elem->type == SB_CHAR)
2716         {
2717           /*
2718           if (MB_CUR_MAX == 1)
2719           */
2720           if (nrules == 0)
2721             return collseqmb[br_elem->opr.ch];
2722           else
2723             {
2724               wint_t wc = __btowc (br_elem->opr.ch);
2725               return __collseq_table_lookup (collseqwc, wc);
2726             }
2727         }
2728       else if (br_elem->type == MB_CHAR)
2729         {
2730           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2731         }
2732       else if (br_elem->type == COLL_SYM)
2733         {
2734           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2735           if (nrules != 0)
2736             {
2737               int32_t elem, idx;
2738               elem = seek_collating_symbol_entry (br_elem->opr.name,
2739                                                   sym_name_len);
2740               if (symb_table[2 * elem] != 0)
2741                 {
2742                   /* We found the entry.  */
2743                   idx = symb_table[2 * elem + 1];
2744                   /* Skip the name of collating element name.  */
2745                   idx += 1 + extra[idx];
2746                   /* Skip the byte sequence of the collating element.  */
2747                   idx += 1 + extra[idx];
2748                   /* Adjust for the alignment.  */
2749                   idx = (idx + 3) & ~3;
2750                   /* Skip the multibyte collation sequence value.  */
2751                   idx += sizeof (unsigned int);
2752                   /* Skip the wide char sequence of the collating element.  */
2753                   idx += sizeof (unsigned int) *
2754                     (1 + *(unsigned int *) (extra + idx));
2755                   /* Return the collation sequence value.  */
2756                   return *(unsigned int *) (extra + idx);
2757                 }
2758               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2759                 {
2760                   /* No valid character.  Match it as a single byte
2761                      character.  */
2762                   return collseqmb[br_elem->opr.name[0]];
2763                 }
2764             }
2765           else if (sym_name_len == 1)
2766             return collseqmb[br_elem->opr.name[0]];
2767         }
2768       return UINT_MAX;
2769     }
2770
2771   /* Local function for parse_bracket_exp used in _LIBC environement.
2772      Build the range expression which starts from START_ELEM, and ends
2773      at END_ELEM.  The result are written to MBCSET and SBCSET.
2774      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2775      mbcset->range_ends, is a pointer argument sinse we may
2776      update it.  */
2777
2778   static inline reg_errcode_t
2779   __attribute ((always_inline))
2780 # ifdef RE_ENABLE_I18N
2781   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2782          re_charset_t *mbcset;
2783          int *range_alloc;
2784 # else /* not RE_ENABLE_I18N */
2785   build_range_exp (sbcset, start_elem, end_elem)
2786 # endif /* not RE_ENABLE_I18N */
2787          re_bitset_ptr_t sbcset;
2788          bracket_elem_t *start_elem, *end_elem;
2789     {
2790       unsigned int ch;
2791       uint32_t start_collseq;
2792       uint32_t end_collseq;
2793
2794 # ifdef RE_ENABLE_I18N
2795       /* Check the space of the arrays.  */
2796       if (*range_alloc == mbcset->nranges)
2797         {
2798           /* There are not enough space, need realloc.  */
2799           uint32_t *new_array_start;
2800           uint32_t *new_array_end;
2801           int new_nranges;
2802
2803           /* +1 in case of mbcset->nranges is 0.  */
2804           new_nranges = 2 * mbcset->nranges + 1;
2805           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2806              are NULL if *range_alloc == 0.  */
2807           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2808                                         new_nranges);
2809           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2810                                       new_nranges);
2811
2812           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2813             return REG_ESPACE;
2814
2815           mbcset->range_starts = new_array_start;
2816           mbcset->range_ends = new_array_end;
2817           *range_alloc = new_nranges;
2818         }
2819 # endif /* RE_ENABLE_I18N */
2820
2821       /* Equivalence Classes and Character Classes can't be a range
2822          start/end.  */
2823       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2824               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2825               0))
2826         return REG_ERANGE;
2827
2828       start_collseq = lookup_collation_sequence_value (start_elem);
2829       end_collseq = lookup_collation_sequence_value (end_elem);
2830       /* Check start/end collation sequence values.  */
2831       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2832         return REG_ECOLLATE;
2833       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2834         return REG_ERANGE;
2835
2836 # ifdef RE_ENABLE_I18N
2837       /* Got valid collation sequence values, add them as a new entry.  */
2838       mbcset->range_starts[mbcset->nranges] = start_collseq;
2839       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2840 # endif /* RE_ENABLE_I18N */
2841
2842       /* Build the table for single byte characters.  */
2843       for (ch = 0; ch <= SBC_MAX; ch++)
2844         {
2845           uint32_t ch_collseq;
2846           /*
2847           if (MB_CUR_MAX == 1)
2848           */
2849           if (nrules == 0)
2850             ch_collseq = collseqmb[ch];
2851           else
2852             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2853           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2854             bitset_set (sbcset, ch);
2855         }
2856       return REG_NOERROR;
2857     }
2858
2859   /* Local function for parse_bracket_exp used in _LIBC environement.
2860      Build the collating element which is represented by NAME.
2861      The result are written to MBCSET and SBCSET.
2862      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2863      pointer argument sinse we may update it.  */
2864
2865   static inline reg_errcode_t
2866   __attribute ((always_inline))
2867 # ifdef RE_ENABLE_I18N
2868   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2869          re_charset_t *mbcset;
2870          int *coll_sym_alloc;
2871 # else /* not RE_ENABLE_I18N */
2872   build_collating_symbol (sbcset, name)
2873 # endif /* not RE_ENABLE_I18N */
2874          re_bitset_ptr_t sbcset;
2875          const unsigned char *name;
2876     {
2877       int32_t elem, idx;
2878       size_t name_len = strlen ((const char *) name);
2879       if (nrules != 0)
2880         {
2881           elem = seek_collating_symbol_entry (name, name_len);
2882           if (symb_table[2 * elem] != 0)
2883             {
2884               /* We found the entry.  */
2885               idx = symb_table[2 * elem + 1];
2886               /* Skip the name of collating element name.  */
2887               idx += 1 + extra[idx];
2888             }
2889           else if (symb_table[2 * elem] == 0 && name_len == 1)
2890             {
2891               /* No valid character, treat it as a normal
2892                  character.  */
2893               bitset_set (sbcset, name[0]);
2894               return REG_NOERROR;
2895             }
2896           else
2897             return REG_ECOLLATE;
2898
2899 # ifdef RE_ENABLE_I18N
2900           /* Got valid collation sequence, add it as a new entry.  */
2901           /* Check the space of the arrays.  */
2902           if (*coll_sym_alloc == mbcset->ncoll_syms)
2903             {
2904               /* Not enough, realloc it.  */
2905               /* +1 in case of mbcset->ncoll_syms is 0.  */
2906               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2907               /* Use realloc since mbcset->coll_syms is NULL
2908                  if *alloc == 0.  */
2909               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2910                                               *coll_sym_alloc);
2911               if (BE (mbcset->coll_syms == NULL, 0))
2912                 return REG_ESPACE;
2913             }
2914           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2915 # endif /* RE_ENABLE_I18N */
2916           return REG_NOERROR;
2917         }
2918       else
2919         {
2920           if (BE (name_len != 1, 0))
2921             return REG_ECOLLATE;
2922           else
2923             {
2924               bitset_set (sbcset, name[0]);
2925               return REG_NOERROR;
2926             }
2927         }
2928     }
2929 #endif
2930
2931   re_token_t br_token;
2932   re_bitset_ptr_t sbcset;
2933 #ifdef RE_ENABLE_I18N
2934   re_charset_t *mbcset;
2935   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2936   int equiv_class_alloc = 0, char_class_alloc = 0;
2937 #else /* not RE_ENABLE_I18N */
2938   int non_match = 0;
2939 #endif /* not RE_ENABLE_I18N */
2940   bin_tree_t *work_tree;
2941   int token_len, new_idx;
2942 #ifdef _LIBC
2943   collseqmb = (const unsigned char *)
2944     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2945   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2946   if (nrules)
2947     {
2948       /*
2949       if (MB_CUR_MAX > 1)
2950       */
2951         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2952       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2953       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2954                                                   _NL_COLLATE_SYMB_TABLEMB);
2955       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2956                                                    _NL_COLLATE_SYMB_EXTRAMB);
2957     }
2958 #endif
2959   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2960 #ifdef RE_ENABLE_I18N
2961   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2962 #endif /* RE_ENABLE_I18N */
2963 #ifdef RE_ENABLE_I18N
2964   if (BE (sbcset == NULL || mbcset == NULL, 0))
2965 #else
2966   if (BE (sbcset == NULL, 0))
2967 #endif /* RE_ENABLE_I18N */
2968     {
2969       *err = REG_ESPACE;
2970       return NULL;
2971     }
2972
2973   token_len = peek_token_bracket (token, regexp, syntax);
2974   if (BE (token->type == END_OF_RE, 0))
2975     {
2976       *err = REG_BADPAT;
2977       goto parse_bracket_exp_free_return;
2978     }
2979   if (token->type == OP_NON_MATCH_LIST)
2980     {
2981 #ifdef RE_ENABLE_I18N
2982       int i;
2983       mbcset->non_match = 1;
2984 #else /* not RE_ENABLE_I18N */
2985       non_match = 1;
2986 #endif /* not RE_ENABLE_I18N */
2987       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2988         bitset_set (sbcset, '\0');
2989       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2990       token_len = peek_token_bracket (token, regexp, syntax);
2991       if (BE (token->type == END_OF_RE, 0))
2992         {
2993           *err = REG_BADPAT;
2994           goto parse_bracket_exp_free_return;
2995         }
2996 #ifdef RE_ENABLE_I18N
2997       if (dfa->mb_cur_max > 1)
2998         for (i = 0; i < SBC_MAX; ++i)
2999           if (__btowc (i) == WEOF)
3000             bitset_set (sbcset, i);
3001 #endif /* RE_ENABLE_I18N */
3002     }
3003
3004   /* We treat the first ']' as a normal character.  */
3005   if (token->type == OP_CLOSE_BRACKET)
3006     token->type = CHARACTER;
3007
3008   int first_round = 1;
3009   while (1)
3010     {
3011       bracket_elem_t start_elem, end_elem;
3012       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3013       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3014       reg_errcode_t ret;
3015       int token_len2 = 0, is_range_exp = 0;
3016       re_token_t token2;
3017
3018       start_elem.opr.name = start_name_buf;
3019       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3020                                    syntax, first_round);
3021       if (BE (ret != REG_NOERROR, 0))
3022         {
3023           *err = ret;
3024           goto parse_bracket_exp_free_return;
3025         }
3026       first_round = 0;
3027
3028       /* Get information about the next token.  We need it in any case.  */
3029       token_len = peek_token_bracket (token, regexp, syntax);
3030
3031       /* Do not check for ranges if we know they are not allowed.  */
3032       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3033         {
3034           if (BE (token->type == END_OF_RE, 0))
3035             {
3036               *err = REG_EBRACK;
3037               goto parse_bracket_exp_free_return;
3038             }
3039           if (token->type == OP_CHARSET_RANGE)
3040             {
3041               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3042               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3043               if (BE (token2.type == END_OF_RE, 0))
3044                 {
3045                   *err = REG_EBRACK;
3046                   goto parse_bracket_exp_free_return;
3047                 }
3048               if (token2.type == OP_CLOSE_BRACKET)
3049                 {
3050                   /* We treat the last '-' as a normal character.  */
3051                   re_string_skip_bytes (regexp, -token_len);
3052                   token->type = CHARACTER;
3053                 }
3054               else
3055                 is_range_exp = 1;
3056             }
3057         }
3058
3059       if (is_range_exp == 1)
3060         {
3061           end_elem.opr.name = end_name_buf;
3062           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3063                                        dfa, syntax, 1);
3064           if (BE (ret != REG_NOERROR, 0))
3065             {
3066               *err = ret;
3067               goto parse_bracket_exp_free_return;
3068             }
3069
3070           token_len = peek_token_bracket (token, regexp, syntax);
3071
3072           *err = build_range_exp (sbcset,
3073 #ifdef RE_ENABLE_I18N
3074                                   mbcset, &range_alloc,
3075 #endif /* RE_ENABLE_I18N */
3076                                   &start_elem, &end_elem);
3077           if (BE (*err != REG_NOERROR, 0))
3078             goto parse_bracket_exp_free_return;
3079         }
3080       else
3081         {
3082           switch (start_elem.type)
3083             {
3084             case SB_CHAR:
3085               bitset_set (sbcset, start_elem.opr.ch);
3086               break;
3087 #ifdef RE_ENABLE_I18N
3088             case MB_CHAR:
3089               /* Check whether the array has enough space.  */
3090               if (mbchar_alloc == mbcset->nmbchars)
3091                 {
3092                   /* Not enough, realloc it.  */
3093                   /* +1 in case of mbcset->nmbchars is 0.  */
3094                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3095                   /* Use realloc since array is NULL if *alloc == 0.  */
3096                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
3097                                                 mbchar_alloc);
3098                   if (BE (mbcset->mbchars == NULL, 0))
3099                     goto parse_bracket_exp_espace;
3100                 }
3101               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3102               break;
3103 #endif /* RE_ENABLE_I18N */
3104             case EQUIV_CLASS:
3105               *err = build_equiv_class (sbcset,
3106 #ifdef RE_ENABLE_I18N
3107                                         mbcset, &equiv_class_alloc,
3108 #endif /* RE_ENABLE_I18N */
3109                                         start_elem.opr.name);
3110               if (BE (*err != REG_NOERROR, 0))
3111                 goto parse_bracket_exp_free_return;
3112               break;
3113             case COLL_SYM:
3114               *err = build_collating_symbol (sbcset,
3115 #ifdef RE_ENABLE_I18N
3116                                              mbcset, &coll_sym_alloc,
3117 #endif /* RE_ENABLE_I18N */
3118                                              start_elem.opr.name);
3119               if (BE (*err != REG_NOERROR, 0))
3120                 goto parse_bracket_exp_free_return;
3121               break;
3122             case CHAR_CLASS:
3123               *err = build_charclass (regexp->trans, sbcset,
3124 #ifdef RE_ENABLE_I18N
3125                                       mbcset, &char_class_alloc,
3126 #endif /* RE_ENABLE_I18N */
3127                                       start_elem.opr.name, syntax);
3128               if (BE (*err != REG_NOERROR, 0))
3129                goto parse_bracket_exp_free_return;
3130               break;
3131             default:
3132               assert (0);
3133               break;
3134             }
3135         }
3136       if (BE (token->type == END_OF_RE, 0))
3137         {
3138           *err = REG_EBRACK;
3139           goto parse_bracket_exp_free_return;
3140         }
3141       if (token->type == OP_CLOSE_BRACKET)
3142         break;
3143     }
3144
3145   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3146
3147   /* If it is non-matching list.  */
3148 #ifdef RE_ENABLE_I18N
3149   if (mbcset->non_match)
3150 #else /* not RE_ENABLE_I18N */
3151   if (non_match)
3152 #endif /* not RE_ENABLE_I18N */
3153     bitset_not (sbcset);
3154
3155   /* Build a tree for simple bracket.  */
3156   br_token.type = SIMPLE_BRACKET;
3157   br_token.opr.sbcset = sbcset;
3158   new_idx = re_dfa_add_node (dfa, br_token, 0);
3159   work_tree = create_tree (NULL, NULL, 0, new_idx);
3160   if (BE (new_idx == -1 || work_tree == NULL, 0))
3161     goto parse_bracket_exp_espace;
3162
3163 #ifdef RE_ENABLE_I18N
3164   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3165       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3166                                                      || mbcset->non_match)))
3167     {
3168       re_token_t alt_token;
3169       bin_tree_t *mbc_tree;
3170       int sbc_idx;
3171       /* Build a tree for complex bracket.  */
3172       dfa->has_mb_node = 1;
3173       dfa->has_plural_match = 1;
3174       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3175         if (sbcset[sbc_idx])
3176           break;
3177       /* If there are no bits set in sbcset, there is no point
3178          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3179       if (sbc_idx == BITSET_UINTS)
3180         {
3181           re_free (sbcset);
3182           dfa->nodes[new_idx].type = COMPLEX_BRACKET;
3183           dfa->nodes[new_idx].opr.mbcset = mbcset;
3184           return work_tree;
3185         }
3186       br_token.type = COMPLEX_BRACKET;
3187       br_token.opr.mbcset = mbcset;
3188       new_idx = re_dfa_add_node (dfa, br_token, 0);
3189       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3190       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3191         goto parse_bracket_exp_espace;
3192       /* Then join them by ALT node.  */
3193       alt_token.type = OP_ALT;
3194       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3195       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3196       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3197         return work_tree;
3198     }
3199   else
3200     {
3201       free_charset (mbcset);
3202       return work_tree;
3203     }
3204 #else /* not RE_ENABLE_I18N */
3205   return work_tree;
3206 #endif /* not RE_ENABLE_I18N */
3207
3208  parse_bracket_exp_espace:
3209   *err = REG_ESPACE;
3210  parse_bracket_exp_free_return:
3211   re_free (sbcset);
3212 #ifdef RE_ENABLE_I18N
3213   free_charset (mbcset);
3214 #endif /* RE_ENABLE_I18N */
3215   return NULL;
3216 }
3217
3218 /* Parse an element in the bracket expression.  */
3219
3220 static reg_errcode_t
3221 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3222                        accept_hyphen)
3223      bracket_elem_t *elem;
3224      re_string_t *regexp;
3225      re_token_t *token;
3226      int token_len;
3227      re_dfa_t *dfa;
3228      reg_syntax_t syntax;
3229      int accept_hyphen;
3230 {
3231 #ifdef RE_ENABLE_I18N
3232   int cur_char_size;
3233   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3234   if (cur_char_size > 1)
3235     {
3236       elem->type = MB_CHAR;
3237       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3238       re_string_skip_bytes (regexp, cur_char_size);
3239       return REG_NOERROR;
3240     }
3241 #endif /* RE_ENABLE_I18N */
3242   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3243   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3244       || token->type == OP_OPEN_EQUIV_CLASS)
3245     return parse_bracket_symbol (elem, regexp, token);
3246   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3247     {
3248       /* A '-' must only appear as anything but a range indicator before
3249          the closing bracket.  Everything else is an error.  */
3250       re_token_t token2;
3251       (void) peek_token_bracket (&token2, regexp, syntax);
3252       if (token2.type != OP_CLOSE_BRACKET)
3253         /* The actual error value is not standardized since this whole
3254            case is undefined.  But ERANGE makes good sense.  */
3255         return REG_ERANGE;
3256     }
3257   elem->type = SB_CHAR;
3258   elem->opr.ch = token->opr.c;
3259   return REG_NOERROR;
3260 }
3261
3262 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3263    such as [:<character_class>:], [.<collating_element>.], and
3264    [=<equivalent_class>=].  */
3265
3266 static reg_errcode_t
3267 parse_bracket_symbol (elem, regexp, token)
3268      bracket_elem_t *elem;
3269      re_string_t *regexp;
3270      re_token_t *token;
3271 {
3272   unsigned char ch, delim = token->opr.c;
3273   int i = 0;
3274   for (;; ++i)
3275     {
3276       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3277         return REG_EBRACK;
3278       if (token->type == OP_OPEN_CHAR_CLASS)
3279         ch = re_string_fetch_byte_case (regexp);
3280       else
3281         ch = re_string_fetch_byte (regexp);
3282       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3283         break;
3284       elem->opr.name[i] = ch;
3285     }
3286   re_string_skip_bytes (regexp, 1);
3287   elem->opr.name[i] = '\0';
3288   switch (token->type)
3289     {
3290     case OP_OPEN_COLL_ELEM:
3291       elem->type = COLL_SYM;
3292       break;
3293     case OP_OPEN_EQUIV_CLASS:
3294       elem->type = EQUIV_CLASS;
3295       break;
3296     case OP_OPEN_CHAR_CLASS:
3297       elem->type = CHAR_CLASS;
3298       break;
3299     default:
3300       break;
3301     }
3302   return REG_NOERROR;
3303 }
3304
3305   /* Helper function for parse_bracket_exp.
3306      Build the equivalence class which is represented by NAME.
3307      The result are written to MBCSET and SBCSET.
3308      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3309      is a pointer argument sinse we may update it.  */
3310
3311 static reg_errcode_t
3312 #ifdef RE_ENABLE_I18N
3313 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3314      re_charset_t *mbcset;
3315      int *equiv_class_alloc;
3316 #else /* not RE_ENABLE_I18N */
3317 build_equiv_class (sbcset, name)
3318 #endif /* not RE_ENABLE_I18N */
3319      re_bitset_ptr_t sbcset;
3320      const unsigned char *name;
3321 {
3322 #if defined _LIBC && defined RE_ENABLE_I18N
3323   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3324   if (nrules != 0)
3325     {
3326       const int32_t *table, *indirect;
3327       const unsigned char *weights, *extra, *cp;
3328       unsigned char char_buf[2];
3329       int32_t idx1, idx2;
3330       unsigned int ch;
3331       size_t len;
3332       /* This #include defines a local function!  */
3333 # include <locale/weight.h>
3334       /* Calculate the index for equivalence class.  */
3335       cp = name;
3336       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3337       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3338                                                _NL_COLLATE_WEIGHTMB);
3339       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3340                                                    _NL_COLLATE_EXTRAMB);
3341       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3342                                                 _NL_COLLATE_INDIRECTMB);
3343       idx1 = findidx (&cp);
3344       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3345         /* This isn't a valid character.  */
3346         return REG_ECOLLATE;
3347
3348       /* Build single byte matcing table for this equivalence class.  */
3349       char_buf[1] = (unsigned char) '\0';
3350       len = weights[idx1];
3351       for (ch = 0; ch < SBC_MAX; ++ch)
3352         {
3353           char_buf[0] = ch;
3354           cp = char_buf;
3355           idx2 = findidx (&cp);
3356 /*
3357           idx2 = table[ch];
3358 */
3359           if (idx2 == 0)
3360             /* This isn't a valid character.  */
3361             continue;
3362           if (len == weights[idx2])
3363             {
3364               int cnt = 0;
3365               while (cnt <= len &&
3366                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3367                 ++cnt;
3368
3369               if (cnt > len)
3370                 bitset_set (sbcset, ch);
3371             }
3372         }
3373       /* Check whether the array has enough space.  */
3374       if (*equiv_class_alloc == mbcset->nequiv_classes)
3375         {
3376           /* Not enough, realloc it.  */
3377           /* +1 in case of mbcset->nequiv_classes is 0.  */
3378           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3379           /* Use realloc since the array is NULL if *alloc == 0.  */
3380           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3381                                               *equiv_class_alloc);
3382           if (BE (mbcset->equiv_classes == NULL, 0))
3383             return REG_ESPACE;
3384         }
3385       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3386     }
3387   else
3388 #endif /* _LIBC && RE_ENABLE_I18N */
3389     {
3390       if (BE (strlen ((const char *) name) != 1, 0))
3391         return REG_ECOLLATE;
3392       bitset_set (sbcset, *name);
3393     }
3394   return REG_NOERROR;
3395 }
3396
3397   /* Helper function for parse_bracket_exp.
3398      Build the character class which is represented by NAME.
3399      The result are written to MBCSET and SBCSET.
3400      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3401      is a pointer argument sinse we may update it.  */
3402
3403 static reg_errcode_t
3404 #ifdef RE_ENABLE_I18N
3405 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3406      re_charset_t *mbcset;
3407      int *char_class_alloc;
3408 #else /* not RE_ENABLE_I18N */
3409 build_charclass (trans, sbcset, class_name, syntax)
3410 #endif /* not RE_ENABLE_I18N */
3411      RE_TRANSLATE_TYPE trans;
3412      re_bitset_ptr_t sbcset;
3413      const unsigned char *class_name;
3414      reg_syntax_t syntax;
3415 {
3416   int i;
3417   const char *name = (const char *) class_name;
3418
3419   /* In case of REG_ICASE "upper" and "lower" match the both of
3420      upper and lower cases.  */
3421   if ((syntax & RE_ICASE)
3422       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3423     name = "alpha";
3424
3425 #ifdef RE_ENABLE_I18N
3426   /* Check the space of the arrays.  */
3427   if (*char_class_alloc == mbcset->nchar_classes)
3428     {
3429       /* Not enough, realloc it.  */
3430       /* +1 in case of mbcset->nchar_classes is 0.  */
3431       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3432       /* Use realloc since array is NULL if *alloc == 0.  */
3433       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3434                                          *char_class_alloc);
3435       if (BE (mbcset->char_classes == NULL, 0))
3436         return REG_ESPACE;
3437     }
3438   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3439 #endif /* RE_ENABLE_I18N */
3440
3441 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3442     for (i = 0; i < SBC_MAX; ++i)               \
3443       {                                         \
3444         if (ctype_func (i))                     \
3445           {                                     \
3446             int ch = trans ? trans[i] : i;      \
3447             bitset_set (sbcset, ch);            \
3448           }                                     \
3449       }
3450
3451   if (strcmp (name, "alnum") == 0)
3452     BUILD_CHARCLASS_LOOP (isalnum)
3453   else if (strcmp (name, "cntrl") == 0)
3454     BUILD_CHARCLASS_LOOP (iscntrl)
3455   else if (strcmp (name, "lower") == 0)
3456     BUILD_CHARCLASS_LOOP (islower)
3457   else if (strcmp (name, "space") == 0)
3458     BUILD_CHARCLASS_LOOP (isspace)
3459   else if (strcmp (name, "alpha") == 0)
3460     BUILD_CHARCLASS_LOOP (isalpha)
3461   else if (strcmp (name, "digit") == 0)
3462     BUILD_CHARCLASS_LOOP (isdigit)
3463   else if (strcmp (name, "print") == 0)
3464     BUILD_CHARCLASS_LOOP (isprint)
3465   else if (strcmp (name, "upper") == 0)
3466     BUILD_CHARCLASS_LOOP (isupper)
3467   else if (strcmp (name, "blank") == 0)
3468     BUILD_CHARCLASS_LOOP (isblank)
3469   else if (strcmp (name, "graph") == 0)
3470     BUILD_CHARCLASS_LOOP (isgraph)
3471   else if (strcmp (name, "punct") == 0)
3472     BUILD_CHARCLASS_LOOP (ispunct)
3473   else if (strcmp (name, "xdigit") == 0)
3474     BUILD_CHARCLASS_LOOP (isxdigit)
3475   else
3476     return REG_ECTYPE;
3477
3478   return REG_NOERROR;
3479 }
3480
3481 static bin_tree_t *
3482 build_charclass_op (dfa, trans, class_name, extra, not, err)
3483      re_dfa_t *dfa;
3484      RE_TRANSLATE_TYPE trans;
3485      const unsigned char *class_name;
3486      const unsigned char *extra;
3487      int not;
3488      reg_errcode_t *err;
3489 {
3490   re_bitset_ptr_t sbcset;
3491 #ifdef RE_ENABLE_I18N
3492   re_charset_t *mbcset;
3493   int alloc = 0;
3494 #else /* not RE_ENABLE_I18N */
3495   int non_match = 0;
3496 #endif /* not RE_ENABLE_I18N */
3497   reg_errcode_t ret;
3498   re_token_t br_token;
3499   bin_tree_t *tree;
3500   int new_idx;
3501
3502   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3503 #ifdef RE_ENABLE_I18N
3504   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3505 #endif /* RE_ENABLE_I18N */
3506
3507 #ifdef RE_ENABLE_I18N
3508   if (BE (sbcset == NULL || mbcset == NULL, 0))
3509 #else /* not RE_ENABLE_I18N */
3510   if (BE (sbcset == NULL, 0))
3511 #endif /* not RE_ENABLE_I18N */
3512     {
3513       *err = REG_ESPACE;
3514       return NULL;
3515     }
3516
3517   if (not)
3518     {
3519 #ifdef RE_ENABLE_I18N
3520       int i;
3521       /*
3522       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3523         bitset_set(cset->sbcset, '\0');
3524       */
3525       mbcset->non_match = 1;
3526       if (dfa->mb_cur_max > 1)
3527         for (i = 0; i < SBC_MAX; ++i)
3528           if (__btowc (i) == WEOF)
3529             bitset_set (sbcset, i);
3530 #else /* not RE_ENABLE_I18N */
3531       non_match = 1;
3532 #endif /* not RE_ENABLE_I18N */
3533     }
3534
3535   /* We don't care the syntax in this case.  */
3536   ret = build_charclass (trans, sbcset,
3537 #ifdef RE_ENABLE_I18N
3538                          mbcset, &alloc,
3539 #endif /* RE_ENABLE_I18N */
3540                          class_name, 0);
3541
3542   if (BE (ret != REG_NOERROR, 0))
3543     {
3544       re_free (sbcset);
3545 #ifdef RE_ENABLE_I18N
3546       free_charset (mbcset);
3547 #endif /* RE_ENABLE_I18N */
3548       *err = ret;
3549       return NULL;
3550     }
3551   /* \w match '_' also.  */
3552   for (; *extra; extra++)
3553     bitset_set (sbcset, *extra);
3554
3555   /* If it is non-matching list.  */
3556 #ifdef RE_ENABLE_I18N
3557   if (mbcset->non_match)
3558 #else /* not RE_ENABLE_I18N */
3559   if (non_match)
3560 #endif /* not RE_ENABLE_I18N */
3561     bitset_not (sbcset);
3562
3563   /* Build a tree for simple bracket.  */
3564   br_token.type = SIMPLE_BRACKET;
3565   br_token.opr.sbcset = sbcset;
3566   new_idx = re_dfa_add_node (dfa, br_token, 0);
3567   tree = create_tree (NULL, NULL, 0, new_idx);
3568   if (BE (new_idx == -1 || tree == NULL, 0))
3569     goto build_word_op_espace;
3570
3571 #ifdef RE_ENABLE_I18N
3572   if (dfa->mb_cur_max > 1)
3573     {
3574       re_token_t alt_token;
3575       bin_tree_t *mbc_tree;
3576       /* Build a tree for complex bracket.  */
3577       br_token.type = COMPLEX_BRACKET;
3578       br_token.opr.mbcset = mbcset;
3579       dfa->has_mb_node = 1;
3580       new_idx = re_dfa_add_node (dfa, br_token, 0);
3581       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3582       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3583         goto build_word_op_espace;
3584       /* Then join them by ALT node.  */
3585       alt_token.type = OP_ALT;
3586       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3587       tree = create_tree (tree, mbc_tree, 0, new_idx);
3588       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3589         return tree;
3590     }
3591   else
3592     {
3593       free_charset (mbcset);
3594       return tree;
3595     }
3596 #else /* not RE_ENABLE_I18N */
3597   return tree;
3598 #endif /* not RE_ENABLE_I18N */
3599
3600  build_word_op_espace:
3601   re_free (sbcset);
3602 #ifdef RE_ENABLE_I18N
3603   free_charset (mbcset);
3604 #endif /* RE_ENABLE_I18N */
3605   *err = REG_ESPACE;
3606   return NULL;
3607 }
3608
3609 /* This is intended for the expressions like "a{1,3}".
3610    Fetch a number from `input', and return the number.
3611    Return -1, if the number field is empty like "{,1}".
3612    Return -2, If an error is occured.  */
3613
3614 static int
3615 fetch_number (input, token, syntax)
3616      re_string_t *input;
3617      re_token_t *token;
3618      reg_syntax_t syntax;
3619 {
3620   int num = -1;
3621   unsigned char c;
3622   while (1)
3623     {
3624       *token = fetch_token (input, syntax);
3625       c = token->opr.c;
3626       if (BE (token->type == END_OF_RE, 0))
3627         return -2;
3628       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3629         break;
3630       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3631              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3632       num = (num > RE_DUP_MAX) ? -2 : num;
3633     }
3634   return num;
3635 }
3636 \f
3637 #ifdef RE_ENABLE_I18N
3638 static void
3639 free_charset (re_charset_t *cset)
3640 {
3641   re_free (cset->mbchars);
3642 # ifdef _LIBC
3643   re_free (cset->coll_syms);
3644   re_free (cset->equiv_classes);
3645   re_free (cset->range_starts);
3646   re_free (cset->range_ends);
3647 # endif
3648   re_free (cset->char_classes);
3649   re_free (cset);
3650 }
3651 #endif /* RE_ENABLE_I18N */
3652 \f
3653 /* Functions for binary tree operation.  */
3654
3655 /* Create a node of tree.
3656    Note: This function automatically free left and right if malloc fails.  */
3657
3658 static bin_tree_t *
3659 create_tree (left, right, type, index)
3660      bin_tree_t *left;
3661      bin_tree_t *right;
3662      re_token_type_t type;
3663      int index;
3664 {
3665   bin_tree_t *tree;
3666   tree = re_malloc (bin_tree_t, 1);
3667   if (BE (tree == NULL, 0))
3668     {
3669       free_bin_tree (left);
3670       free_bin_tree (right);
3671       return NULL;
3672     }
3673   tree->parent = NULL;
3674   tree->left = left;
3675   tree->right = right;
3676   tree->type = type;
3677   tree->node_idx = index;
3678   tree->first = -1;
3679   tree->next = -1;
3680   re_node_set_init_empty (&tree->eclosure);
3681
3682   if (left != NULL)
3683     left->parent = tree;
3684   if (right != NULL)
3685     right->parent = tree;
3686   return tree;
3687 }
3688
3689 /* Free the sub tree pointed by TREE.  */
3690
3691 static void
3692 free_bin_tree (tree)
3693      bin_tree_t *tree;
3694 {
3695   if (tree == NULL)
3696     return;
3697   /*re_node_set_free (&tree->eclosure);*/
3698   free_bin_tree (tree->left);
3699   free_bin_tree (tree->right);
3700   re_free (tree);
3701 }
3702
3703 /* Duplicate the node SRC, and return new node.  */
3704
3705 static bin_tree_t *
3706 duplicate_tree (src, dfa)
3707      const bin_tree_t *src;
3708      re_dfa_t *dfa;
3709 {
3710   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3711   int new_node_idx;
3712   /* Since node indies must be according to Post-order of the tree,
3713      we must duplicate the left at first.  */
3714   if (src->left != NULL)
3715     {
3716       left = duplicate_tree (src->left, dfa);
3717       if (left == NULL)
3718         return NULL;
3719     }
3720
3721   /* Secondaly, duplicate the right.  */
3722   if (src->right != NULL)
3723     {
3724       right = duplicate_tree (src->right, dfa);
3725       if (right == NULL)
3726         {
3727           free_bin_tree (left);
3728           return NULL;
3729         }
3730     }
3731
3732   /* At last, duplicate itself.  */
3733   if (src->type == NON_TYPE)
3734     {
3735       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3736       dfa->nodes[new_node_idx].duplicated = 1;
3737       if (BE (new_node_idx == -1, 0))
3738         {
3739           free_bin_tree (left);
3740           free_bin_tree (right);
3741           return NULL;
3742         }
3743     }
3744   else
3745     new_node_idx = src->type;
3746
3747   new_tree = create_tree (left, right, src->type, new_node_idx);
3748   if (BE (new_tree == NULL, 0))
3749     {
3750       free_bin_tree (left);
3751       free_bin_tree (right);
3752     }
3753   return new_tree;
3754 }