src/patches/grep-2.5.1a-redhat_fixes-2.patch

   1 Submitted by: Alexander E. Patrakov
   2 Date: 2005-08-13
   3 Initial Package Version: 2.5.1a
   4 Upstream Status: Partially accepted, partially rejected, but required for LSB >= 2.0 certification
   5 Origin: RedHat
   6 Description: Various fixes from RedHat. Individual patches:
   7
   8    grep-2.5.1-fgrep.patch
   9    grep-2.5.1-bracket.patch
  10    grep-2.5-i18n.patch
  11    grep-2.5.1-oi.patch
  12    grep-2.5.1-manpage.patch
  13    grep-2.5.1-color.patch
  14    grep-2.5.1-icolor.patch
  15    grep-2.5.1-egf-speedup.patch
  16    grep-2.5.1-dfa-optional.patch
  17    grep-2.5.1-tests.patch
  18    grep-2.5.1-w.patch
  19
  20 Testcases:
  21
  22  -fgrep: no known test case, but required for other patches
  23  -bracket: echo "[" | LANG=en_US.UTF-8 grep "[[:space:]]"
  24  -i18n: many fixes for multibyte locale support, required for LSB.
  25  -oi: echo xxYYzz | LANG=C grep -i -o yy
  26  -manpage: man page fixes (adds a missing .TP before -P; corrects --line-buffering to --line-buffered)
  27  -color: restore the background color correctly
  28  -icolor: ??? echo 'spam foo SPAM FOO' | grep -i --color spam
  29      (but that's also fixed by -oi. Is this patch just a cleanup?)
  30  -egf-speedup: without this, grep is as slow as a snail in UTF-8 locales.
  31  -dfa-optional: disables dfa in multibyte locales by default (the GREP_USE_DFA environment variable overrides this, e.g. GREP_USE_DFA=1).
  32  -w: (echo 'foo';echo 'fo') > /tmp/testfile && grep -F -w fo /tmp/testfile
  33
  34 diff -urN grep-2.5.1a.orig/doc/grep.1 grep-2.5.1a/doc/grep.1
  35 --- grep-2.5.1a.orig/doc/grep.1 2004-11-12 16:26:37.000000000 +0500
  36 +++ grep-2.5.1a/doc/grep.1      2005-10-23 09:49:43.000000000 +0600
  37 @@ -191,6 +191,7 @@
  38  .I PATTERN
  39  as a list of fixed strings, separated by newlines,
  40  any of which is to be matched.
  41 +.TP
  42  .BR \-P ", " \-\^\-perl-regexp
  43  Interpret
  44  .I PATTERN
  45 @@ -302,7 +303,7 @@
  46  This is especially useful for tools like zgrep, e.g.
  47  .B "gzip -cd foo.gz |grep --label=foo something"
  48  .TP
  49 -.BR \-\^\-line-buffering
  50 +.BR \-\^\-line-buffered
  51  Use line buffering, it can be a performance penality.
  52  .TP
  53  .BR \-q ", " \-\^\-quiet ", " \-\^\-silent
  54 diff -urN grep-2.5.1a.orig/lib/posix/regex.h grep-2.5.1a/lib/posix/regex.h
  55 --- grep-2.5.1a.orig/lib/posix/regex.h  2001-04-02 23:56:50.000000000 +0600
  56 +++ grep-2.5.1a/lib/posix/regex.h       2005-10-23 09:49:31.000000000 +0600
  57 @@ -109,6 +109,10 @@
  58     If not set, \{, \}, {, and } are literals.  */
  59  #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
  60
  61 +/* If this bit is set, then ignore case when matching.
  62 +   If not set, then case is significant.  */
  63 +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
  64 +
  65  /* If this bit is set, +, ? and | aren't recognized as operators.
  66     If not set, they are.  */
  67  #define RE_LIMITED_OPS (RE_INTERVALS << 1)
  68 diff -urN grep-2.5.1a.orig/src/dfa.c grep-2.5.1a/src/dfa.c
  69 --- grep-2.5.1a.orig/src/dfa.c  2001-09-26 22:57:55.000000000 +0600
  70 +++ grep-2.5.1a/src/dfa.c       2005-10-23 09:49:17.000000000 +0600
  71 @@ -414,7 +414,7 @@
  72
  73  /* This function fetch a wide character, and update cur_mb_len,
  74     used only if the current locale is a multibyte environment.  */
  75 -static wchar_t
  76 +static wint_t
  77  fetch_wc (char const *eoferr)
  78  {
  79    wchar_t wc;
  80 @@ -423,7 +423,7 @@
  81        if (eoferr != 0)
  82         dfaerror (eoferr);
  83        else
  84 -       return -1;
  85 +       return WEOF;
  86      }
  87
  88    cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
  89 @@ -459,7 +459,7 @@
  90  static void
  91  parse_bracket_exp_mb ()
  92  {
  93 -  wchar_t wc, wc1, wc2;
  94 +  wint_t wc, wc1, wc2;
  95
  96    /* Work area to build a mb_char_classes.  */
  97    struct mb_char_classes *work_mbc;
  98 @@ -496,7 +496,7 @@
  99      work_mbc->invert = 0;
 100    do
 101      {
 102 -      wc1 = -1; /* mark wc1 is not initialized".  */
 103 +      wc1 = WEOF; /* mark wc1 is not initialized".  */
 104
 105        /* Note that if we're looking at some other [:...:] construct,
 106          we just treat it as a bunch of ordinary characters.  We can do
 107 @@ -586,7 +586,7 @@
 108                       work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
 109                     }
 110                 }
 111 -             wc = -1;
 112 +             wc1 = wc = WEOF;
 113             }
 114           else
 115             /* We treat '[' as a normal character here.  */
 116 @@ -600,7 +600,7 @@
 117             wc = fetch_wc(("Unbalanced ["));
 118         }
 119
 120 -      if (wc1 == -1)
 121 +      if (wc1 == WEOF)
 122         wc1 = fetch_wc(_("Unbalanced ["));
 123
 124        if (wc1 == L'-')
 125 @@ -630,17 +630,17 @@
 126             }
 127           REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
 128                                range_sts_al, work_mbc->nranges + 1);
 129 -         work_mbc->range_sts[work_mbc->nranges] = wc;
 130 +         work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
 131           REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
 132                                range_ends_al, work_mbc->nranges + 1);
 133 -         work_mbc->range_ends[work_mbc->nranges++] = wc2;
 134 +         work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
 135         }
 136 -      else if (wc != -1)
 137 +      else if (wc != WEOF)
 138         /* build normal characters.  */
 139         {
 140           REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
 141                                work_mbc->nchars + 1);
 142 -         work_mbc->chars[work_mbc->nchars++] = wc;
 143 +         work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
 144         }
 145      }
 146    while ((wc = wc1) != L']');
 147 @@ -2552,6 +2552,8 @@
 148      }
 149
 150    /* match with a character?  */
 151 +  if (case_fold)
 152 +    wc = towlower (wc);
 153    for (i = 0; i<work_mbc->nchars; i++)
 154      {
 155        if (wc == work_mbc->chars[i])
 156 diff -urN grep-2.5.1a.orig/src/grep.c grep-2.5.1a/src/grep.c
 157 --- grep-2.5.1a.orig/src/grep.c 2004-11-12 16:25:35.000000000 +0500
 158 +++ grep-2.5.1a/src/grep.c      2005-10-23 09:50:06.000000000 +0600
 159 @@ -30,6 +30,12 @@
 160  # include <sys/time.h>
 161  # include <sys/resource.h>
 162  #endif
 163 +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
 164 +/* We can handle multibyte string.  */
 165 +# define MBS_SUPPORT
 166 +# include <wchar.h>
 167 +# include <wctype.h>
 168 +#endif
 169  #include <stdio.h>
 170  #include "system.h"
 171  #include "getopt.h"
 172 @@ -558,33 +564,6 @@
 173      {
 174        size_t match_size;
 175        size_t match_offset;
 176 -      if(match_icase)
 177 -        {
 178 -         /* Yuck, this is tricky */
 179 -          char *buf = (char*) xmalloc (lim - beg);
 180 -         char *ibeg = buf;
 181 -         char *ilim = ibeg + (lim - beg);
 182 -         int i;
 183 -         for (i = 0; i < lim - beg; i++)
 184 -           ibeg[i] = tolower (beg[i]);
 185 -         while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1))
 186 -                != (size_t) -1)
 187 -           {
 188 -             char const *b = beg + match_offset;
 189 -             if (b == lim)
 190 -               break;
 191 -             fwrite (beg, sizeof (char), match_offset, stdout);
 192 -             printf ("\33[%sm", grep_color);
 193 -             fwrite (b, sizeof (char), match_size, stdout);
 194 -             fputs ("\33[00m", stdout);
 195 -             beg = b + match_size;
 196 -             ibeg = ibeg + match_offset + match_size;
 197 -           }
 198 -         fwrite (beg, 1, lim - beg, stdout);
 199 -         free (buf);
 200 -         lastout = lim;
 201 -         return;
 202 -       }
 203        while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1))
 204              != (size_t) -1)
 205         {
 206 @@ -601,6 +580,7 @@
 207           fputs ("\33[00m", stdout);
 208           beg = b + match_size;
 209         }
 210 +      fputs ("\33[K", stdout);
 211      }
 212    fwrite (beg, 1, lim - beg, stdout);
 213    if (ferror (stdout))
 214 @@ -1697,6 +1677,37 @@
 215    if (!install_matcher (matcher) && !install_matcher ("default"))
 216      abort ();
 217
 218 +#ifdef MBS_SUPPORT
 219 +  if (MB_CUR_MAX != 1 && match_icase)
 220 +    {
 221 +      wchar_t wc;
 222 +      mbstate_t cur_state, prev_state;
 223 +      int i, len = strlen(keys);
 224 +
 225 +      memset(&cur_state, 0, sizeof(mbstate_t));
 226 +      for (i = 0; i <= len ;)
 227 +       {
 228 +         size_t mbclen;
 229 +         mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state);
 230 +         if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
 231 +           {
 232 +             /* An invalid sequence, or a truncated multibyte character.
 233 +                We treat it as a singlebyte character.  */
 234 +             mbclen = 1;
 235 +           }
 236 +         else
 237 +           {
 238 +             if (iswupper((wint_t)wc))
 239 +               {
 240 +                 wc = towlower((wint_t)wc);
 241 +                 wcrtomb(keys + i, wc, &cur_state);
 242 +               }
 243 +           }
 244 +         i += mbclen;
 245 +       }
 246 +    }
 247 +#endif /* MBS_SUPPORT */
 248 +
 249    (*compile)(keys, keycc);
 250
 251    if ((argc - optind > 1 && !no_filenames) || with_filenames)
 252 diff -urN grep-2.5.1a.orig/src/search.c grep-2.5.1a/src/search.c
 253 --- grep-2.5.1a.orig/src/search.c       2001-04-19 09:42:14.000000000 +0600
 254 +++ grep-2.5.1a/src/search.c    2005-10-23 09:51:25.000000000 +0600
 255 @@ -18,9 +18,13 @@
 256
 257  /* Written August 1992 by Mike Haertel. */
 258
 259 +#ifndef _GNU_SOURCE
 260 +# define _GNU_SOURCE 1
 261 +#endif
 262  #ifdef HAVE_CONFIG_H
 263  # include <config.h>
 264  #endif
 265 +#include <assert.h>
 266  #include <sys/types.h>
 267  #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
 268  /* We can handle multibyte string.  */
 269 @@ -31,7 +35,7 @@
 270
 271  #include "system.h"
 272  #include "grep.h"
 273 -#include "regex.h"
 274 +#include <regex.h>
 275  #include "dfa.h"
 276  #include "kwset.h"
 277  #include "error.h"
 278 @@ -39,6 +43,9 @@
 279  #ifdef HAVE_LIBPCRE
 280  # include <pcre.h>
 281  #endif
 282 +#ifdef HAVE_LANGINFO_CODESET
 283 +# include <langinfo.h>
 284 +#endif
 285
 286  #define NCHAR (UCHAR_MAX + 1)
 287
 288 @@ -70,9 +77,10 @@
 289     call the regexp matcher at all. */
 290  static int kwset_exact_matches;
 291
 292 -#if defined(MBS_SUPPORT)
 293 -static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
 294 -#endif
 295 +/* UTF-8 encoding allows some optimizations that we can't otherwise
 296 +   assume in a multibyte encoding. */
 297 +static int using_utf8;
 298 +
 299  static void kwsinit PARAMS ((void));
 300  static void kwsmusts PARAMS ((void));
 301  static void Gcompile PARAMS ((char const *, size_t));
 302 @@ -84,6 +92,15 @@
 303  static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
 304
 305  void
 306 +check_utf8 (void)
 307 +{
 308 +#ifdef HAVE_LANGINFO_CODESET
 309 +  if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
 310 +    using_utf8 = 1;
 311 +#endif
 312 +}
 313 +
 314 +void
 315  dfaerror (char const *mesg)
 316  {
 317    error (2, 0, mesg);
 318 @@ -141,38 +158,6 @@
 319      }
 320  }
 321
 322 -#ifdef MBS_SUPPORT
 323 -/* This function allocate the array which correspond to "buf".
 324 -   Then this check multibyte string and mark on the positions which
 325 -   are not singlebyte character nor the first byte of a multibyte
 326 -   character.  Caller must free the array.  */
 327 -static char*
 328 -check_multibyte_string(char const *buf, size_t size)
 329 -{
 330 -  char *mb_properties = malloc(size);
 331 -  mbstate_t cur_state;
 332 -  int i;
 333 -  memset(&cur_state, 0, sizeof(mbstate_t));
 334 -  memset(mb_properties, 0, sizeof(char)*size);
 335 -  for (i = 0; i < size ;)
 336 -    {
 337 -      size_t mbclen;
 338 -      mbclen = mbrlen(buf + i, size - i, &cur_state);
 339 -
 340 -      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
 341 -       {
 342 -         /* An invalid sequence, or a truncated multibyte character.
 343 -            We treat it as a singlebyte character.  */
 344 -         mbclen = 1;
 345 -       }
 346 -      mb_properties[i] = mbclen;
 347 -      i += mbclen;
 348 -    }
 349 -
 350 -  return mb_properties;
 351 -}
 352 -#endif
 353 -
 354  static void
 355  Gcompile (char const *pattern, size_t size)
 356  {
 357 @@ -181,7 +166,8 @@
 358    size_t total = size;
 359    char const *motif = pattern;
 360
 361 -  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
 362 +  check_utf8 ();
 363 +  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
 364    dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
 365
 366    /* For GNU regex compiler we have to pass the patterns separately to detect
 367 @@ -233,7 +219,7 @@
 368        static char const line_end[] = "\\)$";
 369        static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
 370        static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
 371 -      char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
 372 +      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
 373        size_t i;
 374        strcpy (n, match_lines ? line_beg : word_beg);
 375        i = strlen (n);
 376 @@ -257,14 +243,15 @@
 377    size_t total = size;
 378    char const *motif = pattern;
 379
 380 +  check_utf8 ();
 381    if (strcmp (matcher, "awk") == 0)
 382      {
 383 -      re_set_syntax (RE_SYNTAX_AWK);
 384 +      re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
 385        dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
 386      }
 387    else
 388      {
 389 -      re_set_syntax (RE_SYNTAX_POSIX_EGREP);
 390 +      re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0));
 391        dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
 392      }
 393
 394 @@ -316,7 +303,7 @@
 395        static char const line_end[] = ")$";
 396        static char const word_beg[] = "(^|[^[:alnum:]_])(";
 397        static char const word_end[] = ")([^[:alnum:]_]|$)";
 398 -      char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
 399 +      char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
 400        size_t i;
 401        strcpy (n, match_lines ? line_beg : word_beg);
 402        i = strlen(n);
 403 @@ -339,15 +326,35 @@
 404    char eol = eolbyte;
 405    int backref, start, len;
 406    struct kwsmatch kwsm;
 407 -  size_t i;
 408 +  size_t i, ret_val;
 409 +  static int use_dfa;
 410 +  static int use_dfa_checked = 0;
 411  #ifdef MBS_SUPPORT
 412 -  char *mb_properties = NULL;
 413 +  const char *last_char = NULL;
 414 +  int mb_cur_max = MB_CUR_MAX;
 415 +  mbstate_t mbs;
 416 +  memset (&mbs, '\0', sizeof (mbstate_t));
 417  #endif /* MBS_SUPPORT */
 418
 419 +  if (!use_dfa_checked)
 420 +    {
 421 +      char *grep_use_dfa = getenv ("GREP_USE_DFA");
 422 +      if (!grep_use_dfa)
 423 +       {
 424  #ifdef MBS_SUPPORT
 425 -  if (MB_CUR_MAX > 1 && kwset)
 426 -    mb_properties = check_multibyte_string(buf, size);
 427 +         /* Turn off DFA when processing multibyte input. */
 428 +         use_dfa = (MB_CUR_MAX == 1);
 429 +#else
 430 +         use_dfa = 1;
 431  #endif /* MBS_SUPPORT */
 432 +       }
 433 +      else
 434 +       {
 435 +         use_dfa = atoi (grep_use_dfa);
 436 +       }
 437 +
 438 +      use_dfa_checked = 1;
 439 +    }
 440
 441    buflim = buf + size;
 442
 443 @@ -358,47 +365,124 @@
 444           if (kwset)
 445             {
 446               /* Find a possible match using the KWset matcher. */
 447 -             size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
 448 +#ifdef MBS_SUPPORT
 449 +             size_t bytes_left = 0;
 450 +#endif /* MBS_SUPPORT */
 451 +             size_t offset;
 452 +#ifdef MBS_SUPPORT
 453 +             /* kwsexec doesn't work with match_icase and multibyte input. */
 454 +             if (match_icase && mb_cur_max > 1)
 455 +               /* Avoid kwset */
 456 +               offset = 0;
 457 +             else
 458 +#endif /* MBS_SUPPORT */
 459 +             offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
 460               if (offset == (size_t) -1)
 461 -               {
 462 +               goto failure;
 463  #ifdef MBS_SUPPORT
 464 -                 if (MB_CUR_MAX > 1)
 465 -                   free(mb_properties);
 466 -#endif
 467 -                 return (size_t)-1;
 468 +             if (mb_cur_max > 1 && !using_utf8)
 469 +               {
 470 +                 bytes_left = offset;
 471 +                 while (bytes_left)
 472 +                   {
 473 +                     size_t mlen = mbrlen (beg, bytes_left, &mbs);
 474 +
 475 +                     last_char = beg;
 476 +                     if (mlen == (size_t) -1 || mlen == 0)
 477 +                       {
 478 +                         /* Incomplete character: treat as single-byte. */
 479 +                         memset (&mbs, '\0', sizeof (mbstate_t));
 480 +                         beg++;
 481 +                         bytes_left--;
 482 +                         continue;
 483 +                       }
 484 +
 485 +                     if (mlen == (size_t) -2)
 486 +                       /* Offset points inside multibyte character:
 487 +                        * no good. */
 488 +                       break;
 489 +
 490 +                     beg += mlen;
 491 +                     bytes_left -= mlen;
 492 +                   }
 493                 }
 494 +             else
 495 +#endif /* MBS_SUPPORT */
 496               beg += offset;
 497               /* Narrow down to the line containing the candidate, and
 498                  run it through DFA. */
 499               end = memchr(beg, eol, buflim - beg);
 500               end++;
 501  #ifdef MBS_SUPPORT
 502 -             if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
 503 +             if (mb_cur_max > 1 && bytes_left)
 504                 continue;
 505 -#endif
 506 +#endif /* MBS_SUPPORT */
 507               while (beg > buf && beg[-1] != eol)
 508                 --beg;
 509 -             if (kwsm.index < kwset_exact_matches)
 510 -               goto success;
 511 -             if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
 512 +             if (
 513 +#ifdef MBS_SUPPORT
 514 +                 !(match_icase && mb_cur_max > 1) &&
 515 +#endif /* MBS_SUPPORT */
 516 +                 (kwsm.index < kwset_exact_matches))
 517 +               goto success_in_beg_and_end;
 518 +             if (use_dfa &&
 519 +                 dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
 520                 continue;
 521             }
 522           else
 523             {
 524               /* No good fixed strings; start with DFA. */
 525 -             size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
 526 +#ifdef MBS_SUPPORT
 527 +             size_t bytes_left = 0;
 528 +#endif /* MBS_SUPPORT */
 529 +             size_t offset = 0;
 530 +             if (use_dfa)
 531 +               offset = dfaexec (&dfa, beg, buflim - beg, &backref);
 532               if (offset == (size_t) -1)
 533                 break;
 534               /* Narrow down to the line we've found. */
 535 +#ifdef MBS_SUPPORT
 536 +             if (mb_cur_max > 1 && !using_utf8)
 537 +               {
 538 +                 bytes_left = offset;
 539 +                 while (bytes_left)
 540 +                   {
 541 +                     size_t mlen = mbrlen (beg, bytes_left, &mbs);
 542 +
 543 +                     last_char = beg;
 544 +                     if (mlen == (size_t) -1 || mlen == 0)
 545 +                       {
 546 +                         /* Incomplete character: treat as single-byte. */
 547 +                         memset (&mbs, '\0', sizeof (mbstate_t));
 548 +                         beg++;
 549 +                         bytes_left--;
 550 +                         continue;
 551 +                       }
 552 +
 553 +                     if (mlen == (size_t) -2)
 554 +                       /* Offset points inside multibyte character:
 555 +                        * no good. */
 556 +                       break;
 557 +
 558 +                     beg += mlen;
 559 +                     bytes_left -= mlen;
 560 +                   }
 561 +               }
 562 +             else
 563 +#endif /* MBS_SUPPORT */
 564               beg += offset;
 565               end = memchr (beg, eol, buflim - beg);
 566               end++;
 567 +#ifdef MBS_SUPPORT
 568 +             if (mb_cur_max > 1 && bytes_left)
 569 +               continue;
 570 +#endif /* MBS_SUPPORT */
 571               while (beg > buf && beg[-1] != eol)
 572                 --beg;
 573             }
 574           /* Successful, no backreferences encountered! */
 575 -         if (!backref)
 576 -           goto success;
 577 +         if (use_dfa && !backref)
 578 +           goto success_in_beg_and_end;
 579         }
 580        else
 581         end = beg + size;
 582 @@ -413,14 +497,11 @@
 583                                        end - beg - 1, &(patterns[i].regs))))
 584             {
 585               len = patterns[i].regs.end[0] - start;
 586 -             if (exact)
 587 -               {
 588 -                 *match_size = len;
 589 -                 return start;
 590 -               }
 591 +             if (exact && !match_words)
 592 +               goto success_in_start_and_len;
 593               if ((!match_lines && !match_words)
 594                   || (match_lines && len == end - beg - 1))
 595 -               goto success;
 596 +               goto success_in_beg_and_end;
 597               /* If -w, check if the match aligns with word boundaries.
 598                  We do this iteratively because:
 599                  (a) the line may contain more than one occurence of the
 600 @@ -431,10 +512,84 @@
 601               if (match_words)
 602                 while (start >= 0)
 603                   {
 604 -                   if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
 605 -                       && (len == end - beg - 1
 606 -                           || !WCHAR ((unsigned char) beg[start + len])))
 607 -                     goto success;
 608 +                   int lword_match = 0;
 609 +                   if (start == 0)
 610 +                     lword_match = 1;
 611 +                   else
 612 +                     {
 613 +                       assert (start > 0);
 614 +#ifdef MBS_SUPPORT
 615 +                       if (mb_cur_max > 1)
 616 +                         {
 617 +                           const char *s;
 618 +                           int mr;
 619 +                           wchar_t pwc;
 620 +
 621 +                           if (using_utf8)
 622 +                             {
 623 +                               s = beg + start - 1;
 624 +                               while (s > buf
 625 +                                      && (unsigned char) *s >= 0x80
 626 +                                      && (unsigned char) *s <= 0xbf)
 627 +                                 --s;
 628 +                             }
 629 +                           else
 630 +                             s = last_char;
 631 +                           mr = mbtowc (&pwc, s, beg + start - s);
 632 +                           if (mr <= 0)
 633 +                             {
 634 +                               memset (&mbs, '\0', sizeof (mbstate_t));
 635 +                               lword_match = 1;
 636 +                             }
 637 +                           else if (!(iswalnum (pwc) || pwc == L'_')
 638 +                                    && mr == (int) (beg + start - s))
 639 +                             lword_match = 1;
 640 +                         }
 641 +                       else
 642 +#endif /* MBS_SUPPORT */
 643 +                       if (!WCHAR ((unsigned char) beg[start - 1]))
 644 +                         lword_match = 1;
 645 +                     }
 646 +
 647 +                   if (lword_match)
 648 +                     {
 649 +                       int rword_match = 0;
 650 +                       if (start + len == end - beg - 1)
 651 +                         rword_match = 1;
 652 +                       else
 653 +                         {
 654 +#ifdef MBS_SUPPORT
 655 +                           if (mb_cur_max > 1)
 656 +                             {
 657 +                               wchar_t nwc;
 658 +                               int mr;
 659 +
 660 +                               mr = mbtowc (&nwc, beg + start + len,
 661 +                                            end - beg - start - len - 1);
 662 +                               if (mr <= 0)
 663 +                                 {
 664 +                                   memset (&mbs, '\0', sizeof (mbstate_t));
 665 +                                   rword_match = 1;
 666 +                                 }
 667 +                               else if (!iswalnum (nwc) && nwc != L'_')
 668 +                                 rword_match = 1;
 669 +                             }
 670 +                           else
 671 +#endif /* MBS_SUPPORT */
 672 +                           if (!WCHAR ((unsigned char) beg[start + len]))
 673 +                             rword_match = 1;
 674 +                         }
 675 +
 676 +                       if (rword_match)
 677 +                         {
 678 +                           if (!exact)
 679 +                             /* Returns the whole line. */
 680 +                             goto success_in_beg_and_end;
 681 +                           else
 682 +                             /* Returns just this word match. */
 683 +                             goto success_in_start_and_len;
 684 +                         }
 685 +                     }
 686                     if (len > 0)
 687                       {
 688                         /* Try a shorter length anchored at the same place. */
 689 @@ -461,26 +616,154 @@
 690             }
 691         } /* for Regex patterns.  */
 692      } /* for (beg = end ..) */
 693 -#ifdef MBS_SUPPORT
 694 -  if (MB_CUR_MAX > 1 && mb_properties)
 695 -    free (mb_properties);
 696 -#endif /* MBS_SUPPORT */
 697 +
 698 + failure:
 699    return (size_t) -1;
 700
 701 - success:
 702 -#ifdef MBS_SUPPORT
 703 -  if (MB_CUR_MAX > 1 && mb_properties)
 704 -    free (mb_properties);
 705 -#endif /* MBS_SUPPORT */
 706 -  *match_size = end - beg;
 707 -  return beg - buf;
 708 + success_in_beg_and_end:
 709 +  len = end - beg;
 710 +  start = beg - buf;
 711 +  /* FALLTHROUGH */
 712 +
 713 + success_in_start_and_len:
 714 +  *match_size = len;
 715 +  return start;
 716  }
 717
 718 +#ifdef MBS_SUPPORT
 719 +static int f_i_multibyte; /* whether we're using the new -Fi MB method */
 720 +static struct
 721 +{
 722 +  wchar_t **patterns;
 723 +  size_t count, maxlen;
 724 +  unsigned char *match;
 725 +} Fimb;
 726 +#endif
 727 +
 728  static void
 729  Fcompile (char const *pattern, size_t size)
 730  {
 731 +  int mb_cur_max = MB_CUR_MAX;
 732    char const *beg, *lim, *err;
 733
 734 +  check_utf8 ();
 735 +#ifdef MBS_SUPPORT
 736 +  /* Support -F -i for UTF-8 input. */
 737 +  if (match_icase && mb_cur_max > 1)
 738 +    {
 739 +      mbstate_t mbs;
 740 +      wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
 741 +      const char *patternend = pattern;
 742 +      size_t wcsize;
 743 +      kwset_t fimb_kwset = NULL;
 744 +      char *starts = NULL;
 745 +      wchar_t *wcbeg, *wclim;
 746 +      size_t allocated = 0;
 747 +
 748 +      memset (&mbs, '\0', sizeof (mbs));
 749 +# ifdef __GNU_LIBRARY__
 750 +      wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
 751 +      if (patternend != pattern + size)
 752 +       wcsize = (size_t) -1;
 753 +# else
 754 +      {
 755 +       char *patterncopy = xmalloc (size + 1);
 756 +
 757 +       memcpy (patterncopy, pattern, size);
 758 +       patterncopy[size] = '\0';
 759 +       patternend = patterncopy;
 760 +       wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
 761 +       if (patternend != patterncopy + size)
 762 +         wcsize = (size_t) -1;
 763 +       free (patterncopy);
 764 +      }
 765 +# endif
 766 +      if (wcsize + 2 <= 2)
 767 +       {
 768 +fimb_fail:
 769 +         free (wcpattern);
 770 +         free (starts);
 771 +         if (fimb_kwset)
 772 +           kwsfree (fimb_kwset);
 773 +         free (Fimb.patterns);
 774 +         Fimb.patterns = NULL;
 775 +       }
 776 +      else
 777 +       {
 778 +         if (!(fimb_kwset = kwsalloc (NULL)))
 779 +           error (2, 0, _("memory exhausted"));
 780 +
 781 +         starts = xmalloc (mb_cur_max * 3);
 782 +         wcbeg = wcpattern;
 783 +         do
 784 +           {
 785 +             int i;
 786 +             size_t wclen;
 787 +
 788 +             if (Fimb.count >= allocated)
 789 +               {
 790 +                 if (allocated == 0)
 791 +                   allocated = 128;
 792 +                 else
 793 +                   allocated *= 2;
 794 +                 Fimb.patterns = xrealloc (Fimb.patterns,
 795 +                                           sizeof (wchar_t *) * allocated);
 796 +               }
 797 +             Fimb.patterns[Fimb.count++] = wcbeg;
 798 +             for (wclim = wcbeg;
 799 +                  wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
 800 +               *wclim = towlower (*wclim);
 801 +             *wclim = L'\0';
 802 +             wclen = wclim - wcbeg;
 803 +             if (wclen > Fimb.maxlen)
 804 +               Fimb.maxlen = wclen;
 805 +             if (wclen > 3)
 806 +               wclen = 3;
 807 +             if (wclen == 0)
 808 +               {
 809 +                 if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
 810 +                   error (2, 0, err);
 811 +               }
 812 +             else
 813 +               for (i = 0; i < (1 << wclen); i++)
 814 +                 {
 815 +                   char *p = starts;
 816 +                   int j, k;
 817 +
 818 +                   for (j = 0; j < wclen; ++j)
 819 +                     {
 820 +                       wchar_t wc = wcbeg[j];
 821 +                       if (i & (1 << j))
 822 +                         {
 823 +                           wc = towupper (wc);
 824 +                           if (wc == wcbeg[j])
 825 +                             continue;
 826 +                         }
 827 +                       k = wctomb (p, wc);
 828 +                       if (k <= 0)
 829 +                         goto fimb_fail;
 830 +                       p += k;
 831 +                     }
 832 +                   if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
 833 +                     error (2, 0, err);
 834 +                 }
 835 +             if (wclim < wcpattern + wcsize)
 836 +               ++wclim;
 837 +             wcbeg = wclim;
 838 +           }
 839 +         while (wcbeg < wcpattern + wcsize);
 840 +         f_i_multibyte = 1;
 841 +         kwset = fimb_kwset;
 842 +         free (starts);
 843 +         Fimb.match = xmalloc (Fimb.count);
 844 +         if ((err = kwsprep (kwset)) != 0)
 845 +           error (2, 0, err);
 846 +         return;
 847 +       }
 848 +    }
 849 +#endif /* MBS_SUPPORT */
 850 +
 851 +
 852    kwsinit ();
 853    beg = pattern;
 854    do
 855 @@ -499,6 +782,76 @@
 856      error (2, 0, err);
 857  }
 858
 859 +#ifdef MBS_SUPPORT
  860 +static int
  861 +Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
  862 +{
  863 +  size_t len, letter, i;       /* bytes consumed, chars consumed, pattern index */
  864 +  int ret = -1;                /* -1 until some pattern has matched, then 0 */
  865 +  mbstate_t mbs;
  866 +  wchar_t wc;
  867 +  int patterns_left;           /* nonzero while some pattern may still match */
  868 +  /* Compare BUF, lowercased char by char, with all Fimb.patterns in parallel.  */
  869 +  assert (match_icase && f_i_multibyte == 1);
  870 +  assert (MB_CUR_MAX > 1);
  871 +
  872 +  memset (&mbs, '\0', sizeof (mbs));
  873 +  memset (Fimb.match, '\1', Fimb.count);       /* every pattern starts as a candidate */
  874 +  letter = len = 0;
  875 +  patterns_left = 1;
  876 +  while (patterns_left && len <= size)
  877 +    {
  878 +      size_t c;                /* byte length of the character just decoded */
  879 +
  880 +      patterns_left = 0;
  881 +      if (len < size)
  882 +       {
  883 +         c = mbrtowc (&wc, buf + len, size - len, &mbs);
  884 +         if (c + 2 <= 2)       /* c is 0, (size_t) -1 or (size_t) -2: stop here */
  885 +           return ret;
  886 +
  887 +         wc = towlower (wc);
  888 +       }
  889 +      else
  890 +       {
  891 +         c = 1;
  892 +         wc = L'\0';           /* past the end: only a pattern terminator can match */
  893 +       }
  894 +
  895 +      for (i = 0; i < Fimb.count; i++)
  896 +       {
  897 +         if (Fimb.match[i])
  898 +           {
  899 +             if (Fimb.patterns[i][letter] == L'\0')
  900 +               {
  901 +                 /* Found a match. */
  902 +                 *plen = len;  /* bytes of BUF covered by this pattern */
  903 +                 if (!exact && !match_words)
  904 +                   return 0;
  905 +                 else
  906 +                   {
  907 +                     /* For -w or exact look for longest match.  */
  908 +                     ret = 0;
  909 +                     Fimb.match[i] = '\0';
  910 +                     continue;
  911 +                   }
  912 +               }
  913 +
  914 +             if (Fimb.patterns[i][letter] == wc)
  915 +               patterns_left = 1;
  916 +             else
  917 +               Fimb.match[i] = '\0';   /* mismatch: drop this candidate */
  918 +           }
  919 +       }
  920 +
  921 +      len += c;
  922 +      letter++;
  923 +    }
  924 +
  925 +  return ret;                  /* 0 with *plen set if a pattern matched, else -1 */
  926 +}
 927 +#endif /* MBS_SUPPORT */
 928 +
 929  static size_t
 930  Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
 931  {
 932 @@ -506,88 +859,268 @@
 933    register size_t len;
 934    char eol = eolbyte;
 935    struct kwsmatch kwsmatch;
 936 +  size_t ret_val;
 937  #ifdef MBS_SUPPORT
 938 -  char *mb_properties;
 939 -  if (MB_CUR_MAX > 1)
 940 -    mb_properties = check_multibyte_string (buf, size);
 941 +  int mb_cur_max = MB_CUR_MAX;
 942 +  mbstate_t mbs;
 943 +  memset (&mbs, '\0', sizeof (mbstate_t));
 944 +  const char *last_char = NULL;
 945  #endif /* MBS_SUPPORT */
 946
 947    for (beg = buf; beg <= buf + size; ++beg)
 948      {
 949 -      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
 950 +      size_t offset;
 951 +      offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
 952 +
 953        if (offset == (size_t) -1)
 954 -       {
 955 +       goto failure;
 956  #ifdef MBS_SUPPORT
 957 -         if (MB_CUR_MAX > 1)
 958 -           free(mb_properties);
 959 -#endif /* MBS_SUPPORT */
 960 -         return offset;
 961 +      if (mb_cur_max > 1 && !using_utf8)
 962 +       {
 963 +         size_t bytes_left = offset;
 964 +         while (bytes_left)
 965 +           {
 966 +             size_t mlen = mbrlen (beg, bytes_left, &mbs);
 967 +
 968 +             last_char = beg;
 969 +             if (mlen == (size_t) -1 || mlen == 0)
 970 +               {
 971 +                 /* Incomplete character: treat as single-byte. */
 972 +                 memset (&mbs, '\0', sizeof (mbstate_t));
 973 +                 beg++;
 974 +                 bytes_left--;
 975 +                 continue;
 976 +               }
 977 +
 978 +             if (mlen == (size_t) -2)
 979 +               /* Offset points inside multibyte character: no good. */
 980 +               break;
 981 +
 982 +             beg += mlen;
 983 +             bytes_left -= mlen;
 984 +           }
 985 +
 986 +         if (bytes_left)
 987 +           continue;
 988         }
 989 -#ifdef MBS_SUPPORT
 990 -      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
 991 -       continue; /* It is a part of multibyte character.  */
 992 +      else
 993  #endif /* MBS_SUPPORT */
 994        beg += offset;
 995 -      len = kwsmatch.size[0];
 996 -      if (exact)
 997 -       {
 998 -         *match_size = len;
 999  #ifdef MBS_SUPPORT
1000 -         if (MB_CUR_MAX > 1)
1001 -           free (mb_properties);
1002 +      /* For f_i_multibyte, the string at beg now matches first 3 chars of
1003 +        one of the search strings (less if there are shorter search strings).
1004 +        See if this is a real match.  */
1005 +      if (f_i_multibyte
1006 +         && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact))
1007 +       goto next_char;
1008  #endif /* MBS_SUPPORT */
1009 -         return beg - buf;
1010 -       }
1011 +      len = kwsmatch.size[0];
1012 +      if (exact && !match_words)
1013 +       goto success_in_beg_and_len;
1014        if (match_lines)
1015         {
1016           if (beg > buf && beg[-1] != eol)
1017 -           continue;
1018 +           goto next_char;
1019           if (beg + len < buf + size && beg[len] != eol)
1020 -           continue;
1021 +           goto next_char;
1022           goto success;
1023         }
1024        else if (match_words)
1025 -       for (try = beg; len; )
1026 -         {
1027 -           if (try > buf && WCHAR((unsigned char) try[-1]))
1028 -             break;
1029 -           if (try + len < buf + size && WCHAR((unsigned char) try[len]))
1030 -             {
1031 -               offset = kwsexec (kwset, beg, --len, &kwsmatch);
1032 -               if (offset == (size_t) -1)
1033 -                 {
1034 +       {
1035 +         while (len)
1036 +           {
1037 +             int word_match = 0;
1038 +             if (beg > buf)
1039 +               {
1040  #ifdef MBS_SUPPORT
1041 -                   if (MB_CUR_MAX > 1)
1042 -                     free (mb_properties);
1043 +                 if (mb_cur_max > 1)
1044 +                   {
1045 +                     const char *s;
1046 +                     int mr;
1047 +                     wchar_t pwc;
1048 +
1049 +                     if (using_utf8)
1050 +                       {
1051 +                         s = beg - 1;
1052 +                         while (s > buf
1053 +                                && (unsigned char) *s >= 0x80
1054 +                                && (unsigned char) *s <= 0xbf)
1055 +                           --s;
1056 +                       }
1057 +                     else
1058 +                       s = last_char;
1059 +                     mr = mbtowc (&pwc, s, beg - s);
1060 +                     if (mr <= 0)
1061 +                       memset (&mbs, '\0', sizeof (mbstate_t));
1062 +                     else if ((iswalnum (pwc) || pwc == L'_')
1063 +                              && mr == (int) (beg - s))
1064 +                       goto next_char;
1065 +                   }
1066 +                 else
1067  #endif /* MBS_SUPPORT */
1068 -                   return offset;
1069 -                 }
1070 -               try = beg + offset;
1071 -               len = kwsmatch.size[0];
1072 -             }
1073 -           else
1074 -             goto success;
1075 -         }
1076 +                 if (WCHAR ((unsigned char) beg[-1]))
1077 +                   goto next_char;
1078 +               }
1079 +#ifdef MBS_SUPPORT
1080 +             if (mb_cur_max > 1)
1081 +               {
1082 +                 wchar_t nwc;
1083 +                 int mr;
1084 +
1085 +                 mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
1086 +                 if (mr <= 0)
1087 +                   {
1088 +                     memset (&mbs, '\0', sizeof (mbstate_t));
1089 +                     word_match = 1;
1090 +                   }
1091 +                 else if (!iswalnum (nwc) && nwc != L'_')
1092 +                   word_match = 1;
1093 +               }
1094 +             else
1095 +#endif /* MBS_SUPPORT */
1096 +               if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
1097 +                 word_match = 1;
1098 +             if (word_match)
1099 +               {
1100 +                 if (!exact)
1101 +                   /* Returns the whole line now we know there's a word match. */
1102 +                   goto success;
1103 +                 else
1104 +                   /* Returns just this word match. */
1105 +                   goto success_in_beg_and_len;
1106 +               }
1107 +             if (len > 0)
1108 +               {
1109 +                 /* Try a shorter length anchored at the same place. */
1110 +                 --len;
1111 +                 offset = kwsexec (kwset, beg, len, &kwsmatch);
1112 +
1113 +                 if (offset == -1)
1114 +                   goto next_char; /* Try a different anchor. */
1115 +#ifdef MBS_SUPPORT
1116 +                 if (mb_cur_max > 1 && !using_utf8)
1117 +                   {
1118 +                     size_t bytes_left = offset;
1119 +                     while (bytes_left)
1120 +                       {
1121 +                         size_t mlen = mbrlen (beg, bytes_left, &mbs);
1122 +
1123 +                         last_char = beg;
1124 +                         if (mlen == (size_t) -1 || mlen == 0)
1125 +                           {
1126 +                             /* Incomplete character: treat as single-byte. */
1127 +                             memset (&mbs, '\0', sizeof (mbstate_t));
1128 +                             beg++;
1129 +                             bytes_left--;
1130 +                             continue;
1131 +                           }
1132 +
1133 +                         if (mlen == (size_t) -2)
1134 +                           {
1135 +                             /* Offset points inside multibyte character:
1136 +                              * no good. */
1137 +                             break;
1138 +                           }
1139 +
1140 +                         beg += mlen;
1141 +                         bytes_left -= mlen;
1142 +                       }
1143 +
1144 +                     if (bytes_left)
1145 +                       {
1146 +                         memset (&mbs, '\0', sizeof (mbstate_t));
1147 +                         goto next_char; /* Try a different anchor. */
1148 +                       }
1149 +                   }
1150 +                 else
1151 +#endif /* MBS_SUPPORT */
1152 +                 beg += offset;
1153 +#ifdef MBS_SUPPORT
1154 +                 /* The string at beg now matches first 3 chars of one of
1155 +                    the search strings (less if there are shorter search
1156 +                    strings).  See if this is a real match.  */
1157 +                 if (f_i_multibyte
1158 +                     && Fimbexec (beg, len - offset, &kwsmatch.size[0],
1159 +                                  exact))
1160 +                   goto next_char;
1161 +#endif /* MBS_SUPPORT */
1162 +                 len = kwsmatch.size[0];
1163 +               }
1164 +           }
1165 +       }
1166        else
1167         goto success;
1168 -    }
1169 -
1170 +next_char:;
1171  #ifdef MBS_SUPPORT
1172 -  if (MB_CUR_MAX > 1)
1173 -    free (mb_properties);
1174 +      /* Advance to next character.  For MB_CUR_MAX == 1 case this is handled
1175 +        by ++beg above.  */
1176 +      if (mb_cur_max > 1)
1177 +       {
1178 +         if (using_utf8)
1179 +           {
1180 +             unsigned char c = *beg;
1181 +             if (c >= 0xc2)
1182 +               {
1183 +                 if (c < 0xe0)
1184 +                   ++beg;
1185 +                 else if (c < 0xf0)
1186 +                   beg += 2;
1187 +                 else if (c < 0xf8)
1188 +                   beg += 3;
1189 +                 else if (c < 0xfc)
1190 +                   beg += 4;
1191 +                 else if (c < 0xfe)
1192 +                   beg += 5;
1193 +               }
1194 +           }
1195 +         else
1196 +           {
1197 +             size_t l = mbrlen (beg, buf + size - beg, &mbs);
1198 +
1199 +             last_char = beg;
1200 +             if (l + 2 >= 2)
1201 +               beg += l - 1;
1202 +             else
1203 +               memset (&mbs, '\0', sizeof (mbstate_t));
1204 +           }
1205 +       }
1206  #endif /* MBS_SUPPORT */
1207 +    }
1208 +
1209 + failure:
1210    return -1;
1211
1212   success:
1213 +#ifdef MBS_SUPPORT
1214 +  if (mb_cur_max > 1 && !using_utf8)
1215 +    {
1216 +      end = beg + len;
1217 +      while (end < buf + size)
1218 +       {
1219 +         size_t mlen = mbrlen (end, buf + size - end, &mbs);
1220 +         if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
1221 +           {
1222 +             memset (&mbs, '\0', sizeof (mbstate_t));
1223 +             mlen = 1;
1224 +           }
1225 +         if (mlen == 1 && *end == eol)
1226 +           break;
1227 +
1228 +         end += mlen;
1229 +       }
1230 +    }
1231 +  else
1232 +#endif /* MBS_SUPPORT */
1233    end = memchr (beg + len, eol, (buf + size) - (beg + len));
1234 +
1235    end++;
1236    while (buf < beg && beg[-1] != eol)
1237      --beg;
1238 -  *match_size = end - beg;
1239 -#ifdef MBS_SUPPORT
1240 -  if (MB_CUR_MAX > 1)
1241 -    free (mb_properties);
1242 -#endif /* MBS_SUPPORT */
1243 +  len = end - beg;
1244 +  /* FALLTHROUGH */
1245 +
1246 + success_in_beg_and_len:
1247 +  *match_size = len;
1248    return beg - buf;
1249  }
1250
1251 diff -urN grep-2.5.1a.orig/src/search.c.orig grep-2.5.1a/src/search.c.orig
1252 --- grep-2.5.1a.orig/src/search.c.orig  1970-01-01 05:00:00.000000000 +0500
1253 +++ grep-2.5.1a/src/search.c.orig       2005-10-23 09:48:39.000000000 +0600
1254 @@ -0,0 +1,714 @@
1255 +/* search.c - searching subroutines using dfa, kwset and regex for grep.
1256 +   Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
1257 +
1258 +   This program is free software; you can redistribute it and/or modify
1259 +   it under the terms of the GNU General Public License as published by
1260 +   the Free Software Foundation; either version 2, or (at your option)
1261 +   any later version.
1262 +
1263 +   This program is distributed in the hope that it will be useful,
1264 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
1265 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1266 +   GNU General Public License for more details.
1267 +
1268 +   You should have received a copy of the GNU General Public License
1269 +   along with this program; if not, write to the Free Software
1270 +   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
1271 +   02111-1307, USA.  */
1272 +
1273 +/* Written August 1992 by Mike Haertel. */
1274 +
1275 +#ifdef HAVE_CONFIG_H
1276 +# include <config.h>
1277 +#endif
1278 +#include <sys/types.h>
1279 +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
1280 +/* We can handle multibyte string.  */
1281 +# define MBS_SUPPORT
1282 +# include <wchar.h>
1283 +# include <wctype.h>
1284 +#endif
1285 +
1286 +#include "system.h"
1287 +#include "grep.h"
1288 +#include "regex.h"
1289 +#include "dfa.h"
1290 +#include "kwset.h"
1291 +#include "error.h"
1292 +#include "xalloc.h"
1293 +#ifdef HAVE_LIBPCRE
1294 +# include <pcre.h>
1295 +#endif
1296 +
 1297 +#define NCHAR (UCHAR_MAX + 1) /* Number of distinct unsigned char values.  */
 1298 +
 1299 +/* For -w, we also consider _ to be word constituent.  */
 1300 +#define WCHAR(C) (ISALNUM(C) || (C) == '_')
 1301 +
 1302 +/* DFA compiled regexp. */
 1303 +static struct dfa dfa;
 1304 +
 1305 +/* One Regex compiled pattern; patterns0 seeds each new patterns[] slot.  */
 1306 +static struct patterns
 1307 +{
 1308 +  /* Regex compiled regexp. */
 1309 +  struct re_pattern_buffer regexbuf;
 1310 +  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
 1311 +                              Q@#%!# library interface in regex.c.  */
 1312 +} patterns0;
 1313 +
 1314 +struct patterns *patterns; /* Array of pcount compiled patterns.  */
 1315 +size_t pcount;             /* Number of entries in patterns.  */
 1316 +
 1317 +/* KWset compiled pattern.  For Ecompile and Gcompile, we compile
 1318 +   a list of strings, at least one of which is known to occur in
 1319 +   any string matching the regexp. */
 1320 +static kwset_t kwset;
 1321 +
 1322 +/* Number of compiled fixed strings known to exactly match the regexp.
 1323 +   If kwsexec reports an index < kwset_exact_matches, then we don't
 1324 +   need to call the regexp matcher at all. */
 1325 +static int kwset_exact_matches;
 1326 +
 1327 +#if defined(MBS_SUPPORT)
 1328 +static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
 1329 +#endif
 1330 +static void kwsinit PARAMS ((void));
 1331 +static void kwsmusts PARAMS ((void));
 1332 +static void Gcompile PARAMS ((char const *, size_t));
 1333 +static void Ecompile PARAMS ((char const *, size_t));
 1334 +static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
 1335 +static void Fcompile PARAMS ((char const *, size_t));
 1336 +static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
 1337 +static void Pcompile PARAMS ((char const *, size_t ));
 1338 +static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
1339 +
1340 +void
1341 +dfaerror (char const *mesg)
1342 +{
1343 +  error (2, 0, mesg);
1344 +}
 1345 +/* Allocate the global kwset, with a lowercasing table when match_icase.  */
 1346 +static void
 1347 +kwsinit (void)
 1348 +{
 1349 +  static char trans[NCHAR];    /* presumably retained by kwsalloc, hence static -- confirm */
 1350 +  int i;
 1351 +
 1352 +  if (match_icase)
 1353 +    for (i = 0; i < NCHAR; ++i)
 1354 +      trans[i] = TOLOWER (i);
 1355 +
 1356 +  if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
 1357 +    error (2, 0, _("memory exhausted"));
 1358 +}
1359 +
1360 +/* If the DFA turns out to have some set of fixed strings one of
1361 +   which must occur in the match, then we build a kwset matcher
1362 +   to find those strings, and thus quickly filter out impossible
1363 +   matches. */
1364 +static void
1365 +kwsmusts (void)
1366 +{
1367 +  struct dfamust const *dm;
1368 +  char const *err;
1369 +
1370 +  if (dfa.musts)
1371 +    {
1372 +      kwsinit ();
1373 +      /* First, we compile in the substrings known to be exact
1374 +        matches.  The kwset matcher will return the index
1375 +        of the matching string that it chooses. */
1376 +      for (dm = dfa.musts; dm; dm = dm->next)
1377 +       {
1378 +         if (!dm->exact)
1379 +           continue;
1380 +         ++kwset_exact_matches;
1381 +         if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
1382 +           error (2, 0, err);
1383 +       }
1384 +      /* Now, we compile the substrings that will require
1385 +        the use of the regexp matcher.  */
1386 +      for (dm = dfa.musts; dm; dm = dm->next)
1387 +       {
1388 +         if (dm->exact)
1389 +           continue;
1390 +         if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
1391 +           error (2, 0, err);
1392 +       }
1393 +      if ((err = kwsprep (kwset)) != 0)
1394 +       error (2, 0, err);
1395 +    }
1396 +}
1397 +
1398 +#ifdef MBS_SUPPORT
1399 +/* This function allocate the array which correspond to "buf".
1400 +   Then this check multibyte string and mark on the positions which
1401 +   are not singlebyte character nor the first byte of a multibyte
1402 +   character.  Caller must free the array.  */
1403 +static char*
1404 +check_multibyte_string(char const *buf, size_t size)
1405 +{
1406 +  char *mb_properties = malloc(size);
1407 +  mbstate_t cur_state;
1408 +  int i;
1409 +  memset(&cur_state, 0, sizeof(mbstate_t));
1410 +  memset(mb_properties, 0, sizeof(char)*size);
1411 +  for (i = 0; i < size ;)
1412 +    {
1413 +      size_t mbclen;
1414 +      mbclen = mbrlen(buf + i, size - i, &cur_state);
1415 +
1416 +      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
1417 +       {
1418 +         /* An invalid sequence, or a truncated multibyte character.
1419 +            We treat it as a singlebyte character.  */
1420 +         mbclen = 1;
1421 +       }
1422 +      mb_properties[i] = mbclen;
1423 +      i += mbclen;
1424 +    }
1425 +
1426 +  return mb_properties;
1427 +}
1428 +#endif
1429 +
1430 +static void
1431 +Gcompile (char const *pattern, size_t size)
1432 +{
1433 +  const char *err;
1434 +  char const *sep;
1435 +  size_t total = size;
1436 +  char const *motif = pattern;
1437 +
1438 +  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
1439 +  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
1440 +
1441 +  /* For GNU regex compiler we have to pass the patterns separately to detect
1442 +     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
1443 +     GNU regex should have raise a syntax error.  The same for backref, where
1444 +     the backref should have been local to each pattern.  */
1445 +  do
1446 +    {
1447 +      size_t len;
1448 +      sep = memchr (motif, '\n', total);
1449 +      if (sep)
1450 +       {
1451 +         len = sep - motif;
1452 +         sep++;
1453 +         total -= (len + 1);
1454 +       }
1455 +      else
1456 +       {
1457 +         len = total;
1458 +         total = 0;
1459 +       }
1460 +
1461 +      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
1462 +      if (patterns == NULL)
1463 +       error (2, errno, _("memory exhausted"));
1464 +
1465 +      patterns[pcount] = patterns0;
1466 +
1467 +      if ((err = re_compile_pattern (motif, len,
1468 +                                   &(patterns[pcount].regexbuf))) != 0)
1469 +       error (2, 0, err);
1470 +      pcount++;
1471 +
1472 +      motif = sep;
1473 +    } while (sep && total != 0);
1474 +
1475 +  /* In the match_words and match_lines cases, we use a different pattern
1476 +     for the DFA matcher that will quickly throw out cases that won't work.
1477 +     Then if DFA succeeds we do some hairy stuff using the regex matcher
1478 +     to decide whether the match should really count. */
1479 +  if (match_words || match_lines)
1480 +    {
1481 +      /* In the whole-word case, we use the pattern:
1482 +        \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
1483 +        In the whole-line case, we use the pattern:
1484 +        ^\(userpattern\)$.  */
1485 +
1486 +      static char const line_beg[] = "^\\(";
1487 +      static char const line_end[] = "\\)$";
1488 +      static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
1489 +      static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
1490 +      char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
1491 +      size_t i;
1492 +      strcpy (n, match_lines ? line_beg : word_beg);
1493 +      i = strlen (n);
1494 +      memcpy (n + i, pattern, size);
1495 +      i += size;
1496 +      strcpy (n + i, match_lines ? line_end : word_end);
1497 +      i += strlen (n + i);
1498 +      pattern = n;
1499 +      size = i;
1500 +    }
1501 +
1502 +  dfacomp (pattern, size, &dfa, 1);
1503 +  kwsmusts ();
1504 +}
1505 +
1506 +static void
1507 +Ecompile (char const *pattern, size_t size)
1508 +{
1509 +  const char *err;
1510 +  const char *sep;
1511 +  size_t total = size;
1512 +  char const *motif = pattern;
1513 +
1514 +  if (strcmp (matcher, "awk") == 0)
1515 +    {
1516 +      re_set_syntax (RE_SYNTAX_AWK);
1517 +      dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
1518 +    }
1519 +  else
1520 +    {
1521 +      re_set_syntax (RE_SYNTAX_POSIX_EGREP);
1522 +      dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
1523 +    }
1524 +
1525 +  /* For GNU regex compiler we have to pass the patterns separately to detect
1526 +     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
1527 +     GNU regex should have raise a syntax error.  The same for backref, where
1528 +     the backref should have been local to each pattern.  */
1529 +  do
1530 +    {
1531 +      size_t len;
1532 +      sep = memchr (motif, '\n', total);
1533 +      if (sep)
1534 +       {
1535 +         len = sep - motif;
1536 +         sep++;
1537 +         total -= (len + 1);
1538 +       }
1539 +      else
1540 +       {
1541 +         len = total;
1542 +         total = 0;
1543 +       }
1544 +
1545 +      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
1546 +      if (patterns == NULL)
1547 +       error (2, errno, _("memory exhausted"));
1548 +      patterns[pcount] = patterns0;
1549 +
1550 +      if ((err = re_compile_pattern (motif, len,
1551 +                                   &(patterns[pcount].regexbuf))) != 0)
1552 +       error (2, 0, err);
1553 +      pcount++;
1554 +
1555 +      motif = sep;
1556 +    } while (sep && total != 0);
1557 +
1558 +  /* In the match_words and match_lines cases, we use a different pattern
1559 +     for the DFA matcher that will quickly throw out cases that won't work.
1560 +     Then if DFA succeeds we do some hairy stuff using the regex matcher
1561 +     to decide whether the match should really count. */
1562 +  if (match_words || match_lines)
1563 +    {
1564 +      /* In the whole-word case, we use the pattern:
1565 +        (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
1566 +        In the whole-line case, we use the pattern:
1567 +        ^(userpattern)$.  */
1568 +
1569 +      static char const line_beg[] = "^(";
1570 +      static char const line_end[] = ")$";
1571 +      static char const word_beg[] = "(^|[^[:alnum:]_])(";
1572 +      static char const word_end[] = ")([^[:alnum:]_]|$)";
1573 +      char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
1574 +      size_t i;
1575 +      strcpy (n, match_lines ? line_beg : word_beg);
1576 +      i = strlen(n);
1577 +      memcpy (n + i, pattern, size);
1578 +      i += size;
1579 +      strcpy (n + i, match_lines ? line_end : word_end);
1580 +      i += strlen (n + i);
1581 +      pattern = n;
1582 +      size = i;
1583 +    }
1584 +
1585 +  dfacomp (pattern, size, &dfa, 1);
1586 +  kwsmusts ();
1587 +}
 1588 +/* Search BUF; return the match offset and set *MATCH_SIZE, or (size_t) -1.  */
 1589 +static size_t
 1590 +EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
 1591 +{
 1592 +  register char const *buflim, *beg, *end;
 1593 +  char eol = eolbyte;
 1594 +  int backref, start, len;
 1595 +  struct kwsmatch kwsm;
 1596 +  size_t i;
 1597 +#ifdef MBS_SUPPORT
 1598 +  char *mb_properties = NULL;
 1599 +#endif /* MBS_SUPPORT */
 1600 +
 1601 +#ifdef MBS_SUPPORT
 1602 +  if (MB_CUR_MAX > 1 && kwset)
 1603 +    mb_properties = check_multibyte_string(buf, size);
 1604 +#endif /* MBS_SUPPORT */
 1605 +
 1606 +  buflim = buf + size;
 1607 +
 1608 +  for (beg = end = buf; end < buflim; beg = end)
 1609 +    {
 1610 +      if (!exact)
 1611 +       {
 1612 +         if (kwset)
 1613 +           {
 1614 +             /* Find a possible match using the KWset matcher. */
 1615 +             size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
 1616 +             if (offset == (size_t) -1)
 1617 +               goto failure;
 1618 +             beg += offset;
 1619 +             /* Narrow down to the line containing the candidate, and
 1620 +                run it through DFA. */
 1621 +             end = memchr(beg, eol, buflim - beg);
 1622 +             end++;    /* NOTE(review): memchr result unchecked; assumes an eol follows -- confirm */
 1623 +#ifdef MBS_SUPPORT
 1624 +             if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
 1625 +               continue;
 1626 +#endif
 1627 +             while (beg > buf && beg[-1] != eol)
 1628 +               --beg;
 1629 +             if (kwsm.index < kwset_exact_matches)
 1630 +               goto success_in_beg_and_end;
 1631 +             if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
 1632 +               continue;
 1633 +           }
 1634 +         else
 1635 +           {
 1636 +             /* No good fixed strings; start with DFA. */
 1637 +             size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
 1638 +             if (offset == (size_t) -1)
 1639 +               break;
 1640 +             /* Narrow down to the line we've found. */
 1641 +             beg += offset;
 1642 +             end = memchr (beg, eol, buflim - beg);
 1643 +             end++;    /* NOTE(review): same unchecked memchr as above */
 1644 +             while (beg > buf && beg[-1] != eol)
 1645 +               --beg;
 1646 +           }
 1647 +         /* Successful, no backreferences encountered! */
 1648 +         if (!backref)
 1649 +           goto success_in_beg_and_end;
 1650 +       }
 1651 +      else
 1652 +       end = beg + size;
 1653 +
 1654 +      /* If we've made it to this point, this means DFA has seen
 1655 +        a probable match, and we need to run it through Regex. */
 1656 +      for (i = 0; i < pcount; i++)
 1657 +       {
 1658 +         patterns[i].regexbuf.not_eol = 0;
 1659 +         if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
 1660 +                                      end - beg - 1, 0,
 1661 +                                      end - beg - 1, &(patterns[i].regs))))
 1662 +           {
 1663 +             len = patterns[i].regs.end[0] - start;
 1664 +             if (exact && !match_words)
 1665 +               goto success_in_start_and_len;
 1666 +             if ((!match_lines && !match_words)
 1667 +                 || (match_lines && len == end - beg - 1))
 1668 +               goto success_in_beg_and_end;
 1669 +             /* If -w, check if the match aligns with word boundaries.
 1670 +                We do this iteratively because:
 1671 +                (a) the line may contain more than one occurrence of the
 1672 +                pattern, and
 1673 +                (b) Several alternatives in the pattern might be valid at a
 1674 +                given point, and we may need to consider a shorter one to
 1675 +                find a word boundary.  */
 1676 +             if (match_words)
 1677 +               while (start >= 0)
 1678 +                 {
 1679 +                   if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
 1680 +                       && (len == end - beg - 1
 1681 +                           || !WCHAR ((unsigned char) beg[start + len])))
 1682 +                     goto success_in_beg_and_end;
 1683 +                   if (len > 0)
 1684 +                     {
 1685 +                       /* Try a shorter length anchored at the same place. */
 1686 +                       --len;
 1687 +                       patterns[i].regexbuf.not_eol = 1;
 1688 +                       len = re_match (&(patterns[i].regexbuf), beg,
 1689 +                                       start + len, start,
 1690 +                                       &(patterns[i].regs));
 1691 +                     }
 1692 +                   if (len <= 0)
 1693 +                     {
 1694 +                       /* Try looking further on. */
 1695 +                       if (start == end - beg - 1)
 1696 +                         break;
 1697 +                       ++start;
 1698 +                       patterns[i].regexbuf.not_eol = 0;
 1699 +                       start = re_search (&(patterns[i].regexbuf), beg,
 1700 +                                          end - beg - 1,
 1701 +                                          start, end - beg - 1 - start,
 1702 +                                          &(patterns[i].regs));
 1703 +                       len = patterns[i].regs.end[0] - start;
 1704 +                     }
 1705 +                 }
 1706 +           }
 1707 +       } /* for Regex patterns.  */
 1708 +    } /* for (beg = end ..) */
 1709 +
 1710 + failure:
 1711 +#ifdef MBS_SUPPORT
 1712 +  if (MB_CUR_MAX > 1 && mb_properties)
 1713 +    free (mb_properties);
 1714 +#endif /* MBS_SUPPORT */
 1715 +  return (size_t) -1;
 1716 +
 1717 + success_in_beg_and_end:
 1718 +  len = end - beg;
 1719 +  start = beg - buf;
 1720 +  /* FALLTHROUGH */
 1721 +
 1722 + success_in_start_and_len:
 1723 +#ifdef MBS_SUPPORT
 1724 +  if (MB_CUR_MAX > 1 && mb_properties)
 1725 +    free (mb_properties);
 1726 +#endif /* MBS_SUPPORT */
 1727 +  *match_size = len;
 1728 +  return start;
 1729 +}
1730 +
1731 +static void
1732 +Fcompile (char const *pattern, size_t size)
1733 +{
1734 +  char const *beg, *lim, *err;
1735 +  /* Add each newline-separated piece of PATTERN[0..SIZE) to kwset.  */
1736 +  kwsinit ();
1737 +  beg = pattern;
1738 +  do
1739 +    {
1740 +      for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
1741 +       ; /* Advance LIM to the next newline or to the end of PATTERN.  */
1742 +      if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
1743 +       error (2, 0, "%s", err); /* ERR is message text, not a format.  */
1744 +      if (lim < pattern + size)
1745 +       ++lim; /* Step over the newline separator.  */
1746 +      beg = lim;
1747 +    }
1748 +  while (beg < pattern + size);
1749 +  /* kwsprep reports failure like kwsincr does: a non-null message.  */
1750 +  if ((err = kwsprep (kwset)) != 0)
1751 +    error (2, 0, "%s", err);
1752 +}
1753 +
1754 +static size_t
1755 +Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
1756 +{
1757 +  register char const *beg, *try, *end;
1758 +  register size_t len;
1759 +  char eol = eolbyte;
1760 +  struct kwsmatch kwsmatch;
1761 +#ifdef MBS_SUPPORT
1762 +  char *mb_properties;
1763 +  if (MB_CUR_MAX > 1)
1764 +    mb_properties = check_multibyte_string (buf, size);
1765 +#endif /* MBS_SUPPORT */
1766 +  /* Find the first acceptable kwset match in BUF[0..SIZE); -1 if none.  */
1767 +  for (beg = buf; beg <= buf + size; ++beg)
1768 +    {
1769 +      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
1770 +      if (offset == (size_t) -1)
1771 +       goto failure;
1772 +#ifdef MBS_SUPPORT
1773 +      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
1774 +       continue; /* It is a part of multibyte character.  */
1775 +#endif /* MBS_SUPPORT */
1776 +      beg += offset;
1777 +      len = kwsmatch.size[0];
1778 +      if (exact && !match_words)
1779 +       goto success_in_beg_and_len; /* Report the keyword, not its line.  */
1780 +      if (match_lines)
1781 +       {
1782 +         if (beg > buf && beg[-1] != eol)
1783 +           continue;
1784 +         if (beg + len < buf + size && beg[len] != eol)
1785 +           continue;
1786 +         goto success;
1787 +       }
1788 +      else if (match_words)
1789 +       for (try = beg; len; )
1790 +         {
1791 +           if (try > buf && WCHAR((unsigned char) try[-1]))
1792 +             break;
1793 +           if (try + len < buf + size && WCHAR((unsigned char) try[len]))
1794 +             {
1795 +               offset = kwsexec (kwset, beg, --len, &kwsmatch);
1796 +               if (offset == (size_t) -1)
1797 +                 {
1798 +                   /* No shorter keyword fits at BEG.  Do not report
1799 +                      failure for the whole buffer here: leave the
1800 +                      retry loop so that the outer loop resumes the
1801 +                      search at BEG + 1 and can reach later lines.  */
1802 +                   break;
1803 +                 }
1804 +               try = beg + offset;
1805 +               len = kwsmatch.size[0];
1806 +             }
1807 +           else
1808 +             goto success;
1809 +         }
1810 +      else
1811 +       goto success;
1812 +    }
1813 +
1814 + failure:
1815 +#ifdef MBS_SUPPORT
1816 +  if (MB_CUR_MAX > 1)
1817 +    free (mb_properties);
1818 +#endif /* MBS_SUPPORT */
1819 +  return -1;
1820 +
1821 + success: /* Widen [BEG, BEG + LEN) to the whole line that contains it.  */
1822 +  end = memchr (beg + len, eol, (buf + size) - (beg + len));
1823 +  end = end ? end + 1 : buf + size; /* memchr gives NULL if no EOL is left.  */
1824 +  while (buf < beg && beg[-1] != eol)
1825 +    --beg;
1826 +  len = end - beg;
1827 +  /* FALLTHROUGH */
1828 +
1829 + success_in_beg_and_len: /* Store the length; return BEG as an offset in BUF.  */
1830 +  *match_size = len;
1831 +#ifdef MBS_SUPPORT
1832 +  if (MB_CUR_MAX > 1)
1833 +    free (mb_properties);
1834 +#endif /* MBS_SUPPORT */
1835 +  return beg - buf;
1836 +}
1837 +
1838 +#if HAVE_LIBPCRE
1839 +/* Compiled Perl regular expression; set by pcre_compile in Pcompile.  */
1840 +static pcre *cre;
1841 +
1842 +/* Result of pcre_study on CRE; handed to pcre_exec together with CRE.  */
1843 +static pcre_extra *extra;
1844 +#endif
1845 +
1846 +static void
1847 +Pcompile (char const *pattern, size_t size)
1848 +{
1849 +#if !HAVE_LIBPCRE
1850 +  error (2, 0, _("The -P option is not supported"));
1851 +#else
1852 +  int e;
1853 +  char const *ep;
1854 +  char *re = xmalloc (4 * size + 7); /* 4 per byte, 3 + 3 for the wrapper, 1 for NUL.  */
1855 +  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
1856 +  char const *patlim = pattern + size;
1857 +  char *n = re;
1858 +  char const *p;
1859 +  char const *pnul;
1860 +  /* Copy PATTERN[0..SIZE) into RE with NULs escaped, then compile and study it.  */
1861 +  /* FIXME: Remove this restriction.  */
1862 +  if (eolbyte != '\n')
1863 +    error (2, 0, _("The -P and -z options cannot be combined"));
1864 +
1865 +  *n = '\0';
1866 +  if (match_lines)
1867 +    strcpy (n, "^(");
1868 +  if (match_words)
1869 +    strcpy (n, "\\b("); /* Replaces "^(" when both flags are set.  */
1870 +  n += strlen (n);
1871 +
1872 +  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
1873 +     replace each NUL byte in the pattern with the four characters
1874 +     "\000", removing a preceding backslash if there are an odd
1875 +     number of backslashes before the NUL.
1876 +
1877 +     FIXME: This method does not work with some multibyte character
1878 +     encodings, notably Shift-JIS, where a multibyte character can end
1879 +     in a backslash byte.  */
1880 +  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
1881 +    {
1882 +      memcpy (n, p, pnul - p);
1883 +      n += pnul - p;
1884 +      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
1885 +       continue; /* Walk P back over the backslashes just before the NUL.  */
1886 +      n -= (pnul - p) & 1; /* Odd run: drop the backslash that escaped the NUL.  */
1887 +      strcpy (n, "\\000");
1888 +      n += 4;
1889 +    }
1890 +
1891 +  memcpy (n, p, patlim - p);
1892 +  n += patlim - p;
1893 +  *n = '\0';
1894 +  if (match_words)
1895 +    strcpy (n, ")\\b");
1896 +  if (match_lines)
1897 +    strcpy (n, ")$"); /* Replaces ")\\b" when both flags are set.  */
1898 +
1899 +  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
1900 +  if (!cre)
1901 +    error (2, 0, "%s", ep); /* EP is message text, not a format.  */
1902 +
1903 +  extra = pcre_study (cre, 0, &ep);
1904 +  if (ep)
1905 +    error (2, 0, "%s", ep);
1906 +
1907 +  free (re);
1908 +#endif
1909 +}
1910 +
1911 +static size_t
1912 +Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
1913 +{
1914 +#if !HAVE_LIBPCRE
1915 +  abort ();
1916 +  return -1;
1917 +#else
1918 +  /* This array must have at least two elements; everything after that
1919 +     is just for performance improvement in pcre_exec.  */
1920 +  int sub[300];
1921 +  /* Search BUF[0..SIZE) with CRE; SUB[0] and SUB[1] bound the match found.  */
1922 +  int e = pcre_exec (cre, extra, buf, size, 0, 0,
1923 +                    sub, sizeof sub / sizeof *sub);
1924 +
1925 +  if (e <= 0)
1926 +    {
1927 +      switch (e)
1928 +       {
1929 +       case PCRE_ERROR_NOMATCH:
1930 +         return -1;
1931 +
1932 +       case PCRE_ERROR_NOMEMORY:
1933 +         error (2, 0, _("Memory exhausted"));
1934 +
1935 +       default: /* Every other non-positive result, 0 included, is fatal.  */
1936 +         abort ();
1937 +       }
1938 +    }
1939 +  else
1940 +    {
1941 +      /* Narrow down to the line we've found.  */
1942 +      char const *beg = buf + sub[0];
1943 +      char const *end = buf + sub[1];
1944 +      char const *buflim = buf + size;
1945 +      char eol = eolbyte;
1946 +      if (!exact)
1947 +       {
1948 +         end = memchr (end, eol, buflim - end);
1949 +         end = end ? end + 1 : buflim; /* NULL if no EOL follows the match.  */
1950 +         while (buf < beg && beg[-1] != eol)
1951 +           --beg;
1952 +       }
1953 +
1954 +      *match_size = end - beg;
1955 +      return beg - buf;
1956 +    }
1957 +#endif
1958 +}
1959 +
1960 +struct matcher const matchers[] = { /* Per entry: name, compile function, execute function.  */
1961 +  { "default", Gcompile, EGexecute },
1962 +  { "grep", Gcompile, EGexecute },
1963 +  { "egrep", Ecompile, EGexecute },
1964 +  { "awk", Ecompile, EGexecute },
1965 +  { "fgrep", Fcompile, Fexecute },
1966 +  { "perl", Pcompile, Pexecute },
1967 +  { "", 0, 0 }, /* Empty name, null functions; presumably the end marker -- confirm in the caller.  */
1968 +};
1969 diff -urN grep-2.5.1a.orig/tests/fmbtest.sh grep-2.5.1a/tests/fmbtest.sh
1970 --- grep-2.5.1a.orig/tests/fmbtest.sh   1970-01-01 05:00:00.000000000 +0500
1971 +++ grep-2.5.1a/tests/fmbtest.sh        2005-10-23 09:51:12.000000000 +0600
1972 @@ -0,0 +1,111 @@
1973 +#!/bin/sh
1974 +
1975 +: ${srcdir=.}
1976 +
1977 +# If the cs_CZ.UTF-8 locale is not available, skip this test silently (exit status 77).
1978 +LC_ALL=cs_CZ.UTF-8 locale -k LC_CTYPE 2>/dev/null | ${GREP} -q charmap.*UTF-8 \
1979 +  || exit 77
1980 +
1981 +failures=0
1982 +
1983 +cat > csinput <<EOF
1984 +01 Žluťoučká číše
1985 +ČíŠE 02
1986 +03 Z číší Čiší cosi
1987 +04 Čí
1988 +Še 05
1989 +06 ČČČČČČČíšČÍŠčíš
1990 +07 ČČČ ČČČČíšČÍŠčíšEEEE
1991 +čAs 08
1992 +09Čapka
1993 +10ČaSy se měnÍ
1994 +ČÍšE11
1995 +Čas12
1996 +𝇕ČÍšE𝇓13
1997 +ŽČÍšE𝇓14
1998 +𝇕ČÍšEŽ15
1999 +ŽČÍšEŽ16
2000 +ČÍšE𝇓17
2001 +ČÍšEŽ18
2002 +19𝇕ČÍše
2003 +20ŽČÍše
2004 +EOF
2005 +cat > cspatfile <<EOF
2006 +ČÍšE
2007 +Čas
2008 +EOF
2009 +
2010 +for mode in F G E; do
2011 +
2012 +test1="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode} -f cspatfile csinput \
2013 +              | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
2014 +if test "$test1" != "11 12 13 14 15 16 17 18"; then
2015 +  echo "Test #1 ${mode} failed: $test1"
2016 +  failures=1
2017 +fi
2018 +
2019 +test2="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -f cspatfile csinput \
2020 +              | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
2021 +if test "$test2" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
2022 +  echo "Test #2 ${mode} failed: $test2"
2023 +  failures=1
2024 +fi
2025 +
2026 +test3="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'ČÍšE' -e 'Čas' csinput \
2027 +              | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
2028 +if test "$test3" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
2029 +  echo "Test #3 ${mode} failed: $test3"
2030 +  failures=1
2031 +fi
2032 +
2033 +test4="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}iw -f cspatfile csinput \
2034 +              | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
2035 +if test "$test4" != "01 02 08 13 17 19"; then
2036 +  echo "Test #4 ${mode} failed: $test4"
2037 +  failures=1
2038 +fi
2039 +
2040 +done
2041 +
2042 +# Test that -F --color=always prefers longer matches.
2043 +test5="`echo 'Cosi tu ČišÍ...' \
2044 +       | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -Fi -e 'čiš' -e 'čiší'`"
2045 +if echo "$test5" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then
2046 +  :
2047 +else
2048 +  echo "Test #5 F failed: $test5"
2049 +  failures=1
2050 +fi
2051 +
2052 +for mode in G E; do
2053 +
2054 +# Test that -{G,E} --color=always prefers earlier pattern matches.
2055 +test6="`echo 'Cosi tu ČišÍ...' \
2056 +       | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'čiš' -e 'čiší'`"
2057 +if echo "$test6" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČiš.*\[.*m\(.\[K\)\?Í\.\.\.'; then
2058 +  :
2059 +else
2060 +  echo "Test #6 ${mode} failed: $test6"
2061 +  failures=1
2062 +fi
2063 +
2064 +# Same check with the patterns swapped: the longer pattern now comes first.
2065 +test7="`echo 'Cosi tu ČišÍ...' \
2066 +       | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'čiší' -e 'čiš'`"
2067 +if echo "$test7" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then
2068 +  :
2069 +else
2070 +  echo "Test #7 ${mode} failed: $test7"
2071 +  failures=1
2072 +fi
2073 +
2074 +test8="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'Č.šE' -e 'Č[a-f]s' csinput \
2075 +              | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
2076 +if test "$test8" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
2077 +  echo "Test #8 ${mode} failed: $test8"
2078 +  failures=1
2079 +fi
2080 +
2081 +done
2082 +
2083 +exit $failures
2084 diff -urN grep-2.5.1a.orig/tests/Makefile.am grep-2.5.1a/tests/Makefile.am
2085 --- grep-2.5.1a.orig/tests/Makefile.am  2001-03-07 09:11:27.000000000 +0500
2086 +++ grep-2.5.1a/tests/Makefile.am       2005-10-23 09:51:12.000000000 +0600
2087 @@ -3,7 +3,8 @@
2088  AWK=@AWK@
2089
2090  TESTS = warning.sh khadafy.sh spencer1.sh bre.sh ere.sh \
2091 -        status.sh empty.sh options.sh backref.sh file.sh
2092 +        status.sh empty.sh options.sh backref.sh file.sh \
2093 +        fmbtest.sh
2094  EXTRA_DIST = $(TESTS) \
2095               khadafy.lines khadafy.regexp \
2096               spencer1.awk spencer1.tests \
2097 diff -urN grep-2.5.1a.orig/tests/Makefile.in grep-2.5.1a/tests/Makefile.in
2098 --- grep-2.5.1a.orig/tests/Makefile.in  2002-03-26 21:09:36.000000000 +0500
2099 +++ grep-2.5.1a/tests/Makefile.in       2005-10-23 09:51:13.000000000 +0600
2100 @@ -97,7 +97,8 @@
2101  AWK = @AWK@
2102
2103  TESTS = warning.sh khadafy.sh spencer1.sh bre.sh ere.sh \
2104 -        status.sh empty.sh options.sh backref.sh file.sh
2105 +        status.sh empty.sh options.sh backref.sh file.sh \
2106 +       fmbtest.sh
2107
2108  EXTRA_DIST = $(TESTS) \
2109               khadafy.lines khadafy.regexp \