coreutils/patches/coreutils-i18n.patch

   1 Submitted by:            Xi Ruoyao <xry111@mengyan1223.wang>
   2 Date:                    2022-04-19
   3 Initial Package Version: 9.1
   4 Upstream Status:         Rejected
   5 Origin:                  https://src.fedoraproject.org/rpms/coreutils/raw/9325dbb/f/coreutils-i18n.patch
   6 Description:             Fixes i18n issues with various Coreutils programs
   7
   8 From 01010419a6499768563e7b2f3fd56cf16edda75e Mon Sep 17 00:00:00 2001
   9 From: rpm-build <rpm-build>
  10 Date: Mon, 4 Oct 2021 08:54:37 +0200
  11 Subject: [PATCH] coreutils-i18n.patch
  12
  13 ---
  14  bootstrap.conf              |   1 +
  15  configure.ac                |   2 +
  16  lib/linebuffer.h            |   8 +
  17  lib/mbfile.c                |   3 +
  18  lib/mbfile.h                | 255 ++++++++++++
  19  m4/mbfile.m4                |  14 +
  20  src/cut.c                   | 508 +++++++++++++++++++++--
  21  src/expand-common.c         | 114 ++++++
  22  src/expand-common.h         |  12 +
  23  src/expand.c                |  90 +++-
  24  src/fold.c                  | 312 ++++++++++++--
  25  src/join.c                  | 359 ++++++++++++++--
  26  src/local.mk                |   4 +-
  27  src/pr.c                    | 443 ++++++++++++++++++--
  28  src/sort.c                  | 792 +++++++++++++++++++++++++++++++++---
  29  src/unexpand.c              | 101 ++++-
  30  src/uniq.c                  | 119 +++++-
  31  tests/Coreutils.pm          |   3 +
  32  tests/expand/mb.sh          | 183 +++++++++
  33  tests/i18n/sort.sh          |  29 ++
  34  tests/local.mk              |   4 +
  35  tests/misc/expand.pl        |  42 ++
  36  tests/misc/fold.pl          |  50 ++-
  37  tests/misc/join.pl          |  50 +++
  38  tests/misc/sort-mb-tests.sh |  45 ++
  39  tests/misc/sort-merge.pl    |  42 ++
  40  tests/misc/sort.pl          |  40 +-
  41  tests/misc/unexpand.pl      |  39 ++
  42  tests/misc/uniq.pl          |  55 +++
  43  tests/pr/pr-tests.pl        |  49 +++
  44  tests/unexpand/mb.sh        | 172 ++++++++
  45  31 files changed, 3698 insertions(+), 242 deletions(-)
  46  create mode 100644 lib/mbfile.c
  47  create mode 100644 lib/mbfile.h
  48  create mode 100644 m4/mbfile.m4
  49  create mode 100755 tests/expand/mb.sh
  50  create mode 100755 tests/i18n/sort.sh
  51  create mode 100755 tests/misc/sort-mb-tests.sh
  52  create mode 100755 tests/unexpand/mb.sh
  53
  54 diff --git a/bootstrap.conf b/bootstrap.conf
  55 index c1399e3..60b39cf 100644
  56 --- a/bootstrap.conf
  57 +++ b/bootstrap.conf
  58 @@ -162,6 +162,7 @@ gnulib_modules="
  59    maintainer-makefile
  60    malloc-gnu
  61    manywarnings
  62 +  mbfile
  63    mbrlen
  64    mbrtowc
  65    mbsalign
  66 diff --git a/configure.ac b/configure.ac
  67 index 7e4afc9..4656a35 100644
  68 --- a/configure.ac
  69 +++ b/configure.ac
  70 @@ -476,6 +476,8 @@ fi
  71  # I'm leaving it here for now.  This whole thing needs to be modernized...
  72  gl_WINSIZE_IN_PTEM
  73
  74 +gl_MBFILE
  75 +
  76  gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H
  77
  78  if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \
  79 diff --git a/lib/linebuffer.h b/lib/linebuffer.h
  80 index 07d45ca..af62e6c 100644
  81 --- a/lib/linebuffer.h
  82 +++ b/lib/linebuffer.h
  83 @@ -22,6 +22,11 @@
  84  # include "idx.h"
  85  # include <stdio.h>
  86
  87 +/* Get mbstate_t.  */
  88 +# if HAVE_WCHAR_H
  89 +#  include <wchar.h>
  90 +# endif
  91 +
  92  /* A 'struct linebuffer' holds a line of text. */
  93
  94  struct linebuffer
  95 @@ -29,6 +34,9 @@ struct linebuffer
  96    idx_t size;                  /* Allocated. */
  97    idx_t length;                /* Used. */
  98    char *buffer;
  99 +# if HAVE_WCHAR_H
 100 +  mbstate_t state;
 101 +# endif
 102  };
 103
 104  /* Initialize linebuffer LINEBUFFER for use. */
 105 diff --git a/lib/mbfile.c b/lib/mbfile.c
 106 new file mode 100644
 107 index 0000000..b0a468e
 108 --- /dev/null
 109 +++ b/lib/mbfile.c
 110 @@ -0,0 +1,3 @@
 111 +#include <config.h>
 112 +#define MBFILE_INLINE _GL_EXTERN_INLINE
 113 +#include "mbfile.h"
 114 diff --git a/lib/mbfile.h b/lib/mbfile.h
 115 new file mode 100644
 116 index 0000000..11f1b12
 117 --- /dev/null
 118 +++ b/lib/mbfile.h
 119 @@ -0,0 +1,255 @@
 120 +/* Multibyte character I/O: macros for multi-byte encodings.
 121 +   Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc.
 122 +
 123 +   This program is free software: you can redistribute it and/or modify
 124 +   it under the terms of the GNU General Public License as published by
 125 +   the Free Software Foundation; either version 3 of the License, or
 126 +   (at your option) any later version.
 127 +
 128 +   This program is distributed in the hope that it will be useful,
 129 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 130 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 131 +   GNU General Public License for more details.
 132 +
 133 +   You should have received a copy of the GNU General Public License
 134 +   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 135 +
 136 +/* Written by Mitsuru Chinen <mchinen@yamato.ibm.com>
 137 +   and Bruno Haible <bruno@clisp.org>.  */
 138 +
 139 +/* The macros in this file implement multi-byte character input from a
 140 +   stream.
 141 +
 142 +   mb_file_t
 143 +     is the type for multibyte character input stream, usable for variable
 144 +     declarations.
 145 +
 146 +   mbf_char_t
 147 +     is the type for multibyte character or EOF, usable for variable
 148 +     declarations.
 149 +
 150 +   mbf_init (mbf, stream)
 151 +     initializes the MB_FILE for reading from stream.
 152 +
 153 +   mbf_getc (mbc, mbf)
 154 +     reads the next multibyte character from mbf and stores it in mbc.
 155 +
 156 +   mb_iseof (mbc)
 157 +     returns true if mbc represents the EOF value.
 158 +
 159 +   Here are the function prototypes of the macros.
 160 +
 161 +   extern void          mbf_init (mb_file_t mbf, FILE *stream);
 162 +   extern void          mbf_getc (mbf_char_t mbc, mb_file_t mbf);
 163 +   extern bool          mb_iseof (const mbf_char_t mbc);
 164 + */
 165 +
 166 +#ifndef _MBFILE_H
 167 +#define _MBFILE_H 1
 168 +
 169 +#include <assert.h>
 170 +#include <stdbool.h>
 171 +#include <stdio.h>
 172 +#include <string.h>
 173 +
 174 +/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
 175 +   <wchar.h>.
 176 +   BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
 177 +   <wchar.h>.  */
 178 +#include <stdio.h>
 179 +#include <time.h>
 180 +#include <wchar.h>
 181 +
 182 +#include "mbchar.h"
 183 +
 184 +#ifndef _GL_INLINE_HEADER_BEGIN
 185 + #error "Please include config.h first."
 186 +#endif
 187 +_GL_INLINE_HEADER_BEGIN
 188 +#ifndef MBFILE_INLINE
 189 +# define MBFILE_INLINE _GL_INLINE
 190 +#endif
 191 +
 192 +struct mbfile_multi {
 193 +  FILE *fp;
 194 +  bool eof_seen;
 195 +  bool have_pushback;
 196 +  mbstate_t state;
 197 +  unsigned int bufcount;
 198 +  char buf[MBCHAR_BUF_SIZE];
 199 +  struct mbchar pushback;
 200 +};
 201 +
 202 +MBFILE_INLINE void
 203 +mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
 204 +{
 205 +  size_t bytes;
 206 +
 207 +  /* If EOF has already been seen, don't use getc.  This matters if
 208 +     mbf->fp is connected to an interactive tty.  */
 209 +  if (mbf->eof_seen)
 210 +    goto eof;
 211 +
 212 +  /* Return character pushed back, if there is one.  */
 213 +  if (mbf->have_pushback)
 214 +    {
 215 +      mb_copy (mbc, &mbf->pushback);
 216 +      mbf->have_pushback = false;
 217 +      return;
 218 +    }
 219 +
 220 +  /* Before using mbrtowc, we need at least one byte.  */
 221 +  if (mbf->bufcount == 0)
 222 +    {
 223 +      int c = getc (mbf->fp);
 224 +      if (c == EOF)
 225 +        {
 226 +          mbf->eof_seen = true;
 227 +          goto eof;
 228 +        }
 229 +      mbf->buf[0] = (unsigned char) c;
 230 +      mbf->bufcount++;
 231 +    }
 232 +
 233 +  /* Handle most ASCII characters quickly, without calling mbrtowc().  */
 234 +  if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
 235 +    {
 236 +      /* These characters are part of the basic character set.  ISO C 99
 237 +         guarantees that their wide character code is identical to their
 238 +         char code.  */
 239 +      mbc->wc = mbc->buf[0] = mbf->buf[0];
 240 +      mbc->wc_valid = true;
 241 +      mbc->ptr = &mbc->buf[0];
 242 +      mbc->bytes = 1;
 243 +      mbf->bufcount = 0;
 244 +      return;
 245 +    }
 246 +
 247 +  /* Use mbrtowc on an increasing number of bytes.  Read only as many bytes
 248 +     from mbf->fp as needed.  This is needed to give reasonable interactive
 249 +     behaviour when mbf->fp is connected to an interactive tty.  */
 250 +  for (;;)
 251 +    {
 252 +      /* We don't know whether the 'mbrtowc' function updates the state when
 253 +         it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
 254 +         not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour.  We
 255 +         don't have an autoconf test for this, yet.
 256 +         The new behaviour would allow us to feed the bytes one by one into
 257 +         mbrtowc.  But the old behaviour forces us to feed all bytes since
 258 +         the end of the last character into mbrtowc.  Since we want to retry
 259 +         with more bytes when mbrtowc returns -2, we must backup the state
 260 +         before calling mbrtowc, because implementations with the new
 261 +         behaviour will clobber it.  */
 262 +      mbstate_t backup_state = mbf->state;
 263 +
 264 +      bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
 265 +
 266 +      if (bytes == (size_t) -1)
 267 +        {
 268 +          /* An invalid multibyte sequence was encountered.  */
 269 +          /* Return a single byte.  */
 270 +          bytes = 1;
 271 +          mbc->wc_valid = false;
 272 +          break;
 273 +        }
 274 +      else if (bytes == (size_t) -2)
 275 +        {
 276 +          /* An incomplete multibyte character.  */
 277 +          mbf->state = backup_state;
 278 +          if (mbf->bufcount == MBCHAR_BUF_SIZE)
 279 +            {
 280 +              /* An overlong incomplete multibyte sequence was encountered.  */
 281 +              /* Return a single byte.  */
 282 +              bytes = 1;
 283 +              mbc->wc_valid = false;
 284 +              break;
 285 +            }
 286 +          else
 287 +            {
 288 +              /* Read one more byte and retry mbrtowc.  */
 289 +              int c = getc (mbf->fp);
 290 +              if (c == EOF)
 291 +                {
 292 +                  /* An incomplete multibyte character at the end.  */
 293 +                  mbf->eof_seen = true;
 294 +                  bytes = mbf->bufcount;
 295 +                  mbc->wc_valid = false;
 296 +                  break;
 297 +                }
 298 +              mbf->buf[mbf->bufcount] = (unsigned char) c;
 299 +              mbf->bufcount++;
 300 +            }
 301 +        }
 302 +      else
 303 +        {
 304 +          if (bytes == 0)
 305 +            {
 306 +              /* A null wide character was encountered.  */
 307 +              bytes = 1;
 308 +              assert (mbf->buf[0] == '\0');
 309 +              assert (mbc->wc == 0);
 310 +            }
 311 +          mbc->wc_valid = true;
 312 +          break;
 313 +        }
 314 +    }
 315 +
 316 +  /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
 317 +  mbc->ptr = &mbc->buf[0];
 318 +  memcpy (&mbc->buf[0], &mbf->buf[0], bytes);
 319 +  mbc->bytes = bytes;
 320 +
 321 +  mbf->bufcount -= bytes;
 322 +  if (mbf->bufcount > 0)
 323 +    {
 324 +      /* It's not worth calling memmove() for so few bytes.  */
 325 +      unsigned int count = mbf->bufcount;
 326 +      char *p = &mbf->buf[0];
 327 +
 328 +      do
 329 +        {
 330 +          *p = *(p + bytes);
 331 +          p++;
 332 +        }
 333 +      while (--count > 0);
 334 +    }
 335 +  return;
 336 +
 337 +eof:
 338 +  /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
 339 +  mbc->ptr = NULL;
 340 +  mbc->bytes = 0;
 341 +  mbc->wc_valid = false;
 342 +  return;
 343 +}
 344 +
 345 +MBFILE_INLINE void
 346 +mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf)
 347 +{
 348 +  mb_copy (&mbf->pushback, mbc);
 349 +  mbf->have_pushback = true;
 350 +}
 351 +
 352 +typedef struct mbfile_multi mb_file_t;
 353 +
 354 +typedef mbchar_t mbf_char_t;
 355 +
 356 +#define mbf_init(mbf, stream)                                           \
 357 +  ((mbf).fp = (stream),                                                 \
 358 +   (mbf).eof_seen = false,                                              \
 359 +   (mbf).have_pushback = false,                                         \
 360 +   memset (&(mbf).state, '\0', sizeof (mbstate_t)),                     \
 361 +   (mbf).bufcount = 0)
 362 +
 363 +#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf))
 364 +
 365 +#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf))
 366 +
 367 +#define mb_iseof(mbc) ((mbc).bytes == 0)
 368 +
 369 +#ifndef _GL_INLINE_HEADER_BEGIN
 370 + #error "Please include config.h first."
 371 +#endif
 372 +_GL_INLINE_HEADER_BEGIN
 373 +
 374 +#endif /* _MBFILE_H */
 375 diff --git a/m4/mbfile.m4 b/m4/mbfile.m4
 376 new file mode 100644
 377 index 0000000..8589902
 378 --- /dev/null
 379 +++ b/m4/mbfile.m4
 380 @@ -0,0 +1,14 @@
 381 +# mbfile.m4 serial 7
 382 +dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc.
 383 +dnl This file is free software; the Free Software Foundation
 384 +dnl gives unlimited permission to copy and/or distribute it,
 385 +dnl with or without modifications, as long as this notice is preserved.
 386 +
 387 +dnl autoconf tests required for use of mbfile.h
 388 +dnl From Bruno Haible.
 389 +
 390 +AC_DEFUN([gl_MBFILE],
 391 +[
 392 +  AC_REQUIRE([AC_TYPE_MBSTATE_T])
 393 +  :
 394 +])
 395 diff --git a/src/cut.c b/src/cut.c
 396 index 6fd8978..faef877 100644
 397 --- a/src/cut.c
 398 +++ b/src/cut.c
 399 @@ -28,6 +28,11 @@
 400  #include <assert.h>
 401  #include <getopt.h>
 402  #include <sys/types.h>
 403 +
 404 +/* Get mbstate_t, mbrtowc().  */
 405 +#if HAVE_WCHAR_H
 406 +# include <wchar.h>
 407 +#endif
 408  #include "system.h"
 409
 410  #include "error.h"
 411 @@ -37,6 +42,18 @@
 412
 413  #include "set-fields.h"
 414
 415 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
 416 +   installation; work around this configuration error.        */
 417 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
 418 +# undef MB_LEN_MAX
 419 +# define MB_LEN_MAX 16
 420 +#endif
 421 +
 422 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
 423 +#if HAVE_MBRTOWC && defined mbstate_t
 424 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
 425 +#endif
 426 +
 427  /* The official name of this program (e.g., no 'g' prefix).  */
 428  #define PROGRAM_NAME "cut"
 429
 430 @@ -53,6 +70,52 @@
 431      }                                                                  \
 432    while (0)
 433
  434 +/* If fewer than MB_LEN_MAX bytes remain and STREAM has no EOF/error, slide them to the start of BUF and append up to BUFSIZ more. */
  435 +#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM)                        \
  436 +  do                                                                        \
  437 +    {                                                                        \
  438 +      if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM))        \
  439 +        {                                                                \
  440 +          memmove (BUF, BUFPOS, BUFLEN);                                \
  441 +          BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
  442 +          BUFPOS = BUF;                                                        \
  443 +        }                                                                \
  444 +    }                                                                        \
  445 +  while (0)
 446 +
  447 +/* Decode the character at BUFPOS into WC and its byte length into MBLENGTH; BUFPOS and BUFLEN are left unchanged.
  448 +   WC is WEOF if BUFLEN < 1.  CONVFAIL is true (and MBLENGTH is 1) for an invalid or incomplete sequence, else false. */
  449 +#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
  450 +  do                                                                        \
  451 +    {                                                                        \
  452 +      mbstate_t state_bak;                                                \
  453 +                                                                        \
  454 +      if (BUFLEN < 1)                                                        \
  455 +        {                                                                \
  456 +          WC = WEOF;                                                        \
  457 +          break;                                                        \
  458 +        }                                                                \
  459 +                                                                        \
  460 +      /* Get a wide character. */                                        \
  461 +      CONVFAIL = false;                                                        \
  462 +      state_bak = STATE;                                                \
  463 +      MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE);        \
  464 +                                                                        \
  465 +      switch (MBLENGTH)                                                        \
  466 +        {                                                                \
  467 +        case (size_t)-1:                                                \
  468 +        case (size_t)-2:                                                \
  469 +          CONVFAIL = true;                                                        \
  470 +          STATE = state_bak;                                                \
  471 +          /* Fall through. */                                                \
  472 +                                                                        \
  473 +        case 0:                                                                \
  474 +          MBLENGTH = 1;                                                        \
  475 +          break;                                                        \
  476 +        }                                                                \
  477 +    }                                                                        \
  478 +  while (0)
 479 +
 480
 481  /* Pointer inside RP.  When checking if a byte or field is selected
 482     by a finite range, we check if it is between CURRENT_RP.LO
 483 @@ -60,6 +123,9 @@
 484     CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
 485  static struct field_range_pair *current_rp;
 486
 487 +/* Length of the delimiter given as argument to -d.  */
 488 +size_t delimlen;
 489 +
 490  /* This buffer is used to support the semantics of the -s option
 491     (or lack of same) when the specified field list includes (does
 492     not include) the first field.  In both of those cases, the entire
 493 @@ -72,6 +138,29 @@ static char *field_1_buffer;
 494  /* The number of bytes allocated for FIELD_1_BUFFER.  */
 495  static size_t field_1_bufsize;
 496
 497 +enum operating_mode
 498 +  {
 499 +    undefined_mode,
 500 +
 501 +    /* Output bytes that are at the given positions. */
 502 +    byte_mode,
 503 +
 504 +    /* Output characters that are at the given positions. */
 505 +    character_mode,
 506 +
 507 +    /* Output the given delimiter-separated fields. */
 508 +    field_mode
 509 +  };
 510 +
 511 +static enum operating_mode operating_mode;
 512 +
 513 +/* If nonzero, when in byte mode, don't split multibyte characters.  */
 514 +static int byte_mode_character_aware;
 515 +
  516 +/* If nonzero, use the single-byte code paths even when this program
  517 +   runs in a multibyte locale. */
 518 +static int force_singlebyte_mode;
 519 +
 520  /* If true do not output lines containing no delimiter characters.
 521     Otherwise, all such lines are printed.  This option is valid only
 522     with field mode.  */
 523 @@ -83,10 +172,16 @@ static bool complement;
 524
 525  /* The delimiter character for field mode. */
 526  static unsigned char delim;
 527 +#if HAVE_WCHAR_H
 528 +static wchar_t wcdelim;
 529 +#endif
 530
 531  /* The delimiter for each line/record. */
 532  static unsigned char line_delim = '\n';
 533
 534 +/* True if the --output-delimiter=STRING option was specified.  */
 535 +static bool output_delimiter_specified;
 536 +
 537  /* The length of output_delimiter_string.  */
 538  static size_t output_delimiter_length;
 539
 540 @@ -94,9 +189,6 @@ static size_t output_delimiter_length;
 541     string consisting of the input delimiter.  */
 542  static char *output_delimiter_string;
 543
 544 -/* The output delimiter string contents, if the default.  */
 545 -static char output_delimiter_default[1];
 546 -
 547  /* True if we have ever read standard input. */
 548  static bool have_read_stdin;
 549
 550 @@ -150,7 +242,7 @@ Print selected parts of lines from each FILE to standard output.\n\
 551    -f, --fields=LIST       select only these fields;  also print any line\n\
 552                              that contains no delimiter character, unless\n\
 553                              the -s option is specified\n\
 554 -  -n                      (ignored)\n\
 555 +  -n                      with -b: don't split multibyte characters\n\
 556  "), stdout);
 557        fputs (_("\
 558        --complement        complement the set of selected bytes, characters\n\
 559 @@ -250,7 +342,7 @@ cut_bytes (FILE *stream)
 560            next_item (&byte_idx);
 561            if (print_kth (byte_idx))
 562              {
 563 -              if (output_delimiter_string != output_delimiter_default)
 564 +              if (output_delimiter_specified)
 565                  {
 566                    if (print_delimiter && is_range_start_index (byte_idx))
 567                      {
 568 @@ -266,6 +358,82 @@ cut_bytes (FILE *stream)
 569      }
 570  }
 571
  572 +#if HAVE_MBRTOWC
  573 +/* This function handles the following two cases.
  574 +
  575 +   1. Read from the stream STREAM, printing to standard output any selected
  576 +   characters.
  577 +
  578 +   2. Read from stream STREAM, printing to standard output any selected bytes,
  579 +   without splitting multibyte characters.  */
  580 +
  581 +static void
  582 +cut_characters_or_cut_bytes_no_split (FILE *stream)
  583 +{
  584 +  uintmax_t idx;                /* Position within the current line; reset to 0 at each line delimiter. */
  585 +  char buf[MB_LEN_MAX + BUFSIZ];  /* Read-ahead buffer of undecoded input. */
  586 +  char *bufpos;                /* Next undecoded byte in BUF. */
  587 +  size_t buflen;        /* Number of undecoded bytes starting at BUFPOS. */
  588 +  wint_t wc;                /* Most recently decoded wide character, or WEOF. */
  589 +  size_t mblength;        /* Byte length of the multibyte sequence that was
  590 +                           decoded into WC. */
  591 +  mbstate_t state;        /* Conversion state of the stream. */
  592 +  bool convfail = false;  /* true, when conversion failed. Otherwise false. */
  593 +  /* Whether to begin printing delimiters between ranges for the current line.
  594 +     Set after we've begun printing data corresponding to the first range.  */
  595 +  bool print_delimiter = false;
  596 +
  597 +  idx = 0;
  598 +  buflen = 0;
  599 +  bufpos = buf;
  600 +  memset (&state, '\0', sizeof(mbstate_t));
  601 +
  602 +  current_rp = frp;
  603 +
  604 +  while (1)
  605 +    {
  606 +      REFILL_BUFFER (buf, bufpos, buflen, stream);
  607 +
  608 +      GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
  609 +      (void) convfail;  /* ignore unused */
  610 +
  611 +      if (wc == WEOF)
  612 +        {
  613 +          if (idx > 0)
  614 +            putchar (line_delim);
  615 +          break;
  616 +        }
  617 +      else if (wc == line_delim)
  618 +        {
  619 +          putchar (line_delim);
  620 +          idx = 0;
  621 +          print_delimiter = false;
  622 +          current_rp = frp;
  623 +        }
  624 +      else
  625 +        {
  626 +          next_item (&idx);
  627 +          if (print_kth (idx))
  628 +            {
  629 +              if (output_delimiter_specified)
  630 +                {
  631 +                  if (print_delimiter && is_range_start_index (idx))
  632 +                    {
  633 +                      fwrite (output_delimiter_string, sizeof (char),
  634 +                              output_delimiter_length, stdout);
  635 +                    }
  636 +                  print_delimiter = true;
  637 +                }
  638 +              fwrite (bufpos, mblength, sizeof(char), stdout);
  639 +            }
  640 +        }
  641 +
  642 +      buflen -= mblength;
  643 +      bufpos += mblength;
  644 +    }
  645 +}
  646 +#endif
 647 +
 648  /* Read from stream STREAM, printing to standard output any selected fields.  */
 649
 650  static void
 651 @@ -411,11 +579,218 @@ cut_fields (FILE *stream)
 652      }
 653  }
 654
 655 -/* Process file FILE to standard output, using CUT_STREAM.
  656 +#if HAVE_MBRTOWC
  657 +static void
  658 +cut_fields_mb (FILE *stream)
  659 +{
  660 +  int c;
  661 +  uintmax_t field_idx;
  662 +  int found_any_selected_field;
  663 +  int buffer_first_field;
  664 +  int empty_input;
  665 +  char buf[MB_LEN_MAX + BUFSIZ];  /* Read-ahead buffer of undecoded input. */
  666 +  char *bufpos;                /* Next undecoded byte in BUF. */
  667 +  size_t buflen;        /* Number of undecoded bytes starting at BUFPOS. */
  668 +  wint_t wc = 0;        /* Most recently decoded wide character, or WEOF. */
  669 +  size_t mblength;        /* Byte length of the multibyte sequence that was
  670 +                           decoded into WC. */
  671 +  mbstate_t state;        /* Conversion state of the stream. */
  672 +  bool convfail = false;  /* true, when conversion failed. Otherwise false. */
  673 +
  674 +  current_rp = frp;
  675 +
  676 +  found_any_selected_field = 0;
  677 +  field_idx = 1;
  678 +  bufpos = buf;
  679 +  buflen = 0;
  680 +  memset (&state, '\0', sizeof(mbstate_t));
  681 +
  682 +  c = getc (stream);
  683 +  empty_input = (c == EOF);
  684 +  if (c != EOF)
  685 +  {
  686 +    ungetc (c, stream);
  687 +    wc = 0;
  688 +  }
  689 +  else
  690 +    wc = WEOF;
  691 +
  692 +  /* To support the semantics of the -s flag, we may have to buffer
  693 +     all of the first field to determine whether it is `delimited.'
  694 +     But that is unnecessary if all non-delimited lines must be printed
  695 +     and the first field has been selected, or if non-delimited lines
  696 +     must be suppressed and the first field has *not* been selected.
  697 +     That is because a non-delimited line has exactly one field.  */
  698 +  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
  699 +
  700 +  while (1)
  701 +    {
  702 +      if (field_idx == 1 && buffer_first_field)
  703 +        {
  704 +          int len = 0;
  705 +
  706 +          while (1)
  707 +            {
  708 +              REFILL_BUFFER (buf, bufpos, buflen, stream);
  709 +
  710 +              GET_NEXT_WC_FROM_BUFFER
  711 +                (wc, bufpos, buflen, mblength, state, convfail);
  712 +
  713 +              if (wc == WEOF)
  714 +                break;
  715 +
  716 +              field_1_buffer = xrealloc (field_1_buffer, len + mblength);
  717 +              memcpy (field_1_buffer + len, bufpos, mblength);
  718 +              len += mblength;
  719 +              buflen -= mblength;
  720 +              bufpos += mblength;
  721 +
  722 +              if (!convfail && (wc == line_delim || wc == wcdelim))
  723 +                break;
  724 +            }
  725 +
  726 +          if (len <= 0 && wc == WEOF)
  727 +            break;
  728 +
  729 +          /* If the first field extends to the end of line (it is not
  730 +             delimited) and we are printing all non-delimited lines,
  731 +             print this one.  */
  732 +          if (convfail || (!convfail && wc != wcdelim))
  733 +            {
  734 +              if (suppress_non_delimited)
  735 +                {
  736 +                  /* -s given: print nothing for this non-delimited line.  */
  737 +                }
  738 +              else
  739 +                {
  740 +                  fwrite (field_1_buffer, sizeof (char), len, stdout);
  741 +                  /* Make sure the output line is newline terminated.  */
  742 +                  if (convfail || (!convfail && wc != line_delim))
  743 +                    putchar (line_delim);
  744 +                }
  745 +              continue;
  746 +            }
  747 +
  748 +          if (print_kth (1))
  749 +            {
  750 +              /* Print the field, but not the trailing delimiter.  */
  751 +              fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
  752 +              found_any_selected_field = 1;
  753 +            }
  754 +          next_item (&field_idx);
  755 +        }
  756 +
  757 +      if (wc != WEOF)
  758 +        {
  759 +          if (print_kth (field_idx))
  760 +            {
  761 +              if (found_any_selected_field)
  762 +                {
  763 +                  fwrite (output_delimiter_string, sizeof (char),
  764 +                          output_delimiter_length, stdout);
  765 +                }
  766 +              found_any_selected_field = 1;
  767 +            }
  768 +
  769 +          while (1)
  770 +            {
  771 +              REFILL_BUFFER (buf, bufpos, buflen, stream);
  772 +
  773 +              GET_NEXT_WC_FROM_BUFFER
  774 +                (wc, bufpos, buflen, mblength, state, convfail);
  775 +
  776 +              if (wc == WEOF)
  777 +                break;
  778 +              else if (!convfail && (wc == wcdelim || wc == line_delim))
  779 +                {
  780 +                  buflen -= mblength;
  781 +                  bufpos += mblength;
  782 +                  break;
  783 +                }
  784 +
  785 +              if (print_kth (field_idx))
  786 +                fwrite (bufpos, mblength, sizeof(char), stdout);
  787 +
  788 +              buflen -= mblength;
  789 +              bufpos += mblength;
  790 +            }
  791 +        }
  792 +
  793 +      if ((!convfail || wc == line_delim) && buflen < 1)
  794 +        wc = WEOF;
  795 +
  796 +      if (!convfail && wc == wcdelim)
  797 +        next_item (&field_idx);
  798 +      else if (wc == WEOF || (!convfail && wc == line_delim))
  799 +        {
  800 +          if (found_any_selected_field
  801 +              || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
  802 +            putchar (line_delim);
  803 +          if (wc == WEOF)
  804 +            break;
  805 +          field_idx = 1;
  806 +          current_rp = frp;
  807 +          found_any_selected_field = 0;
  808 +        }
  809 +    }
  810 +}
  811 +#endif
 812 +
  813 +static void
  814 +cut_stream (FILE *stream)
  815 +{
  816 +#if HAVE_MBRTOWC
  817 +  if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
  818 +    {
  819 +      switch (operating_mode)
  820 +        {
  821 +        case byte_mode:
  822 +          if (byte_mode_character_aware)
  823 +            cut_characters_or_cut_bytes_no_split (stream);
  824 +          else
  825 +            cut_bytes (stream);
  826 +          break;
  827 +
  828 +        case character_mode:
  829 +          cut_characters_or_cut_bytes_no_split (stream);
  830 +          break;
  831 +
  832 +        case field_mode:
  833 +          if (delimlen == 1)
  834 +            {
  835 +              /* With a one-byte delimiter in a UTF-8 locale, use the
  836 +                 byte-oriented cut_fields as an optimization: the delimiter byte
  837 +                 cannot occur inside another character, unlike in e.g. SJIS.  */
  838 +              char * loc = setlocale(LC_CTYPE, NULL);
  839 +              if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
  840 +                  strstr (loc, "UTF8") || strstr (loc, "utf8")))
  841 +                {
  842 +                  cut_fields (stream);
  843 +                  break;
  844 +                }
  845 +            }
  846 +          cut_fields_mb (stream);
  847 +          break;
  848 +
  849 +        default:
  850 +          abort ();
  851 +        }
  852 +    }
  853 +  else
  854 +#endif
  855 +    {
  856 +      if (operating_mode == field_mode)
  857 +        cut_fields (stream);
  858 +      else
  859 +        cut_bytes (stream);
  860 +    }
  861 +}
 862 +
 863 +/* Process file FILE to standard output.
 864     Return true if successful.  */
 865
 866  static bool
 867 -cut_file (char const *file, void (*cut_stream) (FILE *))
 868 +cut_file (char const *file)
 869  {
 870    FILE *stream;
 871
 872 @@ -459,8 +834,8 @@ main (int argc, char **argv)
 873    int optc;
 874    bool ok;
 875    bool delim_specified = false;
 876 -  bool byte_mode = false;
 877 -  char *spec_list_string = NULL;
 878 +  char *spec_list_string IF_LINT ( = NULL);
 879 +  char mbdelim[MB_LEN_MAX + 1];
 880
 881    initialize_main (&argc, &argv);
 882    set_program_name (argv[0]);
 883 @@ -470,6 +845,8 @@ main (int argc, char **argv)
 884
 885    atexit (close_stdout);
 886
 887 +  operating_mode = undefined_mode;
 888 +
 889    /* By default, all non-delimited lines are printed.  */
 890    suppress_non_delimited = false;
 891
 892 @@ -481,35 +858,77 @@ main (int argc, char **argv)
 893        switch (optc)
 894          {
 895          case 'b':
 896 -        case 'c':
 897            /* Build the byte list. */
 898 -          byte_mode = true;
 899 -          FALLTHROUGH;
 900 +          if (operating_mode != undefined_mode)
 901 +            FATAL_ERROR (_("only one type of list may be specified"));
 902 +          operating_mode = byte_mode;
 903 +          spec_list_string = optarg;
 904 +          break;
 905 +
 906 +        case 'c':
 907 +          /* Build the character list. */
 908 +          if (operating_mode != undefined_mode)
 909 +            FATAL_ERROR (_("only one type of list may be specified"));
 910 +          operating_mode = character_mode;
 911 +          spec_list_string = optarg;
 912 +          break;
 913 +
 914          case 'f':
 915            /* Build the field list. */
 916 -          if (spec_list_string)
 917 -            FATAL_ERROR (_("only one list may be specified"));
 918 +          if (operating_mode != undefined_mode)
 919 +            FATAL_ERROR (_("only one type of list may be specified"));
 920 +          operating_mode = field_mode;
 921            spec_list_string = optarg;
 922            break;
 923
 924          case 'd':
 925            /* New delimiter. */
 926            /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
 927 -          if (optarg[0] != '\0' && optarg[1] != '\0')
 928 -            FATAL_ERROR (_("the delimiter must be a single character"));
 929 -          delim = optarg[0];
 930 -          delim_specified = true;
 931 +            {
 932 +#if HAVE_MBRTOWC
 933 +              if(MB_CUR_MAX > 1)
 934 +                {
 935 +                  mbstate_t state;
 936 +
 937 +                  memset (&state, '\0', sizeof(mbstate_t));
 938 +                  delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
 939 +
 940 +                  if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
 941 +                    ++force_singlebyte_mode;
 942 +                  else
 943 +                    {
 944 +                      delimlen = (delimlen < 1) ? 1 : delimlen;
 945 +                      if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
 946 +                        FATAL_ERROR (_("the delimiter must be a single character"));
 947 +                      memcpy (mbdelim, optarg, delimlen);
 948 +                      mbdelim[delimlen] = '\0';
 949 +                      if (delimlen == 1)
 950 +                        delim = *optarg;
 951 +                    }
 952 +                }
 953 +
 954 +              if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
 955 +#endif
 956 +                {
 957 +                  if (optarg[0] != '\0' && optarg[1] != '\0')
 958 +                    FATAL_ERROR (_("the delimiter must be a single character"));
 959 +                  delim = (unsigned char) optarg[0];
 960 +                }
 961 +            delim_specified = true;
 962 +          }
 963            break;
 964
 965          case OUTPUT_DELIMITER_OPTION:
 966 +          output_delimiter_specified = true;
 967            /* Interpret --output-delimiter='' to mean
 968               'use the NUL byte as the delimiter.'  */
 969            output_delimiter_length = (optarg[0] == '\0'
 970                                       ? 1 : strlen (optarg));
 971 -          output_delimiter_string = optarg;
 972 +          output_delimiter_string = xstrdup (optarg);
 973            break;
 974
 975          case 'n':
 976 +          byte_mode_character_aware = 1;
 977            break;
 978
 979          case 's':
 980 @@ -533,40 +952,57 @@ main (int argc, char **argv)
 981          }
 982      }
 983
 984 -  if (!spec_list_string)
 985 +  if (operating_mode == undefined_mode)
 986      FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
 987
 988 -  if (byte_mode)
 989 -    {
 990 -      if (delim_specified)
 991 -        FATAL_ERROR (_("an input delimiter may be specified only\
 992 +  if (delim_specified && operating_mode != field_mode)
 993 +    FATAL_ERROR (_("an input delimiter may be specified only\
 994   when operating on fields"));
 995
 996 -      if (suppress_non_delimited)
 997 -        FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
 998 +  if (suppress_non_delimited && operating_mode != field_mode)
 999 +    FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
1000  \tonly when operating on fields"));
1001 -    }
1002
1003    set_fields (spec_list_string,
1004 -              ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0)
1005 -               | (complement ? SETFLD_COMPLEMENT : 0)));
1006 +              ( (operating_mode == field_mode) ? 0 : SETFLD_ERRMSG_USE_POS)
1007 +              | (complement ? SETFLD_COMPLEMENT : 0) );
1008
1009    if (!delim_specified)
1010 -    delim = '\t';
1011 +    {
1012 +      delim = '\t';
1013 +#ifdef HAVE_MBRTOWC
1014 +      wcdelim = L'\t';
1015 +      mbdelim[0] = '\t';
1016 +      mbdelim[1] = '\0';
1017 +      delimlen = 1;
1018 +#endif
1019 +    }
1020
1021    if (output_delimiter_string == NULL)
1022      {
1023 -      output_delimiter_default[0] = delim;
1024 -      output_delimiter_string = output_delimiter_default;
1025 -      output_delimiter_length = 1;
1026 +#ifdef HAVE_MBRTOWC
1027 +      if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
1028 +        {
1029 +          output_delimiter_string = xstrdup(mbdelim);
1030 +          output_delimiter_length = delimlen;
1031 +        }
1032 +
1033 +      if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
1034 +#endif
1035 +        {
1036 +          static char dummy[2];
1037 +          dummy[0] = delim;
1038 +          dummy[1] = '\0';
1039 +          output_delimiter_string = dummy;
1040 +          output_delimiter_length = 1;
1041 +        }
1042      }
1043
1044 -  void (*cut_stream) (FILE *) = byte_mode ? cut_bytes : cut_fields;
1045    if (optind == argc)
1046 -    ok = cut_file ("-", cut_stream);
1047 +    ok = cut_file ("-");
1048    else
1049      for (ok = true; optind < argc; optind++)
1050 -      ok &= cut_file (argv[optind], cut_stream);
1051 +      ok &= cut_file (argv[optind]);
1052
1053
1054    if (have_read_stdin && fclose (stdin) == EOF)
1055 diff --git a/src/expand-common.c b/src/expand-common.c
1056 index deec1bd..b39f740 100644
1057 --- a/src/expand-common.c
1058 +++ b/src/expand-common.c
1059 @@ -19,6 +19,7 @@
1060  #include <assert.h>
1061  #include <stdio.h>
1062  #include <sys/types.h>
1063 +#include <mbfile.h>
1064  #include "system.h"
1065  #include "die.h"
1066  #include "error.h"
1067 @@ -125,6 +126,119 @@ set_increment_size (uintmax_t tabval)
1068    return ok;
1069  }
1070
1071 +extern int
1072 +set_utf_locale (void)
1073 +{
1074 +      /*try using some predefined locale */
1075 +      const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"};
1076 +
1077 +      const int predef_locales_count=3;
1078 +      for (int i=0;i<predef_locales_count;i++)
1079 +        {
1080 +          if (setlocale(LC_ALL,predef_locales[i])!=NULL)
1081 +          {
1082 +            break;
1083 +          }
1084 +          else if (i==predef_locales_count-1)
1085 +          {
1086 +            return 1;
1087 +            error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
1088 +          }
1089 +        }
1090 +        return 0;
1091 +}
1092 +
1093 +extern bool
1094 +check_utf_locale(void)
1095 +{
1096 +  char* locale = setlocale (LC_CTYPE , NULL);
1097 +  if (locale == NULL)
1098 +  {
1099 +    return false;
1100 +  }
1101 +  else if (strcasestr(locale, "utf8") == NULL && strcasestr(locale, "utf-8") == NULL)
1102 +  {
1103 +    return false;
1104 +  }
1105 +  return true;
1106 +}
1107 +
1108 +extern bool
1109 +check_bom(FILE* fp, mb_file_t *mbf)
1110 +{
1111 +  int c;
1112 +
1113 +
1114 +  c=fgetc(fp);
1115 +
1116 +  /* Test for a UTF-8 BOM (EF BB BF) at the start of the stream.  */
1117 +  mbf->bufcount=0;
1118 +  if (c == 0xEF)
1119 +  {
1120 +    c=fgetc(fp);
1121 +  }
1122 +  else
1123 +  {
1124 +    if (c != EOF)
1125 +    {
1126 +      ungetc(c,fp);
1127 +    }
1128 +    return false;
1129 +  }
1130 +
1131 +  if (c == 0xBB)
1132 +  {
1133 +    c=fgetc(fp);
1134 +  }
1135 +  else
1136 +  {
1137 +    if ( c!= EOF )
1138 +    {
1139 +      mbf->buf[0]=(unsigned char) 0xEF;
1140 +      mbf->bufcount=1;
1141 +      ungetc(c,fp);
1142 +      return false;
1143 +    }
1144 +    else
1145 +    {
1146 +      ungetc(0xEF,fp);
1147 +      return false;
1148 +    }
1149 +  }
1150 +  if (c == 0xBF)
1151 +  {
1152 +    mbf->bufcount=0;
1153 +    return true;
1154 +  }
1155 +  else
1156 +  {
1157 +    if (c != EOF)
1158 +    {
1159 +      mbf->buf[0]=(unsigned char) 0xEF;
1160 +      mbf->buf[1]=(unsigned char) 0xBB;
1161 +      mbf->bufcount=2;
1162 +      ungetc(c,fp);
1163 +      return false;
1164 +    }
1165 +    else
1166 +    {
1167 +      mbf->buf[0]=(unsigned char) 0xEF;
1168 +      mbf->bufcount=1;
1169 +      ungetc(0xBB,fp);
1170 +      return false;
1171 +    }
1172 +  }
1173 +  return false;
1174 +}
1175 +
1176 +extern void
1177 +print_bom(void)
1178 +{
1179 +  putc (0xEF, stdout);
1180 +  putc (0xBB, stdout);
1181 +  putc (0xBF, stdout);
1182 +}
1183 +
1184  /* Add the comma or blank separated list of tab stops STOPS
1185     to the list of tab stops.  */
1186  extern void
1187 diff --git a/src/expand-common.h b/src/expand-common.h
1188 index 5f59a0e..835b9d5 100644
1189 --- a/src/expand-common.h
1190 +++ b/src/expand-common.h
1191 @@ -25,6 +25,18 @@ extern size_t max_column_width;
1192  /* The desired exit status.  */
1193  extern int exit_status;
1194
1195 +extern int
1196 +set_utf_locale (void);
1197 +
1198 +extern bool
1199 +check_utf_locale(void);
1200 +
1201 +extern bool
1202 +check_bom(FILE* fp, mb_file_t *mbf);
1203 +
1204 +extern void
1205 +print_bom(void);
1206 +
1207  /* Add tab stop TABVAL to the end of 'tab_list'.  */
1208  extern void
1209  add_tab_stop (uintmax_t tabval);
1210 diff --git a/src/expand.c b/src/expand.c
1211 index ed78ca8..a4cefa1 100644
1212 --- a/src/expand.c
1213 +++ b/src/expand.c
1214 @@ -37,6 +37,9 @@
1215  #include <stdio.h>
1216  #include <getopt.h>
1217  #include <sys/types.h>
1218 +
1219 +#include <mbfile.h>
1220 +
1221  #include "system.h"
1222  #include "die.h"
1223
1224 @@ -97,19 +100,41 @@ expand (void)
1225  {
1226    /* Input stream.  */
1227    FILE *fp = next_file (NULL);
1228 +  mb_file_t mbf;
1229 +  mbf_char_t c;
1230 +  /* True if the starting locale is utf8.  */
1231 +  bool using_utf_locale;
1232 +
1233 +  /* True if the first file contains BOM header.  */
1234 +  bool found_bom;
1235 +  using_utf_locale=check_utf_locale();
1236
1237    if (!fp)
1238      return;
1239 +  mbf_init (mbf, fp);
1240 +  found_bom=check_bom(fp,&mbf);
1241
1242 -  while (true)
1243 +  if (using_utf_locale == false && found_bom == true)
1244 +  {
1245 +    /*try using some predefined locale */
1246 +
1247 +    if (set_utf_locale () != 0)
1248      {
1249 -      /* Input character, or EOF.  */
1250 -      int c;
1251 +      error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
1252 +    }
1253 +  }
1254 +
1255 +
1256 +  if (found_bom == true)
1257 +  {
1258 +    print_bom();
1259 +  }
1260
1261 +  while (true)
1262 +    {
1263        /* If true, perform translations.  */
1264        bool convert = true;
1265
1266 -
1267        /* The following variables have valid values only when CONVERT
1268           is true:  */
1269
1270 @@ -119,17 +144,48 @@ expand (void)
1271        /* Index in TAB_LIST of next tab stop to examine.  */
1272        size_t tab_index = 0;
1273
1274 -
1275        /* Convert a line of text.  */
1276
1277        do
1278          {
1279 -          while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
1280 -            continue;
1281 +          while (true) {
1282 +            mbf_getc (c, mbf);
1283 +            if ((mb_iseof (c)) && (fp = next_file (fp)))
1284 +              {
1285 +                mbf_init (mbf, fp);
1286 +                if (fp!=NULL)
1287 +                {
1288 +                  if (check_bom(fp,&mbf)==true)
1289 +                  {
1290 +                    /*Not the first file - check BOM header*/
1291 +                    if (using_utf_locale==false && found_bom==false)
1292 +                    {
1293 +                      /*BOM header in subsequent file but not in the first one. */
1294 +                      error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
1295 +                    }
1296 +                  }
1297 +                  else
1298 +                  {
1299 +                    if(using_utf_locale==false && found_bom==true)
1300 +                    {
1301 +                      /*First file contained BOM header - locale was switched to UTF
1302 +                       *all subsequent files should contain BOM. */
1303 +                      error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
1304 +                    }
1305 +                  }
1306 +                }
1307 +                continue;
1308 +              }
1309 +            else
1310 +              {
1311 +                break;
1312 +              }
1313 +            }
1314 +
1315
1316            if (convert)
1317              {
1318 -              if (c == '\t')
1319 +              if (mb_iseq (c, '\t'))
1320                  {
1321                    /* Column the next input tab stop is on.  */
1322                    uintmax_t next_tab_column;
1323 @@ -148,32 +204,34 @@ expand (void)
1324                      if (putchar (' ') < 0)
1325                        die (EXIT_FAILURE, errno, _("write error"));
1326
1327 -                  c = ' ';
1328 +                  mb_setascii (&c, ' ');
1329                  }
1330 -              else if (c == '\b')
1331 +              else if (mb_iseq (c, '\b'))
1332                  {
1333                    /* Go back one column, and force recalculation of the
1334                       next tab stop.  */
1335                    column -= !!column;
1336                    tab_index -= !!tab_index;
1337                  }
1338 -              else
1339 +              /* A leading control character could make us trip over.  */
1340 +              else if (!mb_iscntrl (c))
1341                  {
1342 -                  column++;
1343 +                  column += mb_width (c);
1344                    if (!column)
1345                      die (EXIT_FAILURE, 0, _("input line is too long"));
1346                  }
1347
1348 -              convert &= convert_entire_line || !! isblank (c);
1349 +              convert &= convert_entire_line || mb_isblank (c);
1350              }
1351
1352 -          if (c < 0)
1353 +          if (mb_iseof (c))
1354              return;
1355
1356 -          if (putchar (c) < 0)
1357 +          mb_putc (c, stdout);
1358 +          if (ferror (stdout))
1359              die (EXIT_FAILURE, errno, _("write error"));
1360          }
1361 -      while (c != '\n');
1362 +      while (!mb_iseq (c, '\n'));
1363      }
1364  }
1365
1366 diff --git a/src/fold.c b/src/fold.c
1367 index f07a90b..d32dbfd 100644
1368 --- a/src/fold.c
1369 +++ b/src/fold.c
1370 @@ -22,12 +22,34 @@
1371  #include <getopt.h>
1372  #include <sys/types.h>
1373
1374 +/* Get mbstate_t, mbrtowc(), wcwidth().  */
1375 +#if HAVE_WCHAR_H
1376 +# include <wchar.h>
1377 +#endif
1378 +
1379 +/* Get iswprint(), iswblank(), wcwidth().  */
1380 +#if HAVE_WCTYPE_H
1381 +# include <wctype.h>
1382 +#endif
1383 +
1384  #include "system.h"
1385  #include "die.h"
1386  #include "error.h"
1387  #include "fadvise.h"
1388  #include "xdectoint.h"
1389
1390 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
1391 +      installation; work around this configuration error.  */
1392 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
1393 +# undef MB_LEN_MAX
1394 +# define MB_LEN_MAX 16
1395 +#endif
1396 +
1397 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
1398 +#if HAVE_MBRTOWC && defined mbstate_t
1399 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1400 +#endif
1401 +
1402  #define TAB_WIDTH 8
1403
1404  /* The official name of this program (e.g., no 'g' prefix).  */
1405 @@ -35,20 +57,41 @@
1406
1407  #define AUTHORS proper_name ("David MacKenzie")
1408
1409 +#define FATAL_ERROR(Message)                                            \
1410 +  do                                                                    \
1411 +    {                                                                   \
1412 +      error (0, 0, (Message));                                          \
1413 +      usage (2);                                                        \
1414 +    }                                                                   \
1415 +  while (0)
1416 +
1417 +enum operating_mode
1418 +{
1419 +  /* Fold texts by columns that are at the given positions. */
1420 +  column_mode,
1421 +
1422 +  /* Fold texts by bytes that are at the given positions. */
1423 +  byte_mode,
1424 +
1425 +  /* Fold texts by characters that are at the given positions. */
1426 +  character_mode,
1427 +};
1428 +
1429 +/* The current operating mode. (Default: column_mode) */
1430 +static enum operating_mode operating_mode;
1431 +
1432  /* If nonzero, try to break on whitespace. */
1433  static bool break_spaces;
1434
1435 -/* If nonzero, count bytes, not column positions. */
1436 -static bool count_bytes;
1437 -
1438  /* If nonzero, at least one of the files we read was standard input. */
1439  static bool have_read_stdin;
1440
1441 -static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
1442 +static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
1443
1444  static struct option const longopts[] =
1445  {
1446    {"bytes", no_argument, NULL, 'b'},
1447 +  {"characters", no_argument, NULL, 'c'},
1448    {"spaces", no_argument, NULL, 's'},
1449    {"width", required_argument, NULL, 'w'},
1450    {GETOPT_HELP_OPTION_DECL},
1451 @@ -76,6 +119,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
1452
1453        fputs (_("\
1454    -b, --bytes         count bytes rather than columns\n\
1455 +  -c, --characters    count characters rather than columns\n\
1456    -s, --spaces        break at spaces\n\
1457    -w, --width=WIDTH   use WIDTH columns instead of 80\n\
1458  "), stdout);
1459 @@ -93,7 +137,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
1460  static size_t
1461  adjust_column (size_t column, char c)
1462  {
1463 -  if (!count_bytes)
1464 +  if (operating_mode != byte_mode)
1465      {
1466        if (c == '\b')
1467          {
1468 @@ -116,30 +160,14 @@ adjust_column (size_t column, char c)
1469     to stdout, with maximum line length WIDTH.
1470     Return true if successful.  */
1471
1472 -static bool
1473 -fold_file (char const *filename, size_t width)
1474 +static void
1475 +fold_text (FILE *istream, size_t width, int *saved_errno)
1476  {
1477 -  FILE *istream;
1478    int c;
1479    size_t column = 0;           /* Screen column where next char will go. */
1480    size_t offset_out = 0;       /* Index in 'line_out' for next char. */
1481    static char *line_out = NULL;
1482    static size_t allocated_out = 0;
1483 -  int saved_errno;
1484 -
1485 -  if (STREQ (filename, "-"))
1486 -    {
1487 -      istream = stdin;
1488 -      have_read_stdin = true;
1489 -    }
1490 -  else
1491 -    istream = fopen (filename, "r");
1492 -
1493 -  if (istream == NULL)
1494 -    {
1495 -      error (0, errno, "%s", quotef (filename));
1496 -      return false;
1497 -    }
1498
1499    fadvise (istream, FADVISE_SEQUENTIAL);
1500
1501 @@ -169,6 +197,15 @@ fold_file (char const *filename, size_t width)
1502                bool found_blank = false;
1503                size_t logical_end = offset_out;
1504
1505 +              /* If LINE_OUT has no wide character,
1506 +                 put a new wide character in LINE_OUT
1507 +                 if column is bigger than width. */
1508 +              if (offset_out == 0)
1509 +                {
1510 +                  line_out[offset_out++] = c;
1511 +                  continue;
1512 +                }
1513 +
1514                /* Look for the last blank. */
1515                while (logical_end)
1516                  {
1517 @@ -215,13 +252,225 @@ fold_file (char const *filename, size_t width)
1518        line_out[offset_out++] = c;
1519      }
1520
1521 -  saved_errno = errno;
1522 +  *saved_errno = errno;
1523    if (!ferror (istream))
1524 -    saved_errno = 0;
1525 +    *saved_errno = 0;
1526
1527    if (offset_out)
1528      fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1529
1530 +}
1531 +
1532 +#if HAVE_MBRTOWC
1533 +static void
1534 +fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
1535 +{
1536 +  char buf[MB_LEN_MAX + BUFSIZ];  /* For spooling a read byte sequence. */
1537 +  size_t buflen = 0;        /* The length of the byte sequence in buf. */
1538 +  char *bufpos = buf;         /* Next read position of BUF. */
1539 +  wint_t wc;                /* A gotten wide character. */
1540 +  size_t mblength;        /* The byte size of a multibyte character which shows
1541 +                           as same character as WC. */
1542 +  mbstate_t state, state_bak;        /* State of the stream. */
1543 +  int convfail = 0;                /* 1, when conversion is failed. Otherwise 0. */
1544 +
1545 +  static char *line_out = NULL;
1546 +  size_t offset_out = 0;        /* Index in `line_out' for next char. */
1547 +  static size_t allocated_out = 0;
1548 +
1549 +  int increment;
1550 +  size_t column = 0;
1551 +
1552 +  size_t last_blank_pos;
1553 +  size_t last_blank_column;
1554 +  int is_blank_seen;
1555 +  int last_blank_increment = 0;
1556 +  int is_bs_following_last_blank;
1557 +  size_t bs_following_last_blank_num;
1558 +  int is_cr_after_last_blank;
1559 +
1560 +#define CLEAR_FLAGS                                \
1561 +   do                                                \
1562 +     {                                                \
1563 +        last_blank_pos = 0;                        \
1564 +        last_blank_column = 0;                        \
1565 +        is_blank_seen = 0;                        \
1566 +        is_bs_following_last_blank = 0;                \
1567 +        bs_following_last_blank_num = 0;        \
1568 +        is_cr_after_last_blank = 0;                \
1569 +     }                                                \
1570 +   while (0)
1571 +
1572 +#define START_NEW_LINE                        \
1573 +   do                                        \
1574 +     {                                        \
1575 +      putchar ('\n');                        \
1576 +      column = 0;                        \
1577 +      offset_out = 0;                        \
1578 +      CLEAR_FLAGS;                        \
1579 +    }                                        \
1580 +   while (0)
1581 +
1582 +  CLEAR_FLAGS;
1583 +  memset (&state, '\0', sizeof(mbstate_t));
1584 +
1585 +  for (;; bufpos += mblength, buflen -= mblength)
1586 +    {
1587 +      if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
1588 +        {
1589 +          memmove (buf, bufpos, buflen);
1590 +          buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
1591 +          bufpos = buf;
1592 +        }
1593 +
1594 +      if (buflen < 1)
1595 +        break;
1596 +
1597 +      /* Get a wide character. */
1598 +      state_bak = state;
1599 +      mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
1600 +
1601 +      switch (mblength)
1602 +        {
1603 +        case (size_t)-1:
1604 +        case (size_t)-2:
1605 +          convfail++;
1606 +          state = state_bak;
1607 +          /* Fall through. */
1608 +
1609 +        case 0:
1610 +          mblength = 1;
1611 +          break;
1612 +        }
1613 +
1614 +rescan:
1615 +      if (operating_mode == byte_mode)                        /* byte mode */
1616 +        increment = mblength;
1617 +      else if (operating_mode == character_mode)        /* character mode */
1618 +        increment = 1;
1619 +      else                                                /* column mode */
1620 +        {
1621 +          if (convfail)
1622 +            increment = 1;
1623 +          else
1624 +            {
1625 +              switch (wc)
1626 +                {
1627 +                case L'\n':
1628 +                  fwrite (line_out, sizeof(char), offset_out, stdout);
1629 +                  START_NEW_LINE;
1630 +                  continue;
1631 +
1632 +                case L'\b':
1633 +                  increment = (column > 0) ? -1 : 0;
1634 +                  break;
1635 +
1636 +                case L'\r':
1637 +                  increment = -1 * column;
1638 +                  break;
1639 +
1640 +                case L'\t':
1641 +                  increment = 8 - column % 8;
1642 +                  break;
1643 +
1644 +                default:
1645 +                  increment = wcwidth (wc);
1646 +                  increment = (increment < 0) ? 0 : increment;
1647 +                }
1648 +            }
1649 +        }
1650 +
1651 +      if (column + increment > width && break_spaces && last_blank_pos)
1652 +        {
1653 +          fwrite (line_out, sizeof(char), last_blank_pos, stdout);
1654 +          putchar ('\n');
1655 +
1656 +          offset_out = offset_out - last_blank_pos;
1657 +          column = column - last_blank_column + ((is_cr_after_last_blank)
1658 +              ? last_blank_increment : bs_following_last_blank_num);
1659 +          memmove (line_out, line_out + last_blank_pos, offset_out);
1660 +          CLEAR_FLAGS;
1661 +          goto rescan;
1662 +        }
1663 +
1664 +      if (column + increment > width && column != 0)
1665 +        {
1666 +          fwrite (line_out, sizeof(char), offset_out, stdout);
1667 +          START_NEW_LINE;
1668 +          goto rescan;
1669 +        }
1670 +
1671 +      if (allocated_out < offset_out + mblength)
1672 +        {
1673 +          line_out = X2REALLOC (line_out, &allocated_out);
1674 +        }
1675 +
1676 +      memcpy (line_out + offset_out, bufpos, mblength);
1677 +      offset_out += mblength;
1678 +      column += increment;
1679 +
1680 +      if (is_blank_seen && !convfail && wc == L'\r')
1681 +        is_cr_after_last_blank = 1;
1682 +
1683 +      if (is_bs_following_last_blank && !convfail && wc == L'\b')
1684 +        ++bs_following_last_blank_num;
1685 +      else
1686 +        is_bs_following_last_blank = 0;
1687 +
1688 +      if (break_spaces && !convfail && iswblank (wc))
1689 +        {
1690 +          last_blank_pos = offset_out;
1691 +          last_blank_column = column;
1692 +          is_blank_seen = 1;
1693 +          last_blank_increment = increment;
1694 +          is_bs_following_last_blank = 1;
1695 +          bs_following_last_blank_num = 0;
1696 +          is_cr_after_last_blank = 0;
1697 +        }
1698 +    }
1699 +
1700 +  *saved_errno = errno;
1701 +  if (!ferror (istream))
1702 +    *saved_errno = 0;
1703 +
1704 +  if (offset_out)
1705 +    fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1706 +
1707 +}
1708 +#endif
1709 +
1710 +/* Fold file FILENAME, or standard input if FILENAME is "-",
1711 +   to stdout, with maximum line length WIDTH.
1712 +   Return 0 if successful, 1 if an error occurs. */
1713 +
1714 +static bool
1715 +fold_file (char const *filename, size_t width)
1716 +{
1717 +  FILE *istream;
1718 +  int saved_errno;
1719 +
1720 +  if (STREQ (filename, "-"))
1721 +    {
1722 +      istream = stdin;
1723 +      have_read_stdin = 1;
1724 +    }
1725 +  else
1726 +    istream = fopen (filename, "r");
1727 +
1728 +  if (istream == NULL)
1729 +    {
1730 +      error (0, errno, "%s", filename);
1731 +      return 1;
1732 +    }
1733 +
1734 +  /* Define how ISTREAM is being folded. */
1735 +#if HAVE_MBRTOWC
1736 +  if (MB_CUR_MAX > 1)
1737 +    fold_multibyte_text (istream, width, &saved_errno);
1738 +  else
1739 +#endif
1740 +    fold_text (istream, width, &saved_errno);
1741 +
1742    if (STREQ (filename, "-"))
1743      clearerr (istream);
1744    else if (fclose (istream) != 0 && !saved_errno)
1745 @@ -252,7 +501,8 @@ main (int argc, char **argv)
1746
1747    atexit (close_stdout);
1748
1749 -  break_spaces = count_bytes = have_read_stdin = false;
1750 +  operating_mode = column_mode;
1751 +  break_spaces = have_read_stdin = false;
1752
1753    while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
1754      {
1755 @@ -261,7 +511,15 @@ main (int argc, char **argv)
1756        switch (optc)
1757          {
1758          case 'b':              /* Count bytes rather than columns. */
1759 -          count_bytes = true;
1760 +          if (operating_mode != column_mode)
1761 +            FATAL_ERROR (_("only one way of folding may be specified"));
1762 +          operating_mode = byte_mode;
1763 +          break;
1764 +
1765 +        case 'c':
1766 +          if (operating_mode != column_mode)
1767 +            FATAL_ERROR (_("only one way of folding may be specified"));
1768 +          operating_mode = character_mode;
1769            break;
1770
1771          case 's':              /* Break at word boundaries. */
1772 diff --git a/src/join.c b/src/join.c
1773 index f2fd172..6c7d1ed 100644
1774 --- a/src/join.c
1775 +++ b/src/join.c
1776 @@ -22,19 +22,33 @@
1777  #include <sys/types.h>
1778  #include <getopt.h>
1779
1780 +/* Get mbstate_t, mbrtowc(), wcwidth().  */
1781 +#if HAVE_WCHAR_H
1782 +# include <wchar.h>
1783 +#endif
1784 +
1785 +/* Get iswblank(), towupper.  */
1786 +#if HAVE_WCTYPE_H
1787 +# include <wctype.h>
1788 +#endif
1789 +
1790  #include "system.h"
1791  #include "die.h"
1792  #include "error.h"
1793  #include "fadvise.h"
1794  #include "hard-locale.h"
1795  #include "linebuffer.h"
1796 -#include "memcasecmp.h"
1797  #include "quote.h"
1798  #include "stdio--.h"
1799  #include "xmemcoll.h"
1800  #include "xstrtol.h"
1801  #include "argmatch.h"
1802
1803 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
1804 +#if HAVE_MBRTOWC && defined mbstate_t
1805 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1806 +#endif
1807 +
1808  /* The official name of this program (e.g., no 'g' prefix).  */
1809  #define PROGRAM_NAME "join"
1810
1811 @@ -136,10 +150,12 @@ static struct outlist outlist_head;
1812  /* Last element in 'outlist', where a new element can be added.  */
1813  static struct outlist *outlist_end = &outlist_head;
1814
1815 -/* Tab character separating fields.  If negative, fields are separated
1816 -   by any nonempty string of blanks, otherwise by exactly one
1817 -   tab character whose value (when cast to unsigned char) equals TAB.  */
1818 -static int tab = -1;
1819 +/* Tab character separating fields.  If NULL, fields are separated
1820 +   by any nonempty string of blanks.  */
1821 +static char *tab = NULL;
1822 +
1823 +/* The number of bytes used for tab. */
1824 +static size_t tablen = 0;
1825
1826  /* If nonzero, check that the input is correctly ordered. */
1827  static enum
1828 @@ -280,13 +296,14 @@ xfields (struct line *line)
1829    if (ptr == lim)
1830      return;
1831
1832 -  if (0 <= tab && tab != '\n')
1833 +  if (tab != NULL)
1834      {
1835 +      unsigned char t = tab[0];
1836        char *sep;
1837 -      for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
1838 +      for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
1839          extract_field (line, ptr, sep - ptr);
1840      }
1841 -  else if (tab < 0)
1842 +   else
1843      {
1844        /* Skip leading blanks before the first field.  */
1845        while (field_sep (*ptr))
1846 @@ -310,6 +327,147 @@ xfields (struct line *line)
1847    extract_field (line, ptr, lim - ptr);
1848  }
1849
1850 +#if HAVE_MBRTOWC
1851 +static void
1852 +xfields_multibyte (struct line *line)
1853 +{
1854 +  char *ptr = line->buf.buffer;
1855 +  char const *lim = ptr + line->buf.length - 1;
1856 +  wchar_t wc = 0;
1857 +  size_t mblength = 1;
1858 +  mbstate_t state, state_bak;
1859 +  /* Split LINE's buffer into fields, decoding one multibyte character at a time; LIM stops one byte short of the buffer's end.  */
1860 +  memset (&state, 0, sizeof (mbstate_t));
1861 +
1862 +  if (ptr >= lim)
1863 +    return;
1864 +
1865 +  if (tab != NULL)
1866 +    {
1867 +      char *sep = ptr;
1868 +      for (; ptr < lim; ptr = sep + mblength)
1869 +       {
1870 +         sep = ptr;
1871 +         while (sep < lim)
1872 +           {
1873 +             state_bak = state;
1874 +             mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1875 +
1876 +             if (mblength == (size_t)-1 || mblength == (size_t)-2)  /* Undecodable: step over one byte.  */
1877 +               {
1878 +                 mblength = 1;
1879 +                 state = state_bak;
1880 +               }
1881 +             mblength = (mblength < 1) ? 1 : mblength;  /* mbrtowc returns 0 for a NUL character.  */
1882 +
1883 +             if (mblength == tablen && !memcmp (sep, tab, mblength))
1884 +               break;
1885 +             else
1886 +               {
1887 +                 sep += mblength;
1888 +                 continue;
1889 +               }
1890 +           }
1891 +
1892 +         if (sep >= lim)
1893 +           break;
1894 +
1895 +         extract_field (line, ptr, sep - ptr);
1896 +       }
1897 +    }
1898 +  else
1899 +    {
1900 +      /* Skip leading blanks before the first field.  */
1901 +      while(ptr < lim)
1902 +      {
1903 +        state_bak = state;
1904 +        mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1905 +
1906 +        if (mblength == (size_t)-1 || mblength == (size_t)-2)
1907 +          {
1908 +            mblength = 1;
1909 +            state = state_bak;
1910 +            break;
1911 +          }
1912 +        mblength = (mblength < 1) ? 1 : mblength;
1913 +
1914 +        if (!iswblank(wc) && wc != '\n')
1915 +          break;
1916 +        ptr += mblength;
1917 +      }
1918 +      if (ptr >= lim) return;  /* Only blanks remain: record no field rather than one holding the byte at LIM.  */
1919 +      do
1920 +       {
1921 +         char *sep;
1922 +         state_bak = state;
1923 +         mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1924 +         if (mblength == (size_t)-1 || mblength == (size_t)-2)
1925 +           {
1926 +             mblength = 1;
1927 +             state = state_bak;
1928 +             break;
1929 +           }
1930 +         mblength = (mblength < 1) ? 1 : mblength;
1931 +
1932 +         sep = ptr + mblength;
1933 +         while (sep < lim)  /* Advance SEP to the blank that ends this field.  */
1934 +           {
1935 +             state_bak = state;
1936 +             mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1937 +             if (mblength == (size_t)-1 || mblength == (size_t)-2)
1938 +               {
1939 +                 mblength = 1;
1940 +                 state = state_bak;
1941 +                 break;
1942 +               }
1943 +             mblength = (mblength < 1) ? 1 : mblength;
1944 +
1945 +             if (iswblank (wc) || wc == '\n')
1946 +               break;
1947 +
1948 +             sep += mblength;
1949 +           }
1950 +
1951 +         extract_field (line, ptr, sep - ptr);
1952 +         if (sep >= lim)
1953 +           return;
1954 +
1955 +         state_bak = state;
1956 +         mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1957 +         if (mblength == (size_t)-1 || mblength == (size_t)-2)
1958 +           {
1959 +             mblength = 1;
1960 +             state = state_bak;
1961 +             break;
1962 +           }
1963 +         mblength = (mblength < 1) ? 1 : mblength;
1964 +
1965 +         ptr = sep + mblength;
1966 +         while (ptr < lim)  /* Skip the run of blanks before the next field.  */
1967 +           {
1968 +             state_bak = state;
1969 +             mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1970 +             if (mblength == (size_t)-1 || mblength == (size_t)-2)
1971 +               {
1972 +                 mblength = 1;
1973 +                 state = state_bak;
1974 +                 break;
1975 +               }
1976 +             mblength = (mblength < 1) ? 1 : mblength;
1977 +
1978 +             if (!iswblank (wc) && wc != '\n')
1979 +               break;
1980 +
1981 +             ptr += mblength;
1982 +           }
1983 +       }
1984 +      while (ptr < lim);
1985 +    }
1986 +
1987 +  extract_field (line, ptr, lim - ptr);
1988 +}
1989 +#endif
1990 +
1991  static void
1992  freeline (struct line *line)
1993  {
1994 @@ -331,56 +489,133 @@ keycmp (struct line const *line1, struct line const *line2,
1995          size_t jf_1, size_t jf_2)
1996  {
1997    /* Start of field to compare in each file.  */
1998 -  char *beg1;
1999 -  char *beg2;
2000 -
2001 -  size_t len1;
2002 -  size_t len2;         /* Length of fields to compare.  */
2003 +  char *beg[2];
2004 +  char *copy[2];       /* Fields as compared: case-folded copies, or aliases of beg[].  */
2005 +  size_t len[2];       /* Length of fields to compare.  */
2006    int diff;
2007 +  int i, j;
2008 +  int mallocd = 0;     /* Nonzero once copy[] holds heap buffers that must be freed.  */
2009
2010    if (jf_1 < line1->nfields)
2011      {
2012 -      beg1 = line1->fields[jf_1].beg;
2013 -      len1 = line1->fields[jf_1].len;
2014 +      beg[0] = line1->fields[jf_1].beg;
2015 +      len[0] = line1->fields[jf_1].len;
2016      }
2017    else
2018      {
2019 -      beg1 = NULL;
2020 -      len1 = 0;
2021 +      beg[0] = NULL;
2022 +      len[0] = 0;
2023      }
2024
2025    if (jf_2 < line2->nfields)
2026      {
2027 -      beg2 = line2->fields[jf_2].beg;
2028 -      len2 = line2->fields[jf_2].len;
2029 +      beg[1] = line2->fields[jf_2].beg;
2030 +      len[1] = line2->fields[jf_2].len;
2031      }
2032    else
2033      {
2034 -      beg2 = NULL;
2035 -      len2 = 0;
2036 +      beg[1] = NULL;
2037 +      len[1] = 0;
2038      }
2039
2040 -  if (len1 == 0)
2041 -    return len2 == 0 ? 0 : -1;
2042 -  if (len2 == 0)
2043 +  if (len[0] == 0)
2044 +    return len[1] == 0 ? 0 : -1;
2045 +  if (len[1] == 0)
2046      return 1;
2047
2048    if (ignore_case)
2049      {
2050 -      /* FIXME: ignore_case does not work with NLS (in particular,
2051 -         with multibyte chars).  */
2052 -      diff = memcasecmp (beg1, beg2, MIN (len1, len2));
2053 +#ifdef HAVE_MBRTOWC
2054 +      if (MB_CUR_MAX > 1)
2055 +      {
2056 +        size_t mblength;
2057 +        wchar_t wc, uwc;
2058 +        mbstate_t state, state_bak;
2059 +
2060 +        memset (&state, '\0', sizeof (mbstate_t));
2061 +
2062 +        for (i = 0; i < 2; i++)
2063 +          {
2064 +            mallocd = 1;
2065 +            copy[i] = xmalloc (len[i] + 1);
2066 +            memset (copy[i], '\0',len[i] + 1);
2067 +
2068 +            for (j = 0; j < len[i];)  /* Fold the whole field: xmemcoll below reads all len[i] bytes.  */
2069 +              {
2070 +                state_bak = state;
2071 +                mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
2072 +
2073 +                switch (mblength)
2074 +                  {
2075 +                  case (size_t) -1:
2076 +                  case (size_t) -2:
2077 +                    state = state_bak;
2078 +                    /* Fall through */
2079 +                  case 0:
2080 +                    mblength = 1;
2081 +                    break;
2082 +
2083 +                  default:
2084 +                    uwc = towupper (wc);
2085 +
2086 +                    if (uwc != wc)
2087 +                      {
2088 +                        mbstate_t state_wc;
2089 +                        size_t mblen;
2090 +
2091 +                        memset (&state_wc, '\0', sizeof (mbstate_t));
2092 +                        mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
2093 +                        assert (mblen != (size_t)-1);
2094 +                      }
2095 +                    else
2096 +                      memcpy (copy[i] + j, beg[i] + j, mblength);
2097 +                  }
2098 +                j += mblength;
2099 +              }
2100 +            copy[i][j] = '\0';
2101 +          }
2102 +      }
2103 +      else
2104 +#endif
2105 +      {
2106 +        for (i = 0; i < 2; i++)
2107 +          {
2108 +            mallocd = 1;
2109 +            copy[i] = xmalloc (len[i] + 1);
2110 +
2111 +            for (j = 0; j < len[i]; j++)  /* Whole field, so no uninitialized bytes reach xmemcoll.  */
2112 +              copy[i][j] = toupper (beg[i][j]);
2113 +
2114 +            copy[i][j] = '\0';
2115 +          }
2116 +      }
2117      }
2118    else
2119      {
2120 -      if (hard_LC_COLLATE)
2121 -        return xmemcoll (beg1, len1, beg2, len2);
2122 -      diff = memcmp (beg1, beg2, MIN (len1, len2));
2123 +      copy[0] = beg[0];
2124 +      copy[1] = beg[1];
2125      }
2126
2127 +  if (hard_LC_COLLATE)
2128 +    {
2129 +      diff = xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
2130 +
2131 +      if (mallocd)
2132 +        for (i = 0; i < 2; i++)
2133 +          free (copy[i]);
2134 +
2135 +      return diff;
2136 +    }
2137 +  diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
2138 +
2139 +  if (mallocd)
2140 +    for (i = 0; i < 2; i++)
2141 +      free (copy[i]);
2142 +
2143 +
2144    if (diff)
2145      return diff;
2146 -  return len1 < len2 ? -1 : len1 != len2;
2147 +  return len[0] < len[1] ? -1 : len[0] != len[1];  /* Three-way compare; a size_t difference would be narrowed to int.  */
2148  }
2149
2150  /* Check that successive input lines PREV and CURRENT from input file
2151 @@ -472,6 +707,11 @@ get_line (FILE *fp, struct line **linep, int which)
2152      }
2153    ++line_no[which - 1];
2154
2155 +#if HAVE_MBRTOWC
2156 +  if (MB_CUR_MAX > 1)
2157 +    xfields_multibyte (line);
2158 +  else
2159 +#endif
2160    xfields (line);
2161
2162    if (prevline[which - 1])
2163 @@ -567,21 +807,28 @@ prfield (size_t n, struct line const *line)
2164
2165  /* Output all the fields in line, other than the join field.  */
2166
2167 +#define PUT_TAB_CHAR  /* Write the output field separator: TAB's tablen bytes when set, else one space.  */ \
2168 +  do                                                                   \
2169 +    {                                                                  \
2170 +      if (tab != NULL) fwrite (tab, sizeof (char), tablen, stdout);    \
2171 +      else putchar (' ');                                              \
2172 +    }                                                                  \
2173 +  while (0)
2174 +
2175  static void
2176  prfields (struct line const *line, size_t join_field, size_t autocount)
2177  {
2178    size_t i;
2179    size_t nfields = autoformat ? autocount : line->nfields;
2180 -  char output_separator = tab < 0 ? ' ' : tab;
2181
2182    for (i = 0; i < join_field && i < nfields; ++i)
2183      {
2184 -      putchar (output_separator);
2185 +      PUT_TAB_CHAR;
2186        prfield (i, line);
2187      }
2188    for (i = join_field + 1; i < nfields; ++i)
2189      {
2190 -      putchar (output_separator);
2191 +      PUT_TAB_CHAR;
2192        prfield (i, line);
2193      }
2194  }
2195 @@ -592,7 +839,6 @@ static void
2196  prjoin (struct line const *line1, struct line const *line2)
2197  {
2198    const struct outlist *outlist;
2199 -  char output_separator = tab < 0 ? ' ' : tab;
2200    size_t field;
2201    struct line const *line;
2202
2203 @@ -626,7 +872,7 @@ prjoin (struct line const *line1, struct line const *line2)
2204            o = o->next;
2205            if (o == NULL)
2206              break;
2207 -          putchar (output_separator);
2208 +          PUT_TAB_CHAR;
2209          }
2210        putchar (eolchar);
2211      }
2212 @@ -1102,20 +1348,43 @@ main (int argc, char **argv)
2213
2214          case 't':
2215            {
2216 -            unsigned char newtab = optarg[0];
2217 +            char *newtab = NULL;
2218 +            size_t newtablen;
2219 +            newtab = xstrdup (optarg);
2220 +#if HAVE_MBRTOWC
2221 +            if (MB_CUR_MAX > 1)
2222 +              {
2223 +                mbstate_t state;
2224 +
2225 +                memset (&state, 0, sizeof (mbstate_t));
2226 +                newtablen = mbrtowc (NULL, newtab,
2227 +                                     strnlen (newtab, MB_LEN_MAX),
2228 +                                     &state);
2229 +                if (newtablen == (size_t) 0
2230 +                    || newtablen == (size_t) -1
2231 +                    || newtablen == (size_t) -2)
2232 +                  newtablen = 1;
2233 +              }
2234 +            else
2235 +#endif
2236 +              newtablen = 1;
2237              if (! newtab)
2238 -              newtab = '\n'; /* '' => process the whole line.  */
2239 +              newtab = (char*)"\n"; /* '' => process the whole line.  */
2240              else if (optarg[1])
2241                {
2242 -                if (STREQ (optarg, "\\0"))
2243 -                  newtab = '\0';
2244 -                else
2245 -                  die (EXIT_FAILURE, 0, _("multi-character tab %s"),
2246 -                       quote (optarg));
2247 +                if (newtablen == 1 && newtab[1])
2248 +                {
2249 +                  if (STREQ (newtab, "\\0"))
2250 +                     newtab[0] = '\0';
2251 +                }
2252 +              }
2253 +            if (tab != NULL && strcmp (tab, newtab))
2254 +              {
2255 +                free (newtab);
2256 +                die (EXIT_FAILURE, 0, _("incompatible tabs"));
2257                }
2258 -            if (0 <= tab && tab != newtab)
2259 -              die (EXIT_FAILURE, 0, _("incompatible tabs"));
2260              tab = newtab;
2261 +            tablen = newtablen;
2262            }
2263            break;
2264
2265 diff --git a/src/local.mk b/src/local.mk
2266 index e1d15ce..1a5ffaa 100644
2267 --- a/src/local.mk
2268 +++ b/src/local.mk
2269 @@ -434,8 +434,8 @@ src_base32_CPPFLAGS = -DBASE_TYPE=32 $(AM_CPPFLAGS)
2270  src_basenc_SOURCES = src/basenc.c
2271  src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS)
2272
2273 -src_expand_SOURCES = src/expand.c src/expand-common.c
2274 -src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
2275 +src_expand_SOURCES = src/expand.c src/expand-common.c lib/mbfile.c
2276 +src_unexpand_SOURCES = src/unexpand.c src/expand-common.c lib/mbfile.c
2277
2278  src_wc_SOURCES = src/wc.c
2279  if USE_AVX2_WC_LINECOUNT
2280 diff --git a/src/pr.c b/src/pr.c
2281 index 4c17c00..b4fab1c 100644
2282 --- a/src/pr.c
2283 +++ b/src/pr.c
2284 @@ -311,6 +311,24 @@
2285
2286  #include <getopt.h>
2287  #include <sys/types.h>
2288 +
2289 +/* Get MB_LEN_MAX.  */
2290 +#include <limits.h>
2291 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
2292 +   installation; work around this configuration error.  */
2293 +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
2294 +# define MB_LEN_MAX 16
2295 +#endif
2296 +
2297 +/* Get MB_CUR_MAX.  */
2298 +#include <stdlib.h>
2299 +
2300 +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
2301 +/* Get mbstate_t, mbrtowc(), wcwidth().  */
2302 +#if HAVE_WCHAR_H
2303 +# include <wchar.h>
2304 +#endif
2305 +
2306  #include "system.h"
2307  #include "die.h"
2308  #include "error.h"
2309 @@ -325,6 +343,18 @@
2310  #include "xstrtol-error.h"
2311  #include "xdectoint.h"
2312
2313 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
2314 +#if HAVE_MBRTOWC && defined mbstate_t
2315 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
2316 +#endif
2317 +
2318 +#ifndef HAVE_DECL_WCWIDTH
2319 +"this configure-time declaration test was not run"
2320 +#endif
2321 +#if !HAVE_DECL_WCWIDTH
2322 +extern int wcwidth ();
2323 +#endif
2324 +
2325  /* The official name of this program (e.g., no 'g' prefix).  */
2326  #define PROGRAM_NAME "pr"
2327
2328 @@ -417,7 +447,20 @@ struct COLUMN
2329
2330  typedef struct COLUMN COLUMN;
2331
2332 -static int char_to_clump (char c);
2333 +/* Function pointers to switch functions for single byte locale or for
2334 +   multibyte locale. If multibyte functions do not exist in your system,
2335 +   these pointers always point to the functions for single byte locale. */
2336 +static void (*print_char) (char c);
2337 +static int (*char_to_clump) (char c);
2338 +
2339 +/* Functions for single byte locale. */
2340 +static void print_char_single (char c);
2341 +static int char_to_clump_single (char c);
2342 +
2343 +/* Functions for multibyte locale. */
2344 +static void print_char_multi (char c);
2345 +static int char_to_clump_multi (char c);
2346 +
2347  static bool read_line (COLUMN *p);
2348  static bool print_page (void);
2349  static bool print_stored (COLUMN *p);
2350 @@ -429,6 +472,7 @@ static void add_line_number (COLUMN *p);
2351  static void getoptnum (char const *n_str, int min, int *num,
2352                         char const *errfmt);
2353  static void getoptarg (char *arg, char switch_char, char *character,
2354 +                       int *character_length, int *character_width,
2355                         int *number);
2356  static void print_files (int number_of_files, char **av);
2357  static void init_parameters (int number_of_files);
2358 @@ -442,7 +486,6 @@ static void store_char (char c);
2359  static void pad_down (unsigned int lines);
2360  static void read_rest_of_line (COLUMN *p);
2361  static void skip_read (COLUMN *p, int column_number);
2362 -static void print_char (char c);
2363  static void cleanup (void);
2364  static void print_sep_string (void);
2365  static void separator_string (char const *optarg_S);
2366 @@ -454,7 +497,7 @@ static COLUMN *column_vector;
2367     we store the leftmost columns contiguously in buff.
2368     To print a line from buff, get the index of the first character
2369     from line_vector[i], and print up to line_vector[i + 1]. */
2370 -static char *buff;
2371 +static unsigned char *buff;
2372
2373  /* Index of the position in buff where the next character
2374     will be stored. */
2375 @@ -558,7 +601,7 @@ static int chars_per_column;
2376  static bool untabify_input = false;
2377
2378  /* (-e) The input tab character. */
2379 -static char input_tab_char = '\t';
2380 +static char input_tab_char[MB_LEN_MAX] = "\t";
2381
2382  /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
2383     where the leftmost column is 1. */
2384 @@ -568,7 +611,10 @@ static int chars_per_input_tab = 8;
2385  static bool tabify_output = false;
2386
2387  /* (-i) The output tab character. */
2388 -static char output_tab_char = '\t';
2389 +static char output_tab_char[MB_LEN_MAX] = "\t";
2390 +
2391 +/* (-i) The byte length of output tab character. */
2392 +static int output_tab_char_length = 1;
2393
2394  /* (-i) The width of the output tab. */
2395  static int chars_per_output_tab = 8;
2396 @@ -638,7 +684,13 @@ static int line_number;
2397  static bool numbered_lines = false;
2398
2399  /* (-n) Character which follows each line number. */
2400 -static char number_separator = '\t';
2401 +static char number_separator[MB_LEN_MAX] = "\t";
2402 +
2403 +/* (-n) The byte length of the character which follows each line number. */
2404 +static int number_separator_length = 1;
2405 +
2406 +/* (-n) The character width of the character which follows each line number. */
2407 +static int number_separator_width = 0;
2408
2409  /* (-n) line counting starts with 1st line of input file (not with 1st
2410     line of 1st page printed). */
2411 @@ -691,6 +743,7 @@ static bool use_col_separator = false;
2412     -a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */
2413  static char const *col_sep_string = "";
2414  static int col_sep_length = 0;
2415 +static int col_sep_width = 0;
2416  static char *column_separator = (char *) " ";
2417  static char *line_separator = (char *) "\t";
2418
2419 @@ -853,6 +906,13 @@ separator_string (char const *optarg_S)
2420      integer_overflow ();
2421    col_sep_length = len;
2422    col_sep_string = optarg_S;
2423 +
2424 +#if HAVE_MBRTOWC
2425 +  if (MB_CUR_MAX > 1)
2426 +    col_sep_width = mbswidth (col_sep_string, 0);
2427 +  else
2428 +#endif
2429 +    col_sep_width = col_sep_length;
2430  }
2431
2432  int
2433 @@ -877,6 +937,21 @@ main (int argc, char **argv)
2434
2435    atexit (close_stdout);
2436
2437 +/* Define which functions are used, the ones for single byte locale or the ones
2438 +   for multibyte locale. */
2439 +#if HAVE_MBRTOWC
2440 +  if (MB_CUR_MAX > 1)
2441 +    {
2442 +      print_char = print_char_multi;
2443 +      char_to_clump = char_to_clump_multi;
2444 +    }
2445 +  else
2446 +#endif
2447 +    {
2448 +      print_char = print_char_single;
2449 +      char_to_clump = char_to_clump_single;
2450 +    }
2451 +
2452    n_files = 0;
2453    file_names = (argc > 1
2454                  ? xnmalloc (argc - 1, sizeof (char *))
2455 @@ -953,8 +1028,12 @@ main (int argc, char **argv)
2456            break;
2457          case 'e':
2458            if (optarg)
2459 -            getoptarg (optarg, 'e', &input_tab_char,
2460 -                       &chars_per_input_tab);
2461 +            {
2462 +              int dummy_length, dummy_width;
2463 +
2464 +              getoptarg (optarg, 'e', input_tab_char, &dummy_length,
2465 +                         &dummy_width, &chars_per_input_tab);
2466 +            }
2467            /* Could check tab width > 0. */
2468            untabify_input = true;
2469            break;
2470 @@ -967,8 +1046,12 @@ main (int argc, char **argv)
2471            break;
2472          case 'i':
2473            if (optarg)
2474 -            getoptarg (optarg, 'i', &output_tab_char,
2475 -                       &chars_per_output_tab);
2476 +            {
2477 +              int dummy_width;
2478 +
2479 +              getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
2480 +                         &dummy_width, &chars_per_output_tab);
2481 +            }
2482            /* Could check tab width > 0. */
2483            tabify_output = true;
2484            break;
2485 @@ -986,8 +1069,8 @@ main (int argc, char **argv)
2486          case 'n':
2487            numbered_lines = true;
2488            if (optarg)
2489 -            getoptarg (optarg, 'n', &number_separator,
2490 -                       &chars_per_number);
2491 +            getoptarg (optarg, 'n', number_separator, &number_separator_length,
2492 +                       &number_separator_width, &chars_per_number);
2493            break;
2494          case 'N':
2495            skip_count = false;
2496 @@ -1012,6 +1095,7 @@ main (int argc, char **argv)
2497            /* Reset an additional input of -s, -S dominates -s */
2498            col_sep_string = "";
2499            col_sep_length = 0;
2500 +          col_sep_width = 0;
2501            use_col_separator = true;
2502            if (optarg)
2503              separator_string (optarg);
2504 @@ -1166,10 +1250,45 @@ getoptnum (char const *n_str, int min, int *num, char const *err)
2505     a number. */
2506
2507  static void
2508 -getoptarg (char *arg, char switch_char, char *character, int *number)
2509 +getoptarg (char *arg, char switch_char, char *character, int *character_length,
2510 +           int *character_width, int *number)
2511  {
2512    if (!ISDIGIT (*arg))
2513 -    *character = *arg++;
2514 +    {
2515 +#ifdef HAVE_MBRTOWC
2516 +      if (MB_CUR_MAX > 1)        /* for multibyte locale. */
2517 +        {
2518 +          wchar_t wc;
2519 +          size_t mblength;
2520 +          int width;
2521 +          mbstate_t state = {'\0'};
2522 +
2523 +          mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
2524 +
2525 +          if (mblength == (size_t)-1 || mblength == (size_t)-2)
2526 +            {
2527 +              *character_length = 1;
2528 +              *character_width = 1;
2529 +            }
2530 +          else
2531 +            {
2532 +              *character_length = (mblength < 1) ? 1 : mblength;
2533 +              width = wcwidth (wc);
2534 +              *character_width = (width < 0) ? 0 : width;
2535 +            }
2536 +
2537 +          strncpy (character, arg, *character_length);
2538 +          arg += *character_length;
2539 +        }
2540 +      else                        /* for single byte locale. */
2541 +#endif
2542 +        {
2543 +          *character = *arg++;
2544 +          *character_length = 1;
2545 +          *character_width = 1;
2546 +        }
2547 +    }
2548 +
2549    if (*arg)
2550      {
2551        long int tmp_long;
2552 @@ -1191,6 +1310,11 @@ static void
2553  init_parameters (int number_of_files)
2554  {
2555    int chars_used_by_number = 0;
2556 +  int mb_len = 1;
2557 +#if HAVE_MBRTOWC
2558 +  if (MB_CUR_MAX > 1)
2559 +    mb_len = MB_LEN_MAX;
2560 +#endif
2561
2562    lines_per_body = lines_per_page - lines_per_header - lines_per_footer;
2563    if (lines_per_body <= 0)
2564 @@ -1228,7 +1352,7 @@ init_parameters (int number_of_files)
2565            else
2566              col_sep_string = column_separator;
2567
2568 -          col_sep_length = 1;
2569 +          col_sep_length = col_sep_width = 1;
2570            use_col_separator = true;
2571          }
2572        /* It's rather pointless to define a TAB separator with column
2573 @@ -1260,11 +1384,11 @@ init_parameters (int number_of_files)
2574               + TAB_WIDTH (chars_per_input_tab, chars_per_number);   */
2575
2576        /* Estimate chars_per_text without any margin and keep it constant. */
2577 -      if (number_separator == '\t')
2578 +      if (number_separator[0] == '\t')
2579          number_width = (chars_per_number
2580                          + TAB_WIDTH (chars_per_default_tab, chars_per_number));
2581        else
2582 -        number_width = chars_per_number + 1;
2583 +        number_width = chars_per_number + number_separator_width;
2584
2585        /* The number is part of the column width unless we are
2586           printing files in parallel. */
2587 @@ -1273,7 +1397,7 @@ init_parameters (int number_of_files)
2588      }
2589
2590    int sep_chars, useful_chars;
2591 -  if (INT_MULTIPLY_WRAPV (columns - 1, col_sep_length, &sep_chars))
2592 +  if (INT_MULTIPLY_WRAPV (columns - 1, col_sep_width, &sep_chars))
2593      sep_chars = INT_MAX;
2594    if (INT_SUBTRACT_WRAPV (chars_per_line - chars_used_by_number, sep_chars,
2595                            &useful_chars))
2596 @@ -1296,7 +1420,7 @@ init_parameters (int number_of_files)
2597       We've to use 8 as the lower limit, if we use chars_per_default_tab = 8
2598       to expand a tab which is not an input_tab-char. */
2599    free (clump_buff);
2600 -  clump_buff = xmalloc (MAX (8, chars_per_input_tab));
2601 +  clump_buff = xmalloc (mb_len * MAX (8, chars_per_input_tab));
2602  }
2603
2604  /* Open the necessary files,
2605 @@ -1402,7 +1526,7 @@ init_funcs (void)
2606
2607    /* Enlarge p->start_position of first column to use the same form of
2608       padding_not_printed with all columns. */
2609 -  h = h + col_sep_length;
2610 +  h = h + col_sep_width;
2611
2612    /* This loop takes care of all but the rightmost column. */
2613
2614 @@ -1436,7 +1560,7 @@ init_funcs (void)
2615          }
2616        else
2617          {
2618 -          h = h_next + col_sep_length;
2619 +          h = h_next + col_sep_width;
2620            h_next = h + chars_per_column;
2621          }
2622      }
2623 @@ -1733,9 +1857,9 @@ static void
2624  align_column (COLUMN *p)
2625  {
2626    padding_not_printed = p->start_position;
2627 -  if (col_sep_length < padding_not_printed)
2628 +  if (col_sep_width < padding_not_printed)
2629      {
2630 -      pad_across_to (padding_not_printed - col_sep_length);
2631 +      pad_across_to (padding_not_printed - col_sep_width);
2632        padding_not_printed = ANYWHERE;
2633      }
2634
2635 @@ -2010,13 +2134,13 @@ store_char (char c)
2636        /* May be too generous. */
2637        buff = X2REALLOC (buff, &buff_allocated);
2638      }
2639 -  buff[buff_current++] = c;
2640 +  buff[buff_current++] = (unsigned char) c;
2641  }
2642
2643  static void
2644  add_line_number (COLUMN *p)
2645  {
2646 -  int i;
2647 +  int i, j;
2648    char *s;
2649    int num_width;
2650
2651 @@ -2033,22 +2157,24 @@ add_line_number (COLUMN *p)
2652        /* Tabification is assumed for multiple columns, also for n-separators,
2653           but 'default n-separator = TAB' hasn't been given priority over
2654           equal column_width also specified by POSIX. */
2655 -      if (number_separator == '\t')
2656 +      if (number_separator[0] == '\t')
2657          {
2658            i = number_width - chars_per_number;
2659            while (i-- > 0)
2660              (p->char_func) (' ');
2661          }
2662        else
2663 -        (p->char_func) (number_separator);
2664 +        for (j = 0; j < number_separator_length; j++)
2665 +          (p->char_func) (number_separator[j]);
2666      }
2667    else
2668      /* To comply with POSIX, we avoid any expansion of default TAB
2669         separator with a single column output. No column_width requirement
2670         has to be considered. */
2671      {
2672 -      (p->char_func) (number_separator);
2673 -      if (number_separator == '\t')
2674 +      for (j = 0; j < number_separator_length; j++)
2675 +        (p->char_func) (number_separator[j]);
2676 +      if (number_separator[0] == '\t')
2677          output_position = POS_AFTER_TAB (chars_per_output_tab,
2678                            output_position);
2679      }
2680 @@ -2207,7 +2333,7 @@ print_white_space (void)
2681    while (goal - h_old > 1
2682           && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2683      {
2684 -      putchar (output_tab_char);
2685 +      fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
2686        h_old = h_new;
2687      }
2688    while (++h_old <= goal)
2689 @@ -2227,6 +2353,7 @@ print_sep_string (void)
2690  {
2691    char const *s = col_sep_string;
2692    int l = col_sep_length;
2693 +  int not_space_flag;
2694
2695    if (separators_not_printed <= 0)
2696      {
2697 @@ -2238,6 +2365,7 @@ print_sep_string (void)
2698      {
2699        for (; separators_not_printed > 0; --separators_not_printed)
2700          {
2701 +          not_space_flag = 0;
2702            while (l-- > 0)
2703              {
2704                /* 3 types of sep_strings: spaces only, spaces and chars,
2705 @@ -2251,12 +2379,15 @@ print_sep_string (void)
2706                  }
2707                else
2708                  {
2709 +                  not_space_flag = 1;
2710                    if (spaces_not_printed > 0)
2711                      print_white_space ();
2712                    putchar (*s++);
2713 -                  ++output_position;
2714                  }
2715              }
2716 +          if (not_space_flag)
2717 +            output_position += col_sep_width;
2718 +
2719            /* sep_string ends with some spaces */
2720            if (spaces_not_printed > 0)
2721              print_white_space ();
2722 @@ -2284,7 +2415,7 @@ print_clump (COLUMN *p, int n, char *clump)
2723     required number of tabs and spaces. */
2724
2725  static void
2726 -print_char (char c)
2727 +print_char_single (char c)
2728  {
2729    if (tabify_output)
2730      {
2731 @@ -2308,6 +2439,74 @@ print_char (char c)
2732    putchar (c);
2733  }
2734
2735 +#ifdef HAVE_MBRTOWC
2736 +static void
2737 +print_char_multi (char c)
2738 +{
2739 +  static size_t mbc_pos = 0;              /* Number of bytes queued in MBC.  */
2740 +  static char mbc[MB_LEN_MAX] = {'\0'};   /* Bytes received but not yet decoded.  */
2741 +  static mbstate_t state = {'\0'};
2742 +  mbstate_t state_bak;
2743 +  wchar_t wc;
2744 +  size_t mblength;
2745 +  int width;
2746 +  /* Multibyte variant of print_char_single: when tabifying, queue C and decode it with mbrtowc.  */
2747 +  if (tabify_output)
2748 +    {
2749 +      state_bak = state;
2750 +      mbc[mbc_pos++] = c;
2751 +      mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2752 +
2753 +      while (mbc_pos > 0)
2754 +        {
2755 +          switch (mblength)
2756 +            {
2757 +            case (size_t)-2:  /* Incomplete sequence: keep the bytes and wait for more.  */
2758 +              state = state_bak;
2759 +              return;
2760 +
2761 +            case (size_t)-1:  /* Invalid sequence: emit the first queued byte as one column.  */
2762 +              state = state_bak;
2763 +              ++output_position;
2764 +              putchar (mbc[0]);
2765 +              memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
2766 +              --mbc_pos;
2767 +              break;
2768 +
2769 +            case 0:  /* mbrtowc decoded a null character: treat it as one byte long.  */
2770 +              mblength = 1;
2771 +              /* Fall through */
2772 +            default:
2773 +              if (wc == L' ')
2774 +                {
2775 +                  memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2776 +                  --mbc_pos;
2777 +                  ++spaces_not_printed;  /* Spaces are counted here, not written.  */
2778 +                  return;
2779 +                }
2780 +              else if (spaces_not_printed > 0)
2781 +                print_white_space ();
2782 +
2783 +              /* Nonprintables are assumed to have width 0, except L'\b'. */
2784 +              if ((width = wcwidth (wc)) < 1)
2785 +                {
2786 +                  if (wc == L'\b')
2787 +                    --output_position;
2788 +                }
2789 +              else
2790 +                output_position += width;
2791 +
2792 +              fwrite (mbc, sizeof(char), mblength, stdout);
2793 +              memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2794 +              mbc_pos -= mblength;
2795 +            }
2796 +        }
2797 +      return;
2798 +    }
2799 +  putchar (c);
2800 +}
2801 +#endif
2802 +
2803  /* Skip to page PAGE before printing.
2804     PAGE may be larger than total number of pages. */
2805
2806 @@ -2485,9 +2684,9 @@ read_line (COLUMN *p)
2807            align_empty_cols = false;
2808          }
2809
2810 -      if (col_sep_length < padding_not_printed)
2811 +      if (col_sep_width < padding_not_printed)
2812          {
2813 -          pad_across_to (padding_not_printed - col_sep_length);
2814 +          pad_across_to (padding_not_printed - col_sep_width);
2815            padding_not_printed = ANYWHERE;
2816          }
2817
2818 @@ -2556,7 +2755,7 @@ print_stored (COLUMN *p)
2819    COLUMN *q;
2820
2821    int line = p->current_line++;
2822 -  char *first = &buff[line_vector[line]];
2823 +  unsigned char *first = &buff[line_vector[line]];
2824    /* FIXME
2825       UMR: Uninitialized memory read:
2826       * This is occurring while in:
2827 @@ -2568,7 +2767,7 @@ print_stored (COLUMN *p)
2828       xmalloc        [xmalloc.c:94]
2829       init_store_cols [pr.c:1648]
2830       */
2831 -  char *last = &buff[line_vector[line + 1]];
2832 +  unsigned char *last = &buff[line_vector[line + 1]];
2833
2834    pad_vertically = true;
2835
2836 @@ -2588,9 +2787,9 @@ print_stored (COLUMN *p)
2837          }
2838      }
2839
2840 -  if (col_sep_length < padding_not_printed)
2841 +  if (col_sep_width < padding_not_printed)
2842      {
2843 -      pad_across_to (padding_not_printed - col_sep_length);
2844 +      pad_across_to (padding_not_printed - col_sep_width);
2845        padding_not_printed = ANYWHERE;
2846      }
2847
2848 @@ -2603,8 +2802,8 @@ print_stored (COLUMN *p)
2849    if (spaces_not_printed == 0)
2850      {
2851        output_position = p->start_position + end_vector[line];
2852 -      if (p->start_position - col_sep_length == chars_per_margin)
2853 -        output_position -= col_sep_length;
2854 +      if (p->start_position - col_sep_width == chars_per_margin)
2855 +        output_position -= col_sep_width;
2856      }
2857
2858    return true;
2859 @@ -2623,7 +2822,7 @@ print_stored (COLUMN *p)
2860     number of characters is 1.) */
2861
2862  static int
2863 -char_to_clump (char c)
2864 +char_to_clump_single (char c)
2865  {
2866    unsigned char uc = c;
2867    char *s = clump_buff;
2868 @@ -2633,10 +2832,10 @@ char_to_clump (char c)
2869    int chars;
2870    int chars_per_c = 8;
2871
2872 -  if (c == input_tab_char)
2873 +  if (c == input_tab_char[0])
2874      chars_per_c = chars_per_input_tab;
2875
2876 -  if (c == input_tab_char || c == '\t')
2877 +  if (c == input_tab_char[0] || c == '\t')
2878      {
2879        width = TAB_WIDTH (chars_per_c, input_position);
2880
2881 @@ -2717,6 +2916,164 @@ char_to_clump (char c)
2882    return chars;
2883  }
2884
2885 +#ifdef HAVE_MBRTOWC
2886 +static int
2887 +char_to_clump_multi (char c)
2888 +{
2889 +  static size_t mbc_pos = 0;
2890 +  static char mbc[MB_LEN_MAX] = {'\0'};
2891 +  static mbstate_t state = {'\0'};
2892 +  mbstate_t state_bak;
2893 +  wchar_t wc;
2894 +  size_t mblength;
2895 +  int wc_width;
2896 +  register char *s = clump_buff;
2897 +  register int i, j;
2898 +  char esc_buff[4];
2899 +  int width;
2900 +  int chars;
2901 +  int chars_per_c = 8;
2902 +  /* Queue C in MBC, write the printable form of each decoded character to clump_buff, return the byte count.  */
2903 +  state_bak = state;
2904 +  mbc[mbc_pos++] = c;
2905 +  mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2906 +
2907 +  width = 0;
2908 +  chars = 0;
2909 +  while (mbc_pos > 0)
2910 +    {
2911 +      switch (mblength)
2912 +        {
2913 +        case (size_t)-2:
2914 +          state = state_bak;
2915 +          goto done;  /* Incomplete: keep the queued bytes, report what was emitted so far.  */
2916 +        case (size_t)-1:
2917 +          state = state_bak;
2918 +          mblength = 1;  /* Invalid: consume exactly one byte, then decode the rest again.  */
2919 +          if (use_esc_sequence || use_cntrl_prefix)
2920 +            {
2921 +              width += 4;
2922 +              chars += 4;
2923 +              *s++ = '\\';
2924 +              sprintf (esc_buff, "%03o", (unsigned char) mbc[0]);
2925 +              for (i = 0; i <= 2; ++i)
2926 +                *s++ = (int) esc_buff[i];
2927 +            }
2928 +          else
2929 +            {
2930 +              width += 1;
2931 +              chars += 1;
2932 +              *s++ = mbc[0];
2933 +            }
2934 +          break;
2935 +
2936 +        case 0:
2937 +          mblength = 1;
2938 +                /* Fall through */
2939 +        default:
2940 +          if (memcmp (mbc, input_tab_char, mblength) == 0)
2941 +            chars_per_c = chars_per_input_tab;
2942 +
2943 +          if (memcmp (mbc, input_tab_char, mblength) == 0 || mbc[0] == '\t')
2944 +            {
2945 +              int width_inc = TAB_WIDTH (chars_per_c, input_position);
2946 +              width += width_inc;
2947 +
2948 +              if (untabify_input)
2949 +                {
2950 +                  for (i = width_inc; i; --i)
2951 +                    *s++ = ' ';
2952 +                  chars += width_inc;
2953 +                }
2954 +              else
2955 +                {
2956 +                  for (i = 0; i <  mblength; i++)
2957 +                    *s++ = mbc[i];
2958 +                  chars += mblength;
2959 +                }
2960 +            }
2961 +          else if ((wc_width = wcwidth (wc)) < 1)
2962 +            {
2963 +              if (use_esc_sequence)
2964 +                {
2965 +                  for (i = 0; i < mblength; i++)
2966 +                    {
2967 +                      width += 4;
2968 +                      chars += 4;
2969 +                      *s++ = '\\';
2970 +                      sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
2971 +                      for (j = 0; j <= 2; ++j)
2972 +                        *s++ = (int) esc_buff[j];
2973 +                    }
2974 +                }
2975 +              else if (use_cntrl_prefix)
2976 +                {
2977 +                  if (wc < 0200)
2978 +                    {
2979 +                      width += 2;
2980 +                      chars += 2;
2981 +                      *s++ = '^';
2982 +                      *s++ = wc ^ 0100;
2983 +                    }
2984 +                  else
2985 +                    {
2986 +                      for (i = 0; i < mblength; i++)
2987 +                        {
2988 +                          width += 4;
2989 +                          chars += 4;
2990 +                          *s++ = '\\';
2991 +                          sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
2992 +                          for (j = 0; j <= 2; ++j)
2993 +                            *s++ = (int) esc_buff[j];
2994 +                        }
2995 +                    }
2996 +                }
2997 +              else if (wc == L'\b')
2998 +                {
2999 +                  width += -1;
3000 +                  chars += 1;
3001 +                  *s++ = mbc[0];
3002 +                }
3003 +              else
3004 +                {
3005 +                  width += 0;
3006 +                  chars += mblength;
3007 +                  for (i = 0; i < mblength; i++)
3008 +                    *s++ = mbc[i];
3009 +                }
3010 +            }
3011 +          else
3012 +            {
3013 +              width += wc_width;
3014 +              chars += mblength;
3015 +              for (i = 0; i < mblength; i++)
3016 +                *s++ = mbc[i];
3017 +            }
3018 +        }
3019 +      memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
3020 +      mbc_pos -= mblength;
3021 +      if (mbc_pos > 0)
3022 +        {
3023 +          state_bak = state;  /* Bytes remain: decode them afresh so WC is never stale.  */
3024 +          mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
3025 +        }
3026 +    }
3027 + done:
3028 +  /* Too many backspaces must put us in position 0 -- never negative. */
3029 +  if (width < 0 && input_position == 0)
3030 +    {
3031 +      chars = 0;
3032 +      input_position = 0;
3033 +    }
3034 +  else if (width < 0 && input_position <= -width)
3035 +    input_position = 0;
3036 +  else
3037 +   input_position += width;
3038 +
3039 +  return chars;
3040 +}
3041 +#endif
3042 +
3043  /* We've just printed some files and need to clean up things before
3044     looking for more options and printing the next batch of files.
3045
3046 diff --git a/src/sort.c b/src/sort.c
3047 index 3b775d6..a0ba243 100644
3048 --- a/src/sort.c
3049 +++ b/src/sort.c
3050 @@ -29,6 +29,14 @@
3051  #include <sys/wait.h>
3052  #include <signal.h>
3053  #include <assert.h>
3054 +#if HAVE_WCHAR_H
3055 +# include <wchar.h>
3056 +#endif
3057 +/* Get isw* functions. */
3058 +#if HAVE_WCTYPE_H
3059 +# include <wctype.h>
3060 +#endif
3061 +
3062  #include "system.h"
3063  #include "argmatch.h"
3064  #include "die.h"
3065 @@ -159,14 +167,39 @@ static int thousands_sep;
3066  /* We currently ignore multi-byte grouping chars.  */
3067  static bool thousands_sep_ignored;
3068
3069 +/* True if -f is specified.  */
3070 +static bool folding;
3071 +
3072  /* Nonzero if the corresponding locales are hard.  */
3073  static bool hard_LC_COLLATE;
3074 -#if HAVE_NL_LANGINFO
3075 +#if HAVE_LANGINFO_CODESET
3076  static bool hard_LC_TIME;
3077  #endif
3078
3079  #define NONZERO(x) ((x) != 0)
3080
3081 +/* Set MBLENGTH to the byte length of the character at PTR (bounded by LIM); 1 if invalid, incomplete or NUL.  */
3082 +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE)                  \
3083 +  do                                                                    \
3084 +    {                                                                   \
3085 +      wchar_t wc;                                                       \
3086 +      mbstate_t state_bak;                                              \
3087 +                                                                        \
3088 +      state_bak = (STATE);                                              \
3089 +      (MBLENGTH) = mbrtowc (&wc, (PTR), (LIM) - (PTR), &(STATE));       \
3090 +                                                                        \
3091 +      switch (MBLENGTH)                                                 \
3092 +        {                                                               \
3093 +        case (size_t)-1:                                                \
3094 +        case (size_t)-2:                                                \
3095 +          (STATE) = state_bak;                                          \
3096 +          /* Fall through. */                                           \
3097 +        case 0:                                                         \
3098 +          (MBLENGTH) = 1;                                               \
3099 +        }                                                               \
3100 +    }                                                                   \
3101 +  while (0)
3102 +
3103  /* The kind of blanks for '-b' to skip in various options. */
3104  enum blanktype { bl_start, bl_end, bl_both };
3105
3106 @@ -343,13 +376,11 @@ static bool stable;
3107  /* An int value outside char range.  */
3108  enum { NON_CHAR = CHAR_MAX + 1 };
3109
3110 -/* If TAB has this value, blanks separate fields.  */
3111 -enum { TAB_DEFAULT = CHAR_MAX + 1 };
3112 -
3113 -/* Tab character separating fields.  If TAB_DEFAULT, then fields are
3114 +/* Tab character separating fields.  If tab_length is 0, then fields are
3115     separated by the empty string between a non-blank character and a blank
3116     character. */
3117 -static int tab = TAB_DEFAULT;
3118 +static char tab[MB_LEN_MAX + 1];
3119 +static size_t tab_length = 0;
3120
3121  /* Flag to remove consecutive duplicate lines from the output.
3122     Only the last of a sequence of equal lines will be output. */
3123 @@ -805,6 +836,46 @@ reap_all (void)
3124      reap (-1);
3125  }
3126
3127 +/* Dispatch pointers; several have _uni and _mb implementations in this file (assignment is not shown here).  */
3128 +static void
3129 +(*inittables) (void);
3130 +static char *
3131 +(*begfield) (const struct line*, const struct keyfield *);
3132 +static char *
3133 +(*limfield) (const struct line*, const struct keyfield *);
3134 +static void
3135 +(*skipblanks) (char **ptr, char *lim);
3136 +static int
3137 +(*getmonth) (char const *, size_t, char **);
3138 +static int
3139 +(*keycompare) (const struct line *, const struct line *);
3140 +static int
3141 +(*numcompare) (const char *, const char *);
3142 +
3143 +/* Return nonzero if the multibyte character at STR (at most LEN bytes) is a blank or a newline.
3144 +   Store its byte length in *LENGTH; that is 1 for an invalid or incomplete sequence and for NUL.  */
3145 +#if HAVE_MBRTOWC
3146 +static int
3147 +ismbblank (const char *str, size_t len, size_t *length)
3148 +{
3149 +  size_t mblength;
3150 +  wchar_t wc;
3151 +  mbstate_t state;
3152 +
3153 +  memset (&state, '\0', sizeof(mbstate_t));  /* Always decode from the initial shift state.  */
3154 +  mblength = mbrtowc (&wc, str, len, &state);
3155 +
3156 +  if (mblength == (size_t)-1 || mblength == (size_t)-2)
3157 +    {
3158 +      *length = 1;  /* Undecodable: let the caller step over one byte; it is not a blank.  */
3159 +      return 0;
3160 +    }
3161 +
3162 +  *length = (mblength < 1) ? 1 : mblength;
3163 +  return iswblank (wc) || wc == '\n';
3164 +}
3165 +#endif
3166 +
3167  /* Clean up any remaining temporary files.  */
3168
3169  static void
3170 @@ -1272,7 +1343,7 @@ zaptemp (char const *name)
3171    free (node);
3172  }
3173
3174 -#if HAVE_NL_LANGINFO
3175 +#if HAVE_LANGINFO_CODESET
3176
3177  static int
3178  struct_month_cmp (void const *m1, void const *m2)
3179 @@ -1287,7 +1358,7 @@ struct_month_cmp (void const *m1, void const *m2)
3180  /* Initialize the character class tables. */
3181
3182  static void
3183 -inittables (void)
3184 +inittables_uni (void)
3185  {
3186    size_t i;
3187
3188 @@ -1299,7 +1370,7 @@ inittables (void)
3189        fold_toupper[i] = toupper (i);
3190      }
3191
3192 -#if HAVE_NL_LANGINFO
3193 +#if HAVE_LANGINFO_CODESET
3194    /* If we're not in the "C" locale, read different names for months.  */
3195    if (hard_LC_TIME)
3196      {
3197 @@ -1381,6 +1452,84 @@ specify_nmerge (int oi, char c, char const *s)
3198      xstrtol_fatal (e, oi, c, long_options, s);
3199  }
3200
3201 +#if HAVE_MBRTOWC
3202 +static void
3203 +inittables_mb (void)
3204 +{
3205 +  int i, j, k, l;
3206 +  char *name, *s, *lc_time, *lc_ctype;
3207 +  size_t s_len, mblength;
3208 +  char mbc[MB_LEN_MAX];
3209 +  wchar_t wc, pwc;
3210 +  mbstate_t state_mb, state_wc;
3211 +  /* Fill monthtab with the locale's abbreviated month names, leading blanks dropped and upper-cased, then sort it.  */
3212 +  lc_time = setlocale (LC_TIME, "");
3213 +  if (lc_time)
3214 +    lc_time = xstrdup (lc_time);
3215 +
3216 +  lc_ctype = setlocale (LC_CTYPE, "");
3217 +  if (lc_ctype)
3218 +    lc_ctype = xstrdup (lc_ctype);
3219 +
3220 +  if (lc_time && lc_ctype)
3221 +    /* temporarily set LC_CTYPE to match LC_TIME, so that we can convert
3222 +     * the names of months to upper case */
3223 +    setlocale (LC_CTYPE, lc_time);
3224 +
3225 +  for (i = 0; i < MONTHS_PER_YEAR; i++)
3226 +    {
3227 +      s = (char *) nl_langinfo (ABMON_1 + i);
3228 +      s_len = strlen (s);
3229 +      monthtab[i].name = name = xmalloc (s_len * MB_CUR_MAX + 1);  /* Upper case may need more bytes.  */
3230 +      monthtab[i].val = i + 1;
3231 +
3232 +      memset (&state_mb, '\0', sizeof (mbstate_t));
3233 +      memset (&state_wc, '\0', sizeof (mbstate_t));
3234 +
3235 +      for (j = 0; j < s_len;)
3236 +        {
3237 +          if (!ismbblank (s + j, s_len - j, &mblength))
3238 +            break;
3239 +          j += mblength;
3240 +        }
3241 +
3242 +      for (k = 0; j < s_len;)
3243 +        {
3244 +          mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
3245 +          assert (mblength != (size_t)-1 && mblength != (size_t)-2);
3246 +          if (mblength == 0)
3247 +            break;
3248 +
3249 +          pwc = towupper (wc);
3250 +          if (pwc == wc)
3251 +            {
3252 +              memcpy (mbc, s + j, mblength);
3253 +              j += mblength;
3254 +            }
3255 +          else
3256 +            {
3257 +              j += mblength;
3258 +              mblength = wcrtomb (mbc, pwc, &state_wc);
3259 +              assert (mblength != (size_t)0 && mblength != (size_t)-1);
3260 +            }
3261 +
3262 +          for (l = 0; l < mblength; l++)
3263 +            name[k++] = mbc[l];
3264 +        }
3265 +      name[k] = '\0';
3266 +    }
3267 +  qsort ((void *) monthtab, MONTHS_PER_YEAR,
3268 +      sizeof (struct month), struct_month_cmp);
3269 +
3270 +  if (lc_time && lc_ctype)
3271 +    /* restore the original locales */
3272 +    setlocale (LC_CTYPE, lc_ctype);
3273 +
3274 +  free (lc_ctype);
3275 +  free (lc_time);
3276 +}
3277 +#endif
3278 +
3279  /* Specify the amount of main memory to use when sorting.  */
3280  static void
3281  specify_sort_size (int oi, char c, char const *s)
3282 @@ -1612,7 +1761,7 @@ buffer_linelim (struct buffer const *buf)
3283     by KEY in LINE. */
3284
3285  static char *
3286 -begfield (struct line const *line, struct keyfield const *key)
3287 +begfield_uni (const struct line *line, const struct keyfield *key)
3288  {
3289    char *ptr = line->text, *lim = ptr + line->length - 1;
3290    size_t sword = key->sword;
3291 @@ -1621,10 +1770,10 @@ begfield (struct line const *line, struct keyfield const *key)
3292    /* The leading field separator itself is included in a field when -t
3293       is absent.  */
3294
3295 -  if (tab != TAB_DEFAULT)
3296 +  if (tab_length)
3297      while (ptr < lim && sword--)
3298        {
3299 -        while (ptr < lim && *ptr != tab)
3300 +        while (ptr < lim && *ptr != tab[0])
3301            ++ptr;
3302          if (ptr < lim)
3303            ++ptr;
3304 @@ -1650,12 +1799,71 @@ begfield (struct line const *line, struct keyfield const *key)
3305    return ptr;
3306  }
3307
3308 +#if HAVE_MBRTOWC
3309 +static char *
3310 +begfield_mb (const struct line *line, const struct keyfield *key)
3311 +{
3312 +  size_t i;
3313 +  char *ptr = line->text, *lim = ptr + line->length - 1;
3314 +  size_t sword = key->sword;
3315 +  size_t schar = key->schar;
3316 +  size_t mblength;
3317 +  mbstate_t state;
3318 +  /* Multibyte begfield: return where the field selected by KEY starts in LINE, stepping by whole characters.  */
3319 +  memset (&state, '\0', sizeof(mbstate_t));
3320 +
3321 +  if (tab_length)
3322 +    while (ptr < lim && sword--)
3323 +      {
3324 +        while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
3325 +          {
3326 +            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3327 +            ptr += mblength;
3328 +          }
3329 +        if (ptr < lim)
3330 +          {
3331 +            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3332 +            ptr += mblength;
3333 +          }
3334 +      }
3335 +  else
3336 +    while (ptr < lim && sword--)
3337 +      {
3338 +        while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
3339 +          ptr += mblength;
3340 +        if (ptr < lim)
3341 +          {
3342 +            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3343 +            ptr += mblength;
3344 +          }
3345 +        while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
3346 +          ptr += mblength;
3347 +      }
3348 +
3349 +  if (key->skipsblanks)
3350 +    while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
3351 +      ptr += mblength;
3352 +
3353 +  for (i = 0; i < schar; i++)  /* Advance SCHAR characters, but never past LIM.  */
3354 +    {
3355 +      GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3356 +
3357 +      if (ptr + mblength > lim)
3358 +        break;
3359 +      else
3360 +        ptr += mblength;
3361 +    }
3362 +
3363 +  return ptr;
3364 +}
3365 +#endif
3366 +
3367  /* Return the limit of (a pointer to the first character after) the field
3368     in LINE specified by KEY. */
3369
3370  ATTRIBUTE_PURE
3371  static char *
3372 -limfield (struct line const *line, struct keyfield const *key)
3373 +limfield_uni (struct line const *line, struct keyfield const *key)
3374  {
3375    char *ptr = line->text, *lim = ptr + line->length - 1;
3376    size_t eword = key->eword, echar = key->echar;
3377 @@ -1670,10 +1878,10 @@ limfield (struct line const *line, struct keyfield const *key)
3378       'beginning' is the first character following the delimiting TAB.
3379       Otherwise, leave PTR pointing at the first 'blank' character after
3380       the preceding field.  */
3381 -  if (tab != TAB_DEFAULT)
3382 +  if (tab_length)
3383      while (ptr < lim && eword--)
3384        {
3385 -        while (ptr < lim && *ptr != tab)
3386 +        while (ptr < lim && *ptr != tab[0])
3387            ++ptr;
3388          if (ptr < lim && (eword || echar))
3389            ++ptr;
3390 @@ -1719,10 +1927,10 @@ limfield (struct line const *line, struct keyfield const *key)
3391       */
3392
3393    /* Make LIM point to the end of (one byte past) the current field.  */
3394 -  if (tab != TAB_DEFAULT)
3395 +  if (tab_length)
3396      {
3397        char *newlim;
3398 -      newlim = memchr (ptr, tab, lim - ptr);
3399 +      newlim = memchr (ptr, tab[0], lim - ptr);
3400        if (newlim)
3401          lim = newlim;
3402      }
3403 @@ -1753,6 +1961,130 @@ limfield (struct line const *line, struct keyfield const *key)
3404    return ptr;
3405  }
3406
3407 +#if HAVE_MBRTOWC
3408 +static char * _GL_ATTRIBUTE_PURE
3409 +limfield_mb (const struct line *line, const struct keyfield *key)
3410 +{
3411 +  char *ptr = line->text, *lim = ptr + line->length - 1;
3412 +  size_t eword = key->eword, echar = key->echar;
3413 +  size_t i;
3414 +  size_t mblength;
3415 +  mbstate_t state;
3416 +  /* Multibyte limfield: return the limit of the field selected by KEY in LINE, stepping by whole characters.  */
3417 +  if (echar == 0)
3418 +    eword++; /* skip all of end field. */
3419 +
3420 +  memset (&state, '\0', sizeof(mbstate_t));
3421 +
3422 +  if (tab_length)
3423 +    while (ptr < lim && eword--)
3424 +      {
3425 +        while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
3426 +          {
3427 +            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3428 +            ptr += mblength;
3429 +          }
3430 +        if (ptr < lim && (eword || echar))
3431 +          {
3432 +            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3433 +            ptr += mblength;
3434 +          }
3435 +      }
3436 +  else
3437 +    while (ptr < lim && eword--)
3438 +      {
3439 +        while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
3440 +          ptr += mblength;
3441 +        if (ptr < lim)
3442 +          {
3443 +            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3444 +            ptr += mblength;
3445 +          }
3446 +        while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
3447 +          ptr += mblength;
3448 +      }
3449 +
3450 +
3451 +# ifdef POSIX_UNSPECIFIED
3452 +  /* Make LIM point to the end of (one byte past) the current field.  */
3453 +  if (tab_length)
3454 +    {
3455 +      char *newlim, *p;
3456 +      newlim = NULL;
3457 +      for (p = ptr; p < lim;)
3458 +        {
3459 +          if (memcmp (p, tab, tab_length) == 0)
3460 +            {
3461 +              newlim = p;
3462 +              break;
3463 +            }
3464 +          GET_BYTELEN_OF_CHAR (lim, p, mblength, state);  /* Measure at the scan cursor P, not PTR.  */
3465 +          p += mblength;
3466 +        }
3467 +      if (newlim)
3468 +        lim = newlim;
3469 +    }
3470 +  else
3471 +    {
3472 +      char *newlim;
3473 +      newlim = ptr;
3474 +
3475 +      while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
3476 +        newlim += mblength;
3477 +      if (newlim < lim)
3478 +        {
3479 +          GET_BYTELEN_OF_CHAR (lim, newlim, mblength, state);  /* PTR itself must not move here.  */
3480 +          newlim += mblength;
3481 +        }
3482 +      while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
3483 +        newlim += mblength;
3484 +      lim = newlim;
3485 +    }
3486 +# endif
3487 +
3488 +  if (echar != 0)
3489 +  {
3490 +    /* If we're skipping leading blanks, don't start counting characters
3491 +       until after skipping past any leading blanks.  */
3492 +    if (key->skipeblanks)
3493 +      while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
3494 +        ptr += mblength;
3495 +
3496 +    memset (&state, '\0', sizeof(mbstate_t));
3497 +
3498 +    /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
3499 +    for (i = 0; i < echar; i++)
3500 +     {
3501 +        GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
3502 +
3503 +        if (ptr + mblength > lim)
3504 +          break;
3505 +        else
3506 +          ptr += mblength;
3507 +      }
3508 +  }
3509 +
3510 +  return ptr;
3511 +}
3512 +#endif
3513 +/* Advance *PTR past bytes flagged in the blanks[] table, stopping at LIM.  */
3514 +static void
3515 +skipblanks_uni (char **ptr, char *lim)
3516 +{
3517 +  while (*ptr < lim && blanks[to_uchar (**ptr)])
3518 +    ++(*ptr);
3519 +}
3520 +/* Multibyte skipblanks: advance *PTR past characters that ismbblank accepts, stopping at LIM.  */
3521 +#if HAVE_MBRTOWC
3522 +static void
3523 +skipblanks_mb (char **ptr, char *lim)
3524 +{
3525 +  size_t mblength;  /* Byte length of the character just tested, set by ismbblank.  */
3526 +  while (*ptr < lim && ismbblank (*ptr, lim - *ptr, &mblength))
3527 +    (*ptr) += mblength;
3528 +}
3529 +#endif
3530 +
3531  /* Fill BUF reading from FP, moving buf->left bytes from the end
3532     of buf->buf to the beginning first.  If EOF is reached and the
3533     file wasn't terminated by a newline, supply one.  Set up BUF's line
3534 @@ -1839,8 +2171,22 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
3535                    else
3536                      {
3537                        if (key->skipsblanks)
3538 -                        while (blanks[to_uchar (*line_start)])
3539 -                          line_start++;
3540 +                        {
3541 +#if HAVE_MBRTOWC
3542 +                          if (MB_CUR_MAX > 1)
3543 +                            {
3544 +                              size_t mblength;
3545 +                              while (line_start < line->keylim &&
3546 +                                     ismbblank (line_start,
3547 +                                                line->keylim - line_start,
3548 +                                                &mblength))
3549 +                                line_start += mblength;
3550 +                            }
3551 +                          else
3552 +#endif
3553 +                          while (blanks[to_uchar (*line_start)])
3554 +                            line_start++;
3555 +                        }
3556                        line->keybeg = line_start;
3557                      }
3558                  }
3559 @@ -1976,12 +2322,10 @@ find_unit_order (char const *number)
3560
3561  ATTRIBUTE_PURE
3562  static int
3563 -human_numcompare (char const *a, char const *b)
3564 +human_numcompare (char *a, char *b)
3565  {
3566 -  while (blanks[to_uchar (*a)])
3567 -    a++;
3568 -  while (blanks[to_uchar (*b)])
3569 -    b++;
3570 +  skipblanks(&a, a + strlen(a));
3571 +  skipblanks(&b, b + strlen(b));
3572
3573    int diff = find_unit_order (a) - find_unit_order (b);
3574    return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
3575 @@ -1993,7 +2337,7 @@ human_numcompare (char const *a, char const *b)
3576
3577  ATTRIBUTE_PURE
3578  static int
3579 -numcompare (char const *a, char const *b)
3580 +numcompare_uni (const char *a, const char *b)
3581  {
3582    while (blanks[to_uchar (*a)])
3583      a++;
3584 @@ -2003,6 +2347,25 @@ numcompare (char const *a, char const *b)
3585    return strnumcmp (a, b, decimal_point, thousands_sep);
3586  }
3587
3588 +#if HAVE_MBRTOWC
3589 +static int
3590 +numcompare_mb (const char *a, const char *b)
3591 +{
3592 +  size_t mblength, len;
3593 +  /* Skip leading multibyte blanks in A and B (LEN tracks the bytes left), then compare as numbers.  */
3594 +  for (len = strlen (a);
3595 +       *a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength);
3596 +       a += mblength, len -= mblength)
3597 +    continue;
3598 +  for (len = strlen (b);
3599 +       *b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength);
3600 +       b += mblength, len -= mblength)
3601 +    continue;
3602 +
3603 +  return strnumcmp (a, b, decimal_point, thousands_sep);
3604 +}
3605 +#endif /* HAVE_MBRTOWC */
3606 +
3607  /* Work around a problem whereby the long double value returned by glibc's
3608     strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
3609     A and B before calling strtold.  FIXME: remove this function if
3610 @@ -2053,7 +2416,7 @@ general_numcompare (char const *sa, char const *sb)
3611     Return 0 if the name in S is not recognized.  */
3612
3613  static int
3614 -getmonth (char const *month, char **ea)
3615 +getmonth_uni (char const *month, size_t len, char **ea)
3616  {
3617    size_t lo = 0;
3618    size_t hi = MONTHS_PER_YEAR;
3619 @@ -2329,15 +2692,14 @@ debug_key (struct line const *line, struct keyfield const *key)
3620            char saved = *lim;
3621            *lim = '\0';
3622
3623 -          while (blanks[to_uchar (*beg)])
3624 -            beg++;
3625 +          skipblanks (&beg, lim);
3626
3627            char *tighter_lim = beg;
3628
3629            if (lim < beg)
3630              tighter_lim = lim;
3631            else if (key->month)
3632 -            getmonth (beg, &tighter_lim);
3633 +            getmonth (beg, lim-beg, &tighter_lim);
3634            else if (key->general_numeric)
3635              ignore_value (strtold (beg, &tighter_lim));
3636            else if (key->numeric || key->human_numeric)
3637 @@ -2483,7 +2845,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
3638        /* Warn about significant leading blanks.  */
3639        bool implicit_skip = key_numeric (key) || key->month;
3640        bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y  */
3641 -      if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
3642 +      if (!zero_width && !gkey_only && !tab_length && !line_offset
3643            && ((!key->skipsblanks && !implicit_skip)
3644                || (!key->skipsblanks && key->schar)
3645                || (!key->skipeblanks && key->echar)))
3646 @@ -2531,9 +2893,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
3647    bool number_locale_warned = false;
3648    if (basic_numeric_field_span)
3649      {
3650 -      if (tab == TAB_DEFAULT
3651 -          ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))
3652 -          : tab == thousands_sep)
3653 +      if (tab_length
3654 +          ? tab[0] == thousands_sep
3655 +          : thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep))))
3656          {
3657            error (0, 0,
3658                   _("field separator %s is treated as a "
3659 @@ -2544,9 +2906,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
3660      }
3661    if (basic_numeric_field_span || general_numeric_field_span)
3662      {
3663 -      if (tab == TAB_DEFAULT
3664 -          ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))
3665 -          : tab == decimal_point)
3666 +      if (tab_length
3667 +          ? tab[0] == decimal_point
3668 +          : thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point))))
3669          {
3670            error (0, 0,
3671                   _("field separator %s is treated as a "
3672 @@ -2554,19 +2916,19 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
3673                   quote (((char []) {decimal_point, 0})));
3674            number_locale_warned = true;
3675          }
3676 -      else if (tab == '-')
3677 +      else if (tab_length && tab[0] == '-')
3678          {
3679            error (0, 0,
3680                   _("field separator %s is treated as a "
3681                     "minus sign in numbers"),
3682 -                 quote (((char []) {tab, 0})));
3683 +                 quote (((char []) {tab[0], 0})));
3684          }
3685 -      else if (general_numeric_field_span && tab == '+')
3686 +      else if (general_numeric_field_span && tab_length && tab[0] == '+')
3687          {
3688            error (0, 0,
3689                   _("field separator %s is treated as a "
3690                     "plus sign in numbers"),
3691 -                 quote (((char []) {tab, 0})));
3692 +                 quote (((char []) {tab[0], 0})));
3693          }
3694      }
3695
3696 @@ -2577,7 +2939,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
3697      {
3698        error (0, 0,
3699               _("%snumbers use %s as a decimal point in this locale"),
3700 -             tab == decimal_point ? "" : _("note "),
3701 +             (tab_length && tab[0] == decimal_point) ? "" : _("note "),
3702               quote (((char []) {decimal_point, 0})));
3703
3704      }
3705 @@ -2610,11 +2972,87 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
3706      error (0, 0, _("option '-r' only applies to last-resort comparison"));
3707  }
3708
3709 +#if HAVE_MBRTOWC
3710 +static int
3711 +getmonth_mb (const char *s, size_t len, char **ea)
3712 +{ /* Look up the month name at the start of S (LEN bytes); return its monthtab value, or 0.  */
3713 +  char *month;
3714 +  register size_t i;
3715 +  register int lo = 0, hi = MONTHS_PER_YEAR, result;
3716 +  char *tmp;
3717 +  size_t wclength, mblength;
3718 +  const char *pp;
3719 +  const wchar_t *wpp;
3720 +  wchar_t *month_wcs;
3721 +  mbstate_t state;
3722 +  /* Skip leading blanks, one multibyte character at a time.  */
3723 +  while (len > 0 && ismbblank (s, len, &mblength))
3724 +    {
3725 +      s += mblength;
3726 +      len -= mblength;
3727 +    }
3728 +  /* Nothing but blanks: no month.  */
3729 +  if (len == 0)
3730 +    return 0;
3731 +  /* LEN + 1 is used as an element count below; refuse if it would wrap.  */
3732 +  if (SIZE_MAX - len < 1)
3733 +    xalloc_die ();
3734 +
3735 +  month = (char *) xnmalloc (len + 1, MB_CUR_MAX);
3736 +  /* TMP is a NUL-terminated copy of S, so the conversion below stops after LEN bytes.  */
3737 +  pp = tmp = (char *) xnmalloc (len + 1, MB_CUR_MAX);
3738 +  memcpy (tmp, s, len);
3739 +  tmp[len] = '\0';
3740 +  wpp = month_wcs = (wchar_t *) xnmalloc (len + 1, sizeof (wchar_t));
3741 +  memset (&state, '\0', sizeof (mbstate_t));
3742 +  /* Decode to wide characters; a non-NULL PP afterwards is treated as a failed conversion.  */
3743 +  wclength = mbsrtowcs (month_wcs, &pp, len + 1, &state);
3744 +  if (wclength == (size_t)-1 || pp != NULL)
3745 +    error (SORT_FAILURE, 0, _("Invalid multibyte input %s."), quote(s));
3746 +  /* Upper-case the text and cut it at the first blank.  */
3747 +  for (i = 0; i < wclength; i++)
3748 +    {
3749 +      month_wcs[i] = towupper(month_wcs[i]);
3750 +      if (iswblank (month_wcs[i]))
3751 +        {
3752 +          month_wcs[i] = L'\0';
3753 +          break;
3754 +        }
3755 +    }
3756 +  /* Re-encode the upper-cased text as multibyte so it can be compared with strncmp.  */
3757 +  mblength = wcsrtombs (month, &wpp, (len + 1) * MB_CUR_MAX, &state);
3758 +  assert (mblength != (-1) && wpp == NULL);
3759 +  /* Binary search of monthtab, comparing only the first strlen (name) bytes of MONTH.  */
3760 +  do
3761 +    {
3762 +      int ix = (lo + hi) / 2;
3763 +
3764 +      if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
3765 +        hi = ix;
3766 +      else
3767 +        lo = ix;
3768 +    }
3769 +  while (hi - lo > 1);
3770 +
3771 +  result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
3772 +      ? monthtab[lo].val : 0);
3773 +  /* On a match, report the position just past the name, using the table entry's byte length.  */
3774 +  if (ea && result)
3775 +     *ea = (char*) s + strlen (monthtab[lo].name);
3776 +
3777 +  free (month);
3778 +  free (tmp);
3779 +  free (month_wcs);
3780 +
3781 +  return result;
3782 +}
3783 +#endif
3784 +
3785  /* Compare two lines A and B trying every key in sequence until there
3786     are no more keys or a difference is found. */
3787
3788  static int
3789 -keycompare (struct line const *a, struct line const *b)
3790 +keycompare_uni (const struct line *a, const struct line *b)
3791  {
3792    struct keyfield *key = keylist;
3793
3794 @@ -2699,7 +3137,7 @@ keycompare (struct line const *a, struct line const *b)
3795            else if (key->human_numeric)
3796              diff = human_numcompare (ta, tb);
3797            else if (key->month)
3798 -            diff = getmonth (ta, NULL) - getmonth (tb, NULL);
3799 +            diff = getmonth (ta, tlena, NULL) - getmonth (tb, tlenb, NULL);
3800            else if (key->random)
3801              diff = compare_random (ta, tlena, tb, tlenb);
3802            else if (key->version)
3803 @@ -2815,6 +3253,211 @@ keycompare (struct line const *a, struct line const *b)
3804    return key->reverse ? -diff : diff;
3805  }
3806
3807 +#if HAVE_MBRTOWC
3808 +static int
3809 +keycompare_mb (const struct line *a, const struct line *b)
3810 +{ /* Compare A and B key by key, starting at keylist; return the first nonzero difference.  */
3811 +  struct keyfield *key = keylist;
3812 +
3813 +  /* For the first iteration only, the key positions have been
3814 +     precomputed for us. */
3815 +  char *texta = a->keybeg;
3816 +  char *textb = b->keybeg;
3817 +  char *lima = a->keylim;
3818 +  char *limb = b->keylim;
3819 +  /* Scratch variables for IGNORE_CHARS; mblength_* are also used when skipping blanks below.  */
3820 +  size_t mblength_a, mblength_b;
3821 +  wchar_t wc_a, wc_b;
3822 +  mbstate_t state_a, state_b;
3823 +
3824 +  int diff = 0;
3825 +
3826 +  memset (&state_a, '\0', sizeof(mbstate_t));
3827 +  memset (&state_b, '\0', sizeof(mbstate_t));
3828 +  /* Ignore keys with start after end.  */
3829 +  if (a->keybeg - a->keylim > 0)
3830 +    return 0;
3831 +  /* IGNORE_CHARS copies TEXT[0..LEN) into COPY and sets NEW_LEN to the copied length.
3832 +     It relies on the enclosing scope for the names i, j, ignore and translate.  */
3833 +              /* Ignore and/or translate chars before comparing.  */
3834 +# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE)        \
3835 +  do                                                                        \
3836 +    {                                                                        \
3837 +      wchar_t uwc;                                                        \
3838 +      char mbc[MB_LEN_MAX];                                                \
3839 +      mbstate_t state_wc;                                                \
3840 +                                                                        \
3841 +      for (NEW_LEN = i = 0; i < LEN;)                                        \
3842 +        {                                                                \
3843 +          mbstate_t state_bak;                                                \
3844 +                                                                        \
3845 +          state_bak = STATE;                                                \
3846 +          MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE);                \
3847 +                                                                        \
3848 +          if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1                \
3849 +              || MBLENGTH == 0)                                                \
3850 +            {                                                                \
3851 +              if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1)        \
3852 +                STATE = state_bak;                                        \
3853 +              if (!ignore)                                                \
3854 +                COPY[NEW_LEN++] = TEXT[i];                                \
3855 +              i++;                                                         \
3856 +              continue;                                                        \
3857 +            }                                                                \
3858 +                                                                        \
3859 +          if (ignore)                                                        \
3860 +            {                                                                \
3861 +              if ((ignore == nonprinting && !iswprint (WC))                \
3862 +                   || (ignore == nondictionary                                \
3863 +                       && !iswalnum (WC) && !iswblank (WC)))                \
3864 +                {                                                        \
3865 +                  i += MBLENGTH;                                        \
3866 +                  continue;                                                \
3867 +                }                                                        \
3868 +            }                                                                \
3869 +                                                                        \
3870 +          if (translate)                                                \
3871 +            {                                                                \
3872 +                                                                        \
3873 +              uwc = towupper(WC);                                        \
3874 +              if (WC == uwc)                                                \
3875 +                {                                                        \
3876 +                  memcpy (mbc, TEXT + i, MBLENGTH);                        \
3877 +                  i += MBLENGTH;                                        \
3878 +                }                                                        \
3879 +              else                                                        \
3880 +                {                                                        \
3881 +                  i += MBLENGTH;                                        \
3882 +                  WC = uwc;                                                \
3883 +                  memset (&state_wc, '\0', sizeof (mbstate_t));                \
3884 +                                                                        \
3885 +                  MBLENGTH = wcrtomb (mbc, WC, &state_wc);                \
3886 +                  assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0);        \
3887 +                }                                                        \
3888 +                                                                        \
3889 +              for (j = 0; j < MBLENGTH; j++)                                \
3890 +                COPY[NEW_LEN++] = mbc[j];                                \
3891 +            }                                                                \
3892 +          else                                                                \
3893 +            for (j = 0; j < MBLENGTH; j++)                                \
3894 +              COPY[NEW_LEN++] = TEXT[i++];                                \
3895 +        }                                                                \
3896 +      COPY[NEW_LEN] = '\0';                                                \
3897 +    }                                                                        \
3898 +  while (0)
3899 +
3900 +      /* Actually compare the fields. */
3901 +
3902 +  for (;;)
3903 +    {
3904 +      /* Find the lengths. */
3905 +      size_t lena = lima <= texta ? 0 : lima - texta;
3906 +      size_t lenb = limb <= textb ? 0 : limb - textb;
3907 +
3908 +      char enda IF_LINT (= 0);
3909 +      char endb IF_LINT (= 0);
3910 +
3911 +      char const *translate = key->translate;
3912 +      bool const *ignore = key->ignore;
3913 +      /* One allocation holds both copies: copy_b starts lena * MB_CUR_MAX + 1 bytes into it.  */
3914 +      if (ignore || translate)
3915 +        {
3916 +          if (SIZE_MAX - lenb - 2 < lena)
3917 +            xalloc_die ();
3918 +          char *copy_a = (char *) xnmalloc (lena + lenb + 2, MB_CUR_MAX);
3919 +          char *copy_b = copy_a + lena * MB_CUR_MAX + 1;
3920 +          size_t new_len_a, new_len_b;
3921 +          size_t i, j;
3922 +
3923 +          IGNORE_CHARS (new_len_a, lena, texta, copy_a,
3924 +                        wc_a, mblength_a, state_a);
3925 +          IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
3926 +                        wc_b, mblength_b, state_b);
3927 +          texta = copy_a; textb = copy_b;
3928 +          lena = new_len_a; lenb = new_len_b;
3929 +        }
3930 +      else
3931 +        {
3932 +          /* Use the keys in-place, temporarily null-terminated.  */
3933 +          enda = texta[lena]; texta[lena] = '\0';
3934 +          endb = textb[lenb]; textb[lenb] = '\0';
3935 +        }
3936 +      /* Pick the comparison according to the key's ordering flags.  */
3937 +      if (key->random)
3938 +        diff = compare_random (texta, lena, textb, lenb);
3939 +      else if (key->numeric | key->general_numeric | key->human_numeric)
3940 +        {
3941 +          char savea = *lima, saveb = *limb;
3942 +
3943 +          *lima = *limb = '\0';
3944 +          diff = (key->numeric ? numcompare (texta, textb)
3945 +                  : key->general_numeric ? general_numcompare (texta, textb)
3946 +                  : human_numcompare (texta, textb));
3947 +          *lima = savea, *limb = saveb;
3948 +        }
3949 +      else if (key->version)
3950 +        diff = filevercmp (texta, textb);
3951 +      else if (key->month)
3952 +        diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
3953 +      else if (lena == 0)
3954 +        diff = - NONZERO (lenb);
3955 +      else if (lenb == 0)
3956 +        diff = 1;
3957 +      else if (hard_LC_COLLATE && !folding)
3958 +        {
3959 +          diff = xmemcoll0 (texta, lena + 1, textb, lenb + 1);
3960 +        }
3961 +      else
3962 +        {
3963 +          diff = memcmp (texta, textb, MIN (lena, lenb));
3964 +          if (diff == 0)
3965 +            diff = lena < lenb ? -1 : lena != lenb;
3966 +        }
3967 +      /* Release the copies (texta is copy_a here) or undo the temporary terminators.  */
3968 +      if (ignore || translate)
3969 +        free (texta);
3970 +      else
3971 +        {
3972 +          texta[lena] = enda;
3973 +          textb[lenb] = endb;
3974 +        }
3975 +
3976 +      if (diff)
3977 +        goto not_equal;
3978 +      /* Equal on this key: move on to the next one, if any.  */
3979 +      key = key->next;
3980 +      if (! key)
3981 +        break;
3982 +
3983 +      /* Find the beginning and limit of the next field.  */
3984 +      if (key->eword != -1)
3985 +        lima = limfield (a, key), limb = limfield (b, key);
3986 +      else
3987 +        lima = a->text + a->length - 1, limb = b->text + b->length - 1;
3988 +
3989 +      if (key->sword != -1)
3990 +        texta = begfield (a, key), textb = begfield (b, key);
3991 +      else
3992 +        {
3993 +          texta = a->text, textb = b->text;
3994 +          if (key->skipsblanks)
3995 +            {
3996 +              while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
3997 +                texta += mblength_a;
3998 +              while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
3999 +                textb += mblength_b;
4000 +            }
4001 +        }
4002 +    }
4003 +  /* KEY is null here when every key compared equal; DIFF is 0 in that case.  */
4004 +not_equal:
4005 +  if (key && key->reverse)
4006 +    return -diff;
4007 +  else
4008 +    return diff;
4009 +}
4010 +#endif
4011 +
4012  /* Compare two lines A and B, returning negative, zero, or positive
4013     depending on whether A compares less than, equal to, or greater than B. */
4014
4015 @@ -2842,7 +3485,7 @@ compare (struct line const *a, struct line const *b)
4016      diff = - NONZERO (blen);
4017    else if (blen == 0)
4018      diff = 1;
4019 -  else if (hard_LC_COLLATE)
4020 +  else if (hard_LC_COLLATE && !folding)
4021      {
4022        /* xmemcoll0 is a performance enhancement as
4023           it will not unconditionally write '\0' after the
4024 @@ -4226,6 +4869,7 @@ set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4025            break;
4026          case 'f':
4027            key->translate = fold_toupper;
4028 +          folding = true;
4029            break;
4030          case 'g':
4031            key->general_numeric = true;
4032 @@ -4305,7 +4949,7 @@ main (int argc, char **argv)
4033    initialize_exit_failure (SORT_FAILURE);
4034
4035    hard_LC_COLLATE = hard_locale (LC_COLLATE);
4036 -#if HAVE_NL_LANGINFO
4037 +#if HAVE_LANGINFO_CODESET
4038    hard_LC_TIME = hard_locale (LC_TIME);
4039  #endif
4040
4041 @@ -4328,6 +4972,29 @@ main (int argc, char **argv)
4042        thousands_sep = NON_CHAR;
4043    }
4044
4045 +#if HAVE_MBRTOWC
4046 +  if (MB_CUR_MAX > 1)
4047 +    {
4048 +      inittables = inittables_mb;
4049 +      begfield = begfield_mb;
4050 +      limfield = limfield_mb;
4051 +      skipblanks = skipblanks_mb;
4052 +      getmonth = getmonth_mb;
4053 +      keycompare = keycompare_mb;
4054 +      numcompare = numcompare_mb;
4055 +    }
4056 +  else
4057 +#endif
4058 +    {
4059 +      inittables = inittables_uni;
4060 +      begfield = begfield_uni;
4061 +      limfield = limfield_uni;
4062 +      skipblanks = skipblanks_uni;
4063 +      getmonth = getmonth_uni;
4064 +      keycompare = keycompare_uni;
4065 +      numcompare = numcompare_uni;
4066 +    }
4067 +
4068    have_read_stdin = false;
4069    inittables ();
4070
4071 @@ -4602,13 +5269,34 @@ main (int argc, char **argv)
4072
4073          case 't':
4074            {
4075 -            char newtab = optarg[0];
4076 -            if (! newtab)
4077 +            char newtab[MB_LEN_MAX + 1];
4078 +            size_t newtab_length = 1;
4079 +            strncpy (newtab, optarg, MB_LEN_MAX);
4080 +            if (! newtab[0])
4081                die (SORT_FAILURE, 0, _("empty tab"));
4082 -            if (optarg[1])
4083 +#if HAVE_MBRTOWC
4084 +            if (MB_CUR_MAX > 1)
4085 +              {
4086 +                wchar_t wc;
4087 +                mbstate_t state;
4088 +
4089 +                memset (&state, '\0', sizeof (mbstate_t));
4090 +                newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
4091 +                                                               MB_LEN_MAX),
4092 +                                         &state);
4093 +                switch (newtab_length)
4094 +                  {
4095 +                  case (size_t) -1:
4096 +                  case (size_t) -2:
4097 +                  case 0:
4098 +                    newtab_length = 1;
4099 +                  }
4100 +              }
4101 +#endif
4102 +            if (newtab_length == 1 && optarg[1])
4103                {
4104                  if (STREQ (optarg, "\\0"))
4105 -                  newtab = '\0';
4106 +                  newtab[0] = '\0';
4107                  else
4108                    {
4109                      /* Provoke with 'sort -txx'.  Complain about
4110 @@ -4619,9 +5307,11 @@ main (int argc, char **argv)
4111                           quote (optarg));
4112                    }
4113                }
4114 -            if (tab != TAB_DEFAULT && tab != newtab)
4115 +            if (tab_length && (tab_length != newtab_length
4116 +                        || memcmp (tab, newtab, tab_length) != 0))
4117                die (SORT_FAILURE, 0, _("incompatible tabs"));
4118 -            tab = newtab;
4119 +            memcpy (tab, newtab, newtab_length);
4120 +            tab_length = newtab_length;
4121            }
4122            break;
4123
4124 diff --git a/src/unexpand.c b/src/unexpand.c
4125 index 7d6100f..04cd646 100644
4126 --- a/src/unexpand.c
4127 +++ b/src/unexpand.c
4128 @@ -38,6 +38,9 @@
4129  #include <stdio.h>
4130  #include <getopt.h>
4131  #include <sys/types.h>
4132 +
4133 +#include <mbfile.h>
4134 +
4135  #include "system.h"
4136  #include "die.h"
4137
4138 @@ -106,24 +109,47 @@ unexpand (void)
4139  {
4140    /* Input stream.  */
4141    FILE *fp = next_file (NULL);
4142 +  mb_file_t mbf;
4143
4144    /* The array of pending blanks.  In non-POSIX locales, blanks can
4145       include characters other than spaces, so the blanks must be
4146       stored, not merely counted.  */
4147 -  char *pending_blank;
4148 +  mbf_char_t *pending_blank;
4149 +  /* True if the starting locale is utf8.  */
4150 +  bool using_utf_locale;
4151 +
4152 +  /* True if the first file contains BOM header.  */
4153 +  bool found_bom;
4154 +  using_utf_locale=check_utf_locale();
4155
4156    if (!fp)
4157      return;
4158 +  mbf_init (mbf, fp);
4159 +  found_bom=check_bom(fp,&mbf);
4160 +
4161 +  if (using_utf_locale == false && found_bom == true)
4162 +  {
4163 +    /*try using some predefined locale */
4164
4165 +    if (set_utf_locale () != 0)
4166 +    {
4167 +      error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
4168 +    }
4169 +  }
4170    /* The worst case is a non-blank character, then one blank, then a
4171       tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
4172       allocate MAX_COLUMN_WIDTH bytes to store the blanks.  */
4173 -  pending_blank = xmalloc (max_column_width);
4174 +  pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t));
4175 +
4176 +  if (found_bom == true)
4177 +  {
4178 +    print_bom();
4179 +  }
4180
4181    while (true)
4182      {
4183        /* Input character, or EOF.  */
4184 -      int c;
4185 +      mbf_char_t c;
4186
4187        /* If true, perform translations.  */
4188        bool convert = true;
4189 @@ -157,12 +183,44 @@ unexpand (void)
4190
4191        do
4192          {
4193 -          while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
4194 -            continue;
4195 +          while (true) {
4196 +            mbf_getc (c, mbf);
4197 +            if ((mb_iseof (c)) && (fp = next_file (fp)))
4198 +              {
4199 +                mbf_init (mbf, fp);
4200 +                if (fp!=NULL)
4201 +                {
4202 +                  if (check_bom(fp,&mbf)==true)
4203 +                  {
4204 +                    /*Not the first file - check BOM header*/
4205 +                    if (using_utf_locale==false && found_bom==false)
4206 +                    {
4207 +                      /*BOM header in subsequent file but not in the first one. */
4208 +                      error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
4209 +                    }
4210 +                  }
4211 +                  else
4212 +                  {
4213 +                    if(using_utf_locale==false && found_bom==true)
4214 +                    {
4215 +                      /*First file contained BOM header - locale was switched to UTF
4216 +                       *all subsequent files should contain BOM. */
4217 +                      error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
4218 +                    }
4219 +                  }
4220 +                }
4221 +                continue;
4222 +              }
4223 +            else
4224 +              {
4225 +                break;
4226 +              }
4227 +            }
4228 +
4229
4230            if (convert)
4231              {
4232 -              bool blank = !! isblank (c);
4233 +              bool blank = mb_isblank (c);
4234
4235                if (blank)
4236                  {
4237 @@ -179,16 +237,16 @@ unexpand (void)
4238                        if (next_tab_column < column)
4239                          die (EXIT_FAILURE, 0, _("input line is too long"));
4240
4241 -                      if (c == '\t')
4242 +                      if (mb_iseq (c, '\t'))
4243                          {
4244                            column = next_tab_column;
4245
4246                            if (pending)
4247 -                            pending_blank[0] = '\t';
4248 +                            mb_setascii (&pending_blank[0], '\t');
4249                          }
4250                        else
4251                          {
4252 -                          column++;
4253 +                          column += mb_width (c);
4254
4255                            if (! (prev_blank && column == next_tab_column))
4256                              {
4257 @@ -196,13 +254,14 @@ unexpand (void)
4258                                   will be replaced by tabs.  */
4259                                if (column == next_tab_column)
4260                                  one_blank_before_tab_stop = true;
4261 -                              pending_blank[pending++] = c;
4262 +                              mb_copy (&pending_blank[pending++], &c);
4263                                prev_blank = true;
4264                                continue;
4265                              }
4266
4267                            /* Replace the pending blanks by a tab or two.  */
4268 -                          pending_blank[0] = c = '\t';
4269 +                          mb_setascii (&c, '\t');
4270 +                          mb_setascii (&pending_blank[0], '\t');
4271                          }
4272
4273                        /* Discard pending blanks, unless it was a single
4274 @@ -210,7 +269,7 @@ unexpand (void)
4275                        pending = one_blank_before_tab_stop;
4276                      }
4277                  }
4278 -              else if (c == '\b')
4279 +              else if (mb_iseq (c, '\b'))
4280                  {
4281                    /* Go back one column, and force recalculation of the
4282                       next tab stop.  */
4283 @@ -218,9 +277,9 @@ unexpand (void)
4284                    next_tab_column = column;
4285                    tab_index -= !!tab_index;
4286                  }
4287 -              else
4288 +              else if (!mb_iseq (c, '\n'))
4289                  {
4290 -                  column++;
4291 +                  column += mb_width (c);
4292                    if (!column)
4293                      die (EXIT_FAILURE, 0, _("input line is too long"));
4294                  }
4295 @@ -228,8 +287,11 @@ unexpand (void)
4296                if (pending)
4297                  {
4298                    if (pending > 1 && one_blank_before_tab_stop)
4299 -                    pending_blank[0] = '\t';
4300 -                  if (fwrite (pending_blank, 1, pending, stdout) != pending)
4301 +                    mb_setascii (&pending_blank[0], '\t');
4302 +
4303 +                  for (int n = 0; n < pending; ++n)
4304 +                    mb_putc (pending_blank[n], stdout);
4305 +                  if (ferror (stdout))
4306                      die (EXIT_FAILURE, errno, _("write error"));
4307                    pending = 0;
4308                    one_blank_before_tab_stop = false;
4309 @@ -239,16 +301,17 @@ unexpand (void)
4310                convert &= convert_entire_line || blank;
4311              }
4312
4313 -          if (c < 0)
4314 +          if (mb_iseof (c))
4315              {
4316                free (pending_blank);
4317                return;
4318              }
4319
4320 -          if (putchar (c) < 0)
4321 +          mb_putc (c, stdout);
4322 +          if (ferror (stdout))
4323              die (EXIT_FAILURE, errno, _("write error"));
4324          }
4325 -      while (c != '\n');
4326 +      while (!mb_iseq (c, '\n'));
4327      }
4328  }
4329
4330 diff --git a/src/uniq.c b/src/uniq.c
4331 index e5996f0..871d47c 100644
4332 --- a/src/uniq.c
4333 +++ b/src/uniq.c
4334 @@ -21,6 +21,17 @@
4335  #include <getopt.h>
4336  #include <sys/types.h>
4337
4338 +/* Get mbstate_t, mbrtowc(). */
4339 +#if HAVE_WCHAR_H
4340 +# include <wchar.h>
4341 +#endif
4342 +
4343 +/* Get isw* functions. */
4344 +#if HAVE_WCTYPE_H
4345 +# include <wctype.h>
4346 +#endif
4347 +#include <assert.h>
4348 +
4349  #include "system.h"
4350  #include "argmatch.h"
4351  #include "linebuffer.h"
4352 @@ -33,6 +44,18 @@
4353  #include "memcasecmp.h"
4354  #include "quote.h"
4355
4356 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
4357 +   installation; work around this configuration error.  */
4358 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
4359 +# define MB_LEN_MAX 16
4360 +#endif
4361 +
4362 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
4363 +#if HAVE_MBRTOWC && defined mbstate_t
4364 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
4365 +#endif
4366 +
4367 +
4368  /* The official name of this program (e.g., no 'g' prefix).  */
4369  #define PROGRAM_NAME "uniq"
4370
4371 @@ -139,6 +162,10 @@ enum
4372    GROUP_OPTION = CHAR_MAX + 1
4373  };
4374
4375 +/* Function pointers. */
4376 +static char *
4377 +(*find_field) (struct linebuffer *line);
4378 +
4379  static struct option const longopts[] =
4380  {
4381    {"count", no_argument, NULL, 'c'},
4382 @@ -254,7 +281,7 @@ size_opt (char const *opt, char const *msgid)
4383
4384  ATTRIBUTE_PURE
4385  static char *
4386 -find_field (struct linebuffer const *line)
4387 +find_field_uni (struct linebuffer *line)
4388  {
4389    size_t count;
4390    char const *lp = line->buffer;
4391 @@ -274,6 +301,83 @@ find_field (struct linebuffer const *line)
4392    return line->buffer + i;
4393  }
4394
4395 +#if HAVE_MBRTOWC
4396 +/* Decode the character at LP + POS into WC; MBLENGTH becomes the bytes to advance (1 on error or NUL).  */
4397 +# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL)  \
4398 +  do                                                                        \
4399 +    {                                                                        \
4400 +      mbstate_t state_bak;                                                \
4401 +                                                                        \
4402 +      CONVFAIL = 0;                                                        \
4403 +      state_bak = *STATEP;                                                \
4404 +                                                                        \
4405 +      MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP);                \
4406 +                                                                        \
4407 +      switch (MBLENGTH)                                                        \
4408 +        {                                                                \
4409 +        case (size_t)-2:                                                \
4410 +        case (size_t)-1:                                                \
4411 +          *STATEP = state_bak;                                                \
4412 +          CONVFAIL++;                                                        \
4413 +          /* Fall through */                                                \
4414 +        case 0:                                                                \
4415 +          MBLENGTH = 1;                                                        \
4416 +        }                                                                \
4417 +    }                                                                        \
4418 +  while (0)
4419 +
4420 +static char *
4421 +find_field_multi (struct linebuffer *line)
4422 +{ /* Return the position in LINE after skipping skip_fields fields, then skip_chars characters.  */
4423 +  size_t count;
4424 +  char *lp = line->buffer;
4425 +  size_t size = line->length - 1;
4426 +  size_t pos;
4427 +  size_t mblength;
4428 +  wchar_t wc;
4429 +  mbstate_t *statep;
4430 +  int convfail = 0;
4431 +  /* SIZE is line->length - 1, so the final byte (presumably the newline; confirm) is not scanned.  */
4432 +  pos = 0;
4433 +  statep = &(line->state);
4434 +
4435 +  /* Skip fields: each is a run of blanks followed by a run of non-blanks.  */
4436 +  for (count = 0; count < skip_fields && pos < size; count++)
4437 +    {
4438 +      while (pos < size)
4439 +        {
4440 +          MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
4441 +          /* An undecodable byte counts as non-blank and so ends the run of blanks.  */
4442 +          if (convfail || !(iswblank (wc) || wc == '\n'))
4443 +            {
4444 +              pos += mblength;
4445 +              break;
4446 +            }
4447 +          pos += mblength;
4448 +        }
4449 +
4450 +      while (pos < size)
4451 +        {
4452 +          MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
4453 +
4454 +          if (!convfail && (iswblank (wc) || wc == '\n'))
4455 +            break;
4456 +
4457 +          pos += mblength;
4458 +        }
4459 +    }
4460 +
4461 +  /* Skip characters: one decoded character (or one undecodable byte) per count.  */
4462 +  for (count = 0; count < skip_chars && pos < size; count++)
4463 +    {
4464 +      MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
4465 +      pos += mblength;
4466 +    }
4467 +
4468 +  return lp + pos;
4469 +}
4470 +#endif
4471 +
4472  /* Return false if two strings OLD and NEW match, true if not.
4473     OLD and NEW point not to the beginnings of the lines
4474     but rather to the beginnings of the fields to compare.
4475 @@ -494,6 +598,19 @@ main (int argc, char **argv)
4476
4477    atexit (close_stdout);
4478
4479 +#if HAVE_MBRTOWC
4480 +  if (MB_CUR_MAX > 1)
4481 +    {
4482 +      find_field = find_field_multi;
4483 +    }
4484 +  else
4485 +#endif
4486 +    {
4487 +      find_field = find_field_uni;
4488 +    }
4489 +
4490 +
4491 +
4492    skip_chars = 0;
4493    skip_fields = 0;
4494    check_chars = SIZE_MAX;
4495 diff --git a/tests/Coreutils.pm b/tests/Coreutils.pm
4496 index fad7ab9..c9021a6 100644
4497 --- a/tests/Coreutils.pm
4498 +++ b/tests/Coreutils.pm
4499 @@ -264,6 +264,9 @@ sub run_tests ($$$$$)
4500        # Yes, this is an arbitrary limit.  If it causes trouble,
4501        # consider removing it.
4502        my $max = 30;
4503 +      # The downstream i18n multi-byte tests have a "-mb" suffix.
4504 +      # Therefore add 3 to the maximum test name length.
4505 +      $max += 3;
4506        if ($max < length $test_name)
4507          {
4508            warn "$program_name: $test_name: test name is too long (> $max)\n";
4509 diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh
4510 new file mode 100755
4511 index 0000000..dd6007c
4512 --- /dev/null
4513 +++ b/tests/expand/mb.sh
4514 @@ -0,0 +1,183 @@
4515 +#!/bin/sh
4516 +
4517 +# Copyright (C) 2012-2015 Free Software Foundation, Inc.
4518 +
4519 +# This program is free software: you can redistribute it and/or modify
4520 +# it under the terms of the GNU General Public License as published by
4521 +# the Free Software Foundation, either version 3 of the License, or
4522 +# (at your option) any later version.
4523 +
4524 +# This program is distributed in the hope that it will be useful,
4525 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
4526 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
4527 +# GNU General Public License for more details.
4528 +
4529 +# You should have received a copy of the GNU General Public License
4530 +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
4531 +
4532 +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4533 +print_ver_ expand
4534 +
4535 +export LC_ALL=en_US.UTF-8
4536 +
4537 +#input containing multibyte characters
4538 +cat <<\EOF > in || framework_failure_
4539 +1234567812345678123456781
4540 +.       .       .       .
4541 +a      b       c       d
4542 +.       .       .       .
4543 +ä     ö      ü      ß
4544 +.       .       .       .
4545 +EOF
4546 +env printf '   äöü\t.    öüä.   \tä xx\n' >> in || framework_failure_
4547 +
4548 +cat <<\EOF > exp || framework_failure_
4549 +1234567812345678123456781
4550 +.       .       .       .
4551 +a       b       c       d
4552 +.       .       .       .
4553 +ä       ö       ü       ß
4554 +.       .       .       .
4555 +   äöü  .    öüä.       ä xx
4556 +EOF
4557 +
4558 +expand < in > out || fail=1
4559 +compare exp out > /dev/null 2>&1 || fail=1
4560 +
4561 +#multiple files as an input
4562 +cat <<\EOF >> exp || framework_failure_
4563 +1234567812345678123456781
4564 +.       .       .       .
4565 +a       b       c       d
4566 +.       .       .       .
4567 +ä       ö       ü       ß
4568 +.       .       .       .
4569 +   äöü  .    öüä.       ä xx
4570 +EOF
4571 +
4572 +expand ./in ./in > out || fail=1
4573 +compare exp out > /dev/null 2>&1 || fail=1
4574 +
4575 +#test characters with display widths != 1
4576 +env printf '12345678
4577 +e\t|ascii(1)
4578 +\u00E9\t|composed(1)
4579 +e\u0301\t|decomposed(1)
4580 +\u3000\t|ideo-space(2)
4581 +\uFF0D\t|full-hypen(2)
4582 +' > in || framework_failure_
4583 +
4584 +env printf '12345678
4585 +e       |ascii(1)
4586 +\u00E9       |composed(1)
4587 +e\u0301       |decomposed(1)
4588 +\u3000      |ideo-space(2)
4589 +\uFF0D      |full-hypen(2)
4590 +' > exp || framework_failure_
4591 +
4592 +expand < in > out || fail=1
4593 +compare exp out > /dev/null 2>&1 || fail=1
4594 +
4595 +#shouldn't fail with "input line too long"
4596 +#when a line starts with a control character
4597 +env printf '\n' > in || framework_failure_
4598 +
4599 +expand < in > out || fail=1
4600 +compare in out > /dev/null 2>&1 || fail=1
4601 +
4602 +#non-Unicode characters interspersed between Unicode ones
4603 +env printf '12345678
4604 +\t\xFF|
4605 +\xFF\t|
4606 +\t\xFFä|
4607 +ä\xFF\t|
4608 +\tä\xFF|
4609 +\xFF\tä|
4610 +äbcdef\xFF\t|
4611 +' > in || framework_failure_
4612 +
4613 +env printf '12345678
4614 +        \xFF|
4615 +\xFF       |
4616 +        \xFFä|
4617 +ä\xFF      |
4618 +        ä\xFF|
4619 +\xFF       ä|
4620 +äbcdef\xFF |
4621 +' > exp || framework_failure_
4622 +
4623 +expand < in > out || fail=1
4624 +compare exp out > /dev/null 2>&1 || fail=1
4625 +
4626 +
4627 +
4628 +#BOM header test 1
4629 +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
4630 +1234567812345678123456781
4631 +.       .       .       .
4632 +a      b       c       d
4633 +.       .       .       .
4634 +ä     ö      ü      ß
4635 +.       .       .       .
4636 +EOF
4637 +env printf '   äöü\t.    öüä.   \tä xx\n' >> in || framework_failure_
4638 +
4639 +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
4640 +1234567812345678123456781
4641 +.       .       .       .
4642 +a       b       c       d
4643 +.       .       .       .
4644 +ä       ö       ü       ß
4645 +.       .       .       .
4646 +   äöü  .    öüä.       ä xx
4647 +EOF
4648 +
4649 +
4650 +expand < in > out || fail=1
4651 +compare exp out > /dev/null 2>&1 || fail=1
4652 +
4653 +LANG=C expand < in > out || fail=1
4654 +compare exp out > /dev/null 2>&1 || fail=1
4655 +
4656 +LC_ALL=C expand < in > out || fail=1
4657 +compare exp out > /dev/null 2>&1 || fail=1
4658 +
4659 +
4660 +printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_
4661 +1234567812345678123456781
4662 +.       .       .       .
4663 +a      b       c       d
4664 +.       .       .       .
4665 +ä     ö      ü      ß
4666 +.       .       .       .
4667 +EOF
4668 +env printf '   äöü\t.    öüä.   \tä xx\n' >> in1 || framework_failure_
4669 +
4670 +
4671 +printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_
4672 +1234567812345678123456781
4673 +.       .       .       .
4674 +a       b       c       d
4675 +.       .       .       .
4676 +ä       ö       ü       ß
4677 +.       .       .       .
4678 +   äöü  .    öüä.       ä xx
4679 +1234567812345678123456781
4680 +.       .       .       .
4681 +a       b       c       d
4682 +.       .       .       .
4683 +ä       ö       ü       ß
4684 +.       .       .       .
4685 +   äöü  .    öüä.       ä xx
4686 +EOF
4687 +
4688 +expand in1 in1 > out || fail=1
4689 +compare exp out > /dev/null 2>&1 || fail=1
4690 +
4691 +LANG=C expand in1 in1  > out || fail=1
4692 +compare exp out > /dev/null 2>&1 || fail=1
4693 +
4694 +LC_ALL=C expand in1 in1 > out || fail=1
4695 +compare exp out > /dev/null 2>&1 || fail=1
4696 +
4697 +exit $fail
4698 diff --git a/tests/i18n/sort.sh b/tests/i18n/sort.sh
4699 new file mode 100755
4700 index 0000000..26c95de
4701 --- /dev/null
4702 +++ b/tests/i18n/sort.sh
4703 @@ -0,0 +1,29 @@
4704 +#!/bin/sh
4705 +# Verify sort's multi-byte support.
4706 +
4707 +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4708 +print_ver_ sort
4709 +
4710 +export LC_ALL=en_US.UTF-8
4711 +locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
4712 +  || skip_ "No UTF-8 locale available"
4713 +
4714 +# Enable heap consistency checking on older systems
4715 +export MALLOC_CHECK_=2
4716 +
4717 +
4718 +# check buffer overflow issue due to
4719 +# expanding multi-byte representation due to case conversion
4720 +# https://bugzilla.suse.com/show_bug.cgi?id=928749
4721 +cat <<EOF > exp
4722 +.
4723 +ɑ
4724 +EOF
4725 +cat <<EOF | sort -f > out || fail=1
4726 +.
4727 +ɑ
4728 +EOF
4729 +compare exp out || { fail=1; cat out; }
4730 +
4731 +
4732 +Exit $fail
4733 diff --git a/tests/local.mk b/tests/local.mk
4734 index 0f77786..dbe1843 100644
4735 --- a/tests/local.mk
4736 +++ b/tests/local.mk
4737 @@ -377,6 +377,8 @@ all_tests =                                 \
4738    tests/misc/sort-discrim.sh                   \
4739    tests/misc/sort-files0-from.pl               \
4740    tests/misc/sort-float.sh                     \
4741 +  tests/misc/sort-mb-tests.sh                  \
4742 +  tests/i18n/sort.sh                           \
4743    tests/misc/sort-h-thousands-sep.sh           \
4744    tests/misc/sort-merge.pl                     \
4745    tests/misc/sort-merge-fdlimit.sh             \
4746 @@ -576,6 +578,7 @@ all_tests =                                 \
4747    tests/du/threshold.sh                                \
4748    tests/du/trailing-slash.sh                   \
4749    tests/du/two-args.sh                         \
4750 +  tests/expand/mb.sh                           \
4751    tests/id/gnu-zero-uids.sh                    \
4752    tests/id/no-context.sh                       \
4753    tests/id/context.sh                          \
4754 @@ -727,6 +730,7 @@ all_tests =                                 \
4755    tests/touch/read-only.sh                     \
4756    tests/touch/relative.sh                      \
4757    tests/touch/trailing-slash.sh                        \
4758 +  tests/unexpand/mb.sh                         \
4759    $(all_root_tests)
4760
4761  # See tests/factor/create-test.sh.
4762 diff --git a/tests/misc/expand.pl b/tests/misc/expand.pl
4763 index 7a77e6f..27f6652 100755
4764 --- a/tests/misc/expand.pl
4765 +++ b/tests/misc/expand.pl
4766 @@ -27,6 +27,15 @@ my $prog = 'expand';
4767  # Turn off localization of executable's output.
4768  @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4769
4770 +#comment out next line to disable multibyte tests
4771 +my $mb_locale = $ENV{LOCALE_FR_UTF8};
4772 +! defined $mb_locale || $mb_locale eq 'none'
4773 + and $mb_locale = 'C';
4774 +
4775 +my $prog = 'expand';
4776 +my $try = "Try \`$prog --help' for more information.\n";
4777 +my $inval = "$prog: invalid byte, character or field list\n$try";
4778 +
4779  my @Tests =
4780    (
4781     ['t1', '--tabs=3',     {IN=>"a\tb"}, {OUT=>"a  b"}],
4782 @@ -168,6 +177,8 @@ my @Tests =
4783
4784
4785     # Test errors
4786 +   # FIXME: The following tests contain ‘quoting’ specific to LC_MESSAGES
4787 +   # So we force LC_MESSAGES=C to make them pass.
4788     ['e1', '--tabs="a"', {IN=>''}, {OUT=>''}, {EXIT=>1},
4789      {ERR => "$prog: tab size contains invalid character(s): 'a'\n"}],
4790     ['e2', "-t $UINTMAX_OFLOW", {IN=>''}, {OUT=>''}, {EXIT=>1},
4791 @@ -184,6 +195,37 @@ my @Tests =
4792      {ERR => "$prog: '/' specifier not at start of number: '/'\n"}],
4793    );
4794
4795 +if ($mb_locale ne 'C')
4796 +  {
4797 +    # Duplicate each test vector, appending "-mb" to the test name and
4798 +    # inserting {ENV => "LANG=$mb_locale LC_MESSAGES=C"} in the copy, so that we
4799 +    # provide coverage for the distro-added multi-byte code paths.
4800 +    my @new;
4801 +    foreach my $t (@Tests)
4802 +      {
4803 +        my @new_t = @$t;
4804 +        my $test_name = shift @new_t;
4805 +
4806 +        # Depending on whether expand is multi-byte-patched,
4807 +        # it emits different diagnostics:
4808 +        #   non-MB: invalid byte or field list
4809 +        #   MB:     invalid byte, character or field list
4810 +        # Adjust the expected error output accordingly.
4811 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4812 +            (@new_t))
4813 +          {
4814 +            my $sub = {ERR_SUBST => 's/, character//'};
4815 +            push @new_t, $sub;
4816 +            push @$t, $sub;
4817 +          }
4818 +        push @new, ["$test_name-mb", @new_t, {ENV => "LANG=$mb_locale LC_MESSAGES=C"}];
4819 +      }
4820 +    push @Tests, @new;
4821 +  }
4822 +
4823 +
4824 +@Tests = triple_test \@Tests;
4825 +
4826  my $save_temps = $ENV{DEBUG};
4827  my $verbose = $ENV{VERBOSE};
4828
4829 diff --git a/tests/misc/fold.pl b/tests/misc/fold.pl
4830 index 2834f92..bc1616a 100755
4831 --- a/tests/misc/fold.pl
4832 +++ b/tests/misc/fold.pl
4833 @@ -20,9 +20,18 @@ use strict;
4834
4835  (my $program_name = $0) =~ s|.*/||;
4836
4837 +my $prog = 'fold';
4838 +my $try = "Try \`$prog --help' for more information.\n";
4839 +my $inval = "$prog: invalid byte, character or field list\n$try";
4840 +
4841  # Turn off localization of executable's output.
4842  @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
4843
4844 +# Comment out the next line to disable the multibyte tests
4845 +my $mb_locale = $ENV{LOCALE_FR_UTF8};
4846 +! defined $mb_locale || $mb_locale eq 'none'
4847 + and $mb_locale = 'C';
4848 +
4849  my @Tests =
4850    (
4851     ['s1', '-w2 -s', {IN=>"a\t"}, {OUT=>"a\n\t"}],
4852 @@ -31,9 +40,48 @@ my @Tests =
4853     ['s4', '-w4 -s', {IN=>"abc ef\n"}, {OUT=>"abc \nef\n"}],
4854    );
4855
4856 +# When a multi-byte locale is available, run every test a second time
4857 +# in that locale; the copies are named with a "-mb" suffix.
4858 +if ($mb_locale ne 'C')
4859 +  {
4860 +    # Duplicate each test vector, appending "-mb" to the test name and
4861 +    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4862 +    # provide coverage for the distro-added multi-byte code paths.
4863 +    my @new;
4864 +    foreach my $t (@Tests)
4865 +      {
4866 +        my @new_t = @$t;
4867 +        my $test_name = shift @new_t;
4868 +
4869 +        # Depending on whether fold is multi-byte-patched,
4870 +        # it emits different diagnostics:
4871 +        #   non-MB: invalid byte or field list
4872 +        #   MB:     invalid byte, character or field list
4873 +        # Adjust the expected error output accordingly.
4874 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4875 +            (@new_t))
4876 +          {
4877 +            my $sub = {ERR_SUBST => 's/, character//'};
4878 +            push @new_t, $sub;
4879 +            push @$t, $sub;
4880 +          }
4881 +        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4882 +      }
4883 +    push @Tests, @new;
4884 +  }
4885 +
4886 +@Tests = triple_test \@Tests;
4887 +
4888 +# Remember that triple_test creates from each test with exactly one "IN"
4889 +# file two more tests (.p and .r suffix on name) corresponding to reading
4890 +# input from a file and from a pipe.  The pipe-reading test would fail
4891 +# due to a race condition about 1 in 20 times.
4892 +# Remove the IN_PIPE version of the "output-is-input" test above.
4893 +# The others aren't susceptible because they have three inputs each.
4894 +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
4895 +
4896  my $save_temps = $ENV{DEBUG};
4897  my $verbose = $ENV{VERBOSE};
4898
4899 -my $prog = 'fold';
4900  my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
4901  exit $fail;
4902 diff --git a/tests/misc/join.pl b/tests/misc/join.pl
4903 index 06ad777..be40204 100755
4904 --- a/tests/misc/join.pl
4905 +++ b/tests/misc/join.pl
4906 @@ -25,6 +25,15 @@ my $limits = getlimits ();
4907
4908  my $prog = 'join';
4909
4910 +my $try = "Try \`$prog --help' for more information.\n";
4911 +my $inval = "$prog: invalid byte, character or field list\n$try";
4912 +
4913 +my $mb_locale;
4914 +#Comment out next line to disable multibyte tests
4915 +$mb_locale = $ENV{LOCALE_FR_UTF8};
4916 +! defined $mb_locale || $mb_locale eq 'none'
4917 +  and $mb_locale = 'C';
4918 +
4919  my $delim = chr 0247;
4920  sub t_subst ($)
4921  {
4922 @@ -333,8 +342,49 @@ foreach my $t (@tv)
4923      push @Tests, $new_ent;
4924    }
4925
4926 +# When a multi-byte locale is available, run every test a second time
4927 +# in that locale; the copies are named with a "-mb" suffix.
4928 +if ($mb_locale ne 'C')
4929 +  {
4930 +    # Duplicate each test vector, appending "-mb" to the test name and
4931 +    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
4932 +    # provide coverage for the distro-added multi-byte code paths.
4933 +    my @new;
4934 +    foreach my $t (@Tests)
4935 +      {
4936 +        my @new_t = @$t;
4937 +        my $test_name = shift @new_t;
4938 +
4939 +        # Depending on whether join is multi-byte-patched,
4940 +        # it emits different diagnostics:
4941 +        #   non-MB: invalid byte or field list
4942 +        #   MB:     invalid byte, character or field list
4943 +        # Adjust the expected error output accordingly.
4944 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
4945 +            (@new_t))
4946 +          {
4947 +            my $sub = {ERR_SUBST => 's/, character//'};
4948 +            push @new_t, $sub;
4949 +            push @$t, $sub;
4950 +          }
4951 +        # Map the "-mb" test name in diagnostics back to the original test name.
4952 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR}}
4953 +             (@new_t))
4954 +          {
4955 +            my $sub2 = {ERR_SUBST => "s/$test_name-mb/$test_name/"};
4956 +            push @new_t, $sub2;
4957 +            push @$t, $sub2;
4958 +          }
4959 +        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
4960 +      }
4961 +    push @Tests, @new;
4962 +  }
4963 +
4964  @Tests = triple_test \@Tests;
4965
4966 +#skip invalid-j-mb test, it is failing because of the format
4967 +@Tests = grep {$_->[0] ne 'invalid-j-mb'} @Tests;
4968 +
4969  my $save_temps = $ENV{DEBUG};
4970  my $verbose = $ENV{VERBOSE};
4971
4972 diff --git a/tests/misc/sort-mb-tests.sh b/tests/misc/sort-mb-tests.sh
4973 new file mode 100755
4974 index 0000000..11836ba
4975 --- /dev/null
4976 +++ b/tests/misc/sort-mb-tests.sh
4977 @@ -0,0 +1,45 @@
4978 +#!/bin/sh
4979 +# Verify sort's multi-byte support.
4980 +
4981 +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
4982 +print_ver_ sort
4983 +
4984 +export LC_ALL=en_US.UTF-8
4985 +locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
4986 +  || skip_ "No UTF-8 locale available"
4987 +
4988 +
4989 +cat <<EOF > exp
4990 +Banana＠5
4991 +Apple＠10
4992 +Citrus＠20
4993 +Cherry＠30
4994 +EOF
4995 +
4996 +cat <<EOF | sort -t ＠ -k2 -n > out || fail=1
4997 +Apple＠10
4998 +Banana＠5
4999 +Citrus＠20
5000 +Cherry＠30
5001 +EOF
5002 +
5003 +compare exp out || { fail=1; cat out; }
5004 +
5005 +
5006 +cat <<EOF > exp
5007 +Citrus＠ＡＡ20＠＠5
5008 +Cherry＠ＡＡ30＠＠10
5009 +Apple＠ＡＡ10＠＠20
5010 +Banana＠ＡＡ5＠＠30
5011 +EOF
5012 +
5013 +cat <<EOF | sort -t ＠ -k4 -n > out || fail=1
5014 +Apple＠ＡＡ10＠＠20
5015 +Banana＠ＡＡ5＠＠30
5016 +Citrus＠ＡＡ20＠＠5
5017 +Cherry＠ＡＡ30＠＠10
5018 +EOF
5019 +
5020 +compare exp out || { fail=1; cat out; }
5021 +
5022 +Exit $fail
5023 diff --git a/tests/misc/sort-merge.pl b/tests/misc/sort-merge.pl
5024 index 7eb4574..eda884c 100755
5025 --- a/tests/misc/sort-merge.pl
5026 +++ b/tests/misc/sort-merge.pl
5027 @@ -26,6 +26,15 @@ my $prog = 'sort';
5028  # Turn off localization of executable's output.
5029  @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
5030
5031 +my $mb_locale;
5032 +# uncommented according to upstream commit enabling multibyte paths
5033 +$mb_locale = $ENV{LOCALE_FR_UTF8};
5034 +! defined $mb_locale || $mb_locale eq 'none'
5035 + and $mb_locale = 'C';
5036 +
5037 +my $try = "Try \`$prog --help' for more information.\n";
5038 +my $inval = "$prog: invalid byte, character or field list\n$try";
5039 +
5040  # three empty files and one that says 'foo'
5041  my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
5042
5043 @@ -77,6 +86,39 @@ my @Tests =
5044          {OUT=>$big_input}],
5045      );
5046
5047 +# When a multi-byte locale is available, run every test a second time
5048 +# in that locale; the copies are named with a "-mb" suffix.
5049 +if ($mb_locale ne 'C')
5050 +  {
5051 +    # Duplicate each test vector, appending "-mb" to the test name and
5052 +    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5053 +    # provide coverage for the distro-added multi-byte code paths.
5054 +    my @new;
5055 +    foreach my $t (@Tests)
5056 +      {
5057 +        my @new_t = @$t;
5058 +        my $test_name = shift @new_t;
5059 +
5060 +        # Depending on whether sort is multi-byte-patched,
5061 +        # it emits different diagnostics:
5062 +        #   non-MB: invalid byte or field list
5063 +        #   MB:     invalid byte, character or field list
5064 +        # Adjust the expected error output accordingly.
5065 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5066 +            (@new_t))
5067 +          {
5068 +            my $sub = {ERR_SUBST => 's/, character//'};
5069 +            push @new_t, $sub;
5070 +            push @$t, $sub;
5071 +          }
5072 +        next if ($test_name =~ "nmerge-.");
5073 +        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5074 +      }
5075 +    push @Tests, @new;
5076 +  }
5077 +
5078 +@Tests = triple_test \@Tests;
5079 +
5080  my $save_temps = $ENV{DEBUG};
5081  my $verbose = $ENV{VERBOSE};
5082
5083 diff --git a/tests/misc/sort.pl b/tests/misc/sort.pl
5084 index 0b0adca..fd27821 100755
5085 --- a/tests/misc/sort.pl
5086 +++ b/tests/misc/sort.pl
5087 @@ -24,10 +24,15 @@ my $prog = 'sort';
5088  # Turn off localization of executable's output.
5089  @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
5090
5091 -my $mb_locale = $ENV{LOCALE_FR_UTF8};
5092 +my $mb_locale;
5093 +#Comment out next line to disable multibyte tests
5094 +$mb_locale = $ENV{LOCALE_FR_UTF8};
5095  ! defined $mb_locale || $mb_locale eq 'none'
5096    and $mb_locale = 'C';
5097
5098 +my $try = "Try \`$prog --help' for more information.\n";
5099 +my $inval = "$prog: invalid byte, character or field list\n$try";
5100 +
5101  # Since each test is run with a file name and with redirected stdin,
5102  # the name in the diagnostic is either the file name or "-".
5103  # Normalize each diagnostic to use '-'.
5104 @@ -423,6 +428,38 @@ foreach my $t (@Tests)
5105        }
5106    }
5107
5108 +if ($mb_locale ne 'C')
5109 +   {
5110 +    # Duplicate each test vector, appending "-mb" to the test name and
5111 +    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5112 +    # provide coverage for the distro-added multi-byte code paths.
5113 +    my @new;
5114 +    foreach my $t (@Tests)
5115 +       {
5116 +        my @new_t = @$t;
5117 +        my $test_name = shift @new_t;
5118 +
5119 +        # Depending on whether sort is multi-byte-patched,
5120 +        # it emits different diagnostics:
5121 +        #   non-MB: invalid byte or field list
5122 +        #   MB:     invalid byte, character or field list
5123 +        # Adjust the expected error output accordingly.
5124 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5125 +            (@new_t))
5126 +          {
5127 +            my $sub = {ERR_SUBST => 's/, character//'};
5128 +            push @new_t, $sub;
5129 +            push @$t, $sub;
5130 +          }
5131 +        #disable several failing tests until investigation, disable all tests with envvars set
5132 +        next if (grep {ref $_ eq 'HASH' && exists $_->{ENV}} (@new_t));
5133 +        next if ($test_name =~ "18g" or $test_name =~ "sort-numeric" or $test_name =~ "08[ab]" or $test_name =~ "03[def]" or $test_name =~ "h4" or $test_name =~ "n1" or $test_name =~ "2[01]a");
5134 +        next if ($test_name =~ "11[ab]"); # avoid FP: expected result differs to MB result due to collation rules.
5135 +        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5136 +       }
5137 +    push @Tests, @new;
5138 +   }
5139 +
5140  @Tests = triple_test \@Tests;
5141
5142  # Remember that triple_test creates from each test with exactly one "IN"
5143 @@ -432,6 +469,7 @@ foreach my $t (@Tests)
5144  # Remove the IN_PIPE version of the "output-is-input" test above.
5145  # The others aren't susceptible because they have three inputs each.
5146  @Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
5147 +@Tests = grep {$_->[0] ne 'output-is-input-mb.p'} @Tests;
5148
5149  my $save_temps = $ENV{DEBUG};
5150  my $verbose = $ENV{VERBOSE};
5151 diff --git a/tests/misc/unexpand.pl b/tests/misc/unexpand.pl
5152 index 2e1906f..fe66012 100755
5153 --- a/tests/misc/unexpand.pl
5154 +++ b/tests/misc/unexpand.pl
5155 @@ -27,6 +27,14 @@ my $limits = getlimits ();
5156
5157  my $prog = 'unexpand';
5158
5159 +# comment out next line to disable multibyte tests
5160 +my $mb_locale = $ENV{LOCALE_FR_UTF8};
5161 +! defined $mb_locale || $mb_locale eq 'none'
5162 + and $mb_locale = 'C';
5163 +
5164 +my $try = "Try \`$prog --help' for more information.\n";
5165 +my $inval = "$prog: invalid byte, character or field list\n$try";
5166 +
5167  my @Tests =
5168      (
5169       ['a1', {IN=> ' 'x 1 ."y\n"}, {OUT=> ' 'x 1 ."y\n"}],
5170 @@ -128,6 +136,37 @@ my @Tests =
5171       ['ts2', '-t5,8', {IN=>"x\t \t y\n"},    {OUT=>"x\t\t y\n"}],
5172      );
5173
5174 +if ($mb_locale ne 'C')
5175 +  {
5176 +    # Duplicate each test vector, appending "-mb" to the test name and
5177 +    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5178 +    # provide coverage for the distro-added multi-byte code paths.
5179 +    my @new;
5180 +    foreach my $t (@Tests)
5181 +      {
5182 +        my @new_t = @$t;
5183 +        my $test_name = shift @new_t;
5184 +
5185 +        # Depending on whether unexpand is multi-byte-patched,
5186 +        # it emits different diagnostics:
5187 +        #   non-MB: invalid byte or field list
5188 +        #   MB:     invalid byte, character or field list
5189 +        # Adjust the expected error output accordingly.
5190 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5191 +            (@new_t))
5192 +          {
5193 +            my $sub = {ERR_SUBST => 's/, character//'};
5194 +            push @new_t, $sub;
5195 +            push @$t, $sub;
5196 +          }
5197 +        next if ($test_name =~ 'b-1');
5198 +        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5199 +      }
5200 +    push @Tests, @new;
5201 +  }
5202 +
5203 +@Tests = triple_test \@Tests;
5204 +
5205  my $save_temps = $ENV{DEBUG};
5206  my $verbose = $ENV{VERBOSE};
5207
5208 diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl
5209 index aa163cd..91d617d 100755
5210 --- a/tests/misc/uniq.pl
5211 +++ b/tests/misc/uniq.pl
5212 @@ -23,9 +23,17 @@ my $limits = getlimits ();
5213  my $prog = 'uniq';
5214  my $try = "Try '$prog --help' for more information.\n";
5215
5216 +my $inval = "$prog: invalid byte, character or field list\n$try";
5217 +
5218  # Turn off localization of executable's output.
5219  @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
5220
5221 +my $mb_locale;
5222 +#Comment out next line to disable multibyte tests
5223 +$mb_locale = $ENV{LOCALE_FR_UTF8};
5224 +! defined $mb_locale || $mb_locale eq 'none'
5225 +  and $mb_locale = 'C';
5226 +
5227  # When possible, create a "-z"-testing variant of each test.
5228  sub add_z_variants($)
5229  {
5230 @@ -262,6 +270,53 @@ foreach my $t (@Tests)
5231        and push @$t, {ENV=>'_POSIX2_VERSION=199209'};
5232    }
5233
5234 +if ($mb_locale ne 'C')
5235 +  {
5236 +    # Duplicate each test vector, appending "-mb" to the test name and
5237 +    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5238 +    # provide coverage for the distro-added multi-byte code paths.
5239 +    my @new;
5240 +    foreach my $t (@Tests)
5241 +      {
5242 +        my @new_t = @$t;
5243 +        my $test_name = shift @new_t;
5244 +
5245 +        # Depending on whether uniq is multi-byte-patched,
5246 +        # it emits different diagnostics:
5247 +        #   non-MB: invalid byte or field list
5248 +        #   MB:     invalid byte, character or field list
5249 +        # Adjust the expected error output accordingly.
5250 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5251 +            (@new_t))
5252 +          {
5253 +            my $sub = {ERR_SUBST => 's/, character//'};
5254 +            push @new_t, $sub;
5255 +            push @$t, $sub;
5256 +          }
5257 +        # In test #145, replace each ‘...’ by '...'.
5258 +        if ($test_name =~ "145")
5259 +          {
5260 +            my $sub = { ERR_SUBST => "s/‘([^’]+)’/'\$1'/g"};
5261 +            push @new_t, $sub;
5262 +            push @$t, $sub;
5263 +          }
5264 +        next if (   $test_name =~ "schar"
5265 +                 or $test_name =~ "^obs-plus"
5266 +                 or $test_name =~ "119");
5267 +        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5268 +      }
5269 +    push @Tests, @new;
5270 +   }
5271 +
5272 +# Remember that triple_test creates from each test with exactly one "IN"
5273 +# file two more tests (.p and .r suffix on name) corresponding to reading
5274 +# input from a file and from a pipe.  The pipe-reading test would fail
5275 +# due to a race condition about 1 in 20 times.
5276 +# Remove the IN_PIPE version of the "output-is-input" test above.
5277 +# The others aren't susceptible because they have three inputs each.
5278 +
5279 +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
5280 +
5281  @Tests = add_z_variants \@Tests;
5282  @Tests = triple_test \@Tests;
5283
5284 diff --git a/tests/pr/pr-tests.pl b/tests/pr/pr-tests.pl
5285 index 7ac6d4c..ae6cc35 100755
5286 --- a/tests/pr/pr-tests.pl
5287 +++ b/tests/pr/pr-tests.pl
5288 @@ -24,6 +24,15 @@ use strict;
5289  my $prog = 'pr';
5290  my $normalize_strerror = "s/': .*/'/";
5291
5292 +my $mb_locale;
5293 +#Comment out the next line to disable multibyte tests
5294 +$mb_locale = $ENV{LOCALE_FR_UTF8};
5295 +! defined $mb_locale || $mb_locale eq 'none'
5296 +  and $mb_locale = 'C';
5297 +
5298 +my $try = "Try \`$prog --help' for more information.\n";
5299 +my $inval = "$prog: invalid byte, character or field list\n$try";
5300 +
5301  my @tv = (
5302
5303  # -b option is no longer an official option. But it's still working to
5304 @@ -512,8 +521,48 @@ push @Tests,
5305      {IN=>"x\tx\tx\tx\tx\nx\tx\tx\tx\tx\n"},
5306       {OUT=>"x\tx\tx\tx\tx\tx\tx\tx\tx\tx\n"} ];
5307
5308 +# When a multi-byte locale is available, run every test a second time
5309 +# in that locale; the copies are named with a "-mb" suffix.
5310 +if ($mb_locale ne 'C')
5311 +  {
5312 +    # Duplicate each test vector, appending "-mb" to the test name and
5313 +    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
5314 +    # provide coverage for the distro-added multi-byte code paths.
5315 +    my @new;
5316 +    foreach my $t (@Tests)
5317 +      {
5318 +        my @new_t = @$t;
5319 +        my $test_name = shift @new_t;
5320 +
5321 +        # Depending on whether pr is multi-byte-patched,
5322 +        # it emits different diagnostics:
5323 +        #   non-MB: invalid byte or field list
5324 +        #   MB:     invalid byte, character or field list
5325 +        # Adjust the expected error output accordingly.
5326 +        if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
5327 +            (@new_t))
5328 +          {
5329 +            my $sub = {ERR_SUBST => 's/, character//'};
5330 +            push @new_t, $sub;
5331 +            push @$t, $sub;
5332 +          }
5333 +        # Temporarily skip the "-mb" copies of some failing tests
5334 +        next if ($test_name =~ "col-0" or $test_name =~ "col-inval" or $test_name =~ "asan1");
5335 +        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
5336 +      }
5337 +    push @Tests, @new;
5338 +  }
5339 +
5340  @Tests = triple_test \@Tests;
5341
5342 +# Remember that triple_test creates from each test with exactly one "IN"
5343 +# file two more tests (.p and .r suffix on name) corresponding to reading
5344 +# input from a file and from a pipe.  The pipe-reading test would fail
5345 +# due to a race condition about 1 in 20 times.
5346 +# Remove the IN_PIPE version of the "output-is-input" test above.
5347 +# The others aren't susceptible because they have three inputs each.
5348 +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
5349 +
5350  my $save_temps = $ENV{DEBUG};
5351  my $verbose = $ENV{VERBOSE};
5352
5353 diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh
5354 new file mode 100755
5355 index 0000000..8a82d74
5356 --- /dev/null
5357 +++ b/tests/unexpand/mb.sh
5358 @@ -0,0 +1,172 @@
5359 +#!/bin/sh
5360 +
5361 +# Copyright (C) 2012-2015 Free Software Foundation, Inc.
5362 +
5363 +# This program is free software: you can redistribute it and/or modify
5364 +# it under the terms of the GNU General Public License as published by
5365 +# the Free Software Foundation, either version 3 of the License, or
5366 +# (at your option) any later version.
5367 +
5368 +# This program is distributed in the hope that it will be useful,
5369 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
5370 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
5371 +# GNU General Public License for more details.
5372 +
5373 +# You should have received a copy of the GNU General Public License
5374 +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
5375 +
5376 +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
5377 +print_ver_ unexpand
5378 +
5379 +export LC_ALL=en_US.UTF-8
5380 +
5381 +#input containing multibyte characters
5382 +cat > in <<\EOF
5383 +1234567812345678123456781
5384 +.       .       .       .
5385 +a       b       c       d
5386 +.       .       .       .
5387 +ä       ö       ü       ß
5388 +.       .       .       .
5389 +   äöü  .    öüä.       ä xx
5390 +EOF
5391 +
5392 +cat > exp <<\EOF
5393 +1234567812345678123456781
5394 +.      .       .       .
5395 +a      b       c       d
5396 +.      .       .       .
5397 +ä     ö      ü      ß
5398 +.      .       .       .
5399 +   äöü      .    öüä.    ä xx
5400 +EOF
5401 +
5402 +unexpand -a < in > out || fail=1
5403 +compare exp out > /dev/null 2>&1 || fail=1
5404 +
5405 +
5406 +#multiple files as an input
5407 +cat >> exp <<\EOF
5408 +1234567812345678123456781
5409 +.      .       .       .
5410 +a      b       c       d
5411 +.      .       .       .
5412 +ä     ö      ü      ß
5413 +.      .       .       .
5414 +   äöü      .    öüä.    ä xx
5415 +EOF
5416 +
5417 +
5418 +unexpand -a ./in ./in > out || fail=1
5419 +compare exp out > /dev/null 2>&1 || fail=1
5420 +
5421 +#test characters with a display width larger than 1
5422 +
5423 +env printf '12345678
5424 +e       |ascii(1)
5425 +\u00E9       |composed(1)
5426 +e\u0301       |decomposed(1)
5427 +\u3000      |ideo-space(2)
5428 +\uFF0D      |full-hypen(2)
5429 +' > in || framework_failure_
5430 +
5431 +env printf '12345678
5432 +e\t|ascii(1)
5433 +\u00E9\t|composed(1)
5434 +e\u0301\t|decomposed(1)
5435 +\u3000\t|ideo-space(2)
5436 +\uFF0D\t|full-hypen(2)
5437 +' > exp || framework_failure_
5438 +
5439 +unexpand -a < in > out || fail=1
5440 +compare exp out > /dev/null 2>&1 || fail=1
5441 +
5442 +#test input where a blank of width > 1 is not being substituted (FIXME: in/exp below are set as shell variables, so the files from the previous test are reused)
5443 +in="$(LC_ALL=en_US.UTF-8 printf ' \u3000  ö       ü       ß')"
5444 +exp=' 　  ö       ü      ß'
5445 +
5446 +unexpand -a < in > out || fail=1
5447 +compare exp out > /dev/null 2>&1 || fail=1
5448 +
5449 +#non-Unicode characters interspersed between Unicode ones
5450 +env printf '12345678
5451 +        \xFF|
5452 +\xFF       |
5453 +        \xFFä|
5454 +ä\xFF      |
5455 +        ä\xFF|
5456 +\xFF       ä|
5457 +äbcdef\xFF |
5458 +' > in || framework_failure_
5459 +
5460 +env printf '12345678
5461 +\t\xFF|
5462 +\xFF\t|
5463 +\t\xFFä|
5464 +ä\xFF\t|
5465 +\tä\xFF|
5466 +\xFF\tä|
5467 +äbcdef\xFF\t|
5468 +' > exp || framework_failure_
5469 +
5470 +unexpand -a < in > out || fail=1
5471 +compare exp out > /dev/null 2>&1 || fail=1
5472 +
5473 +#BOM header test 1
5474 +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
5475 +1234567812345678123456781
5476 +.       .       .       .
5477 +a       b       c       d
5478 +.       .       .       .
5479 +ä       ö       ü       ß
5480 +.       .       .       .
5481 +   äöü  .    öüä.       ä xx
5482 +EOF
5483 +env printf '   äöü\t.    öüä.   \tä xx\n' >> in || framework_failure_
5484 +
5485 +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
5486 +1234567812345678123456781
5487 +.      .       .       .
5488 +a      b       c       d
5489 +.      .       .       .
5490 +ä     ö      ü      ß
5491 +.      .       .       .
5492 +   äöü      .    öüä.    ä xx
5493 +EOF
5494 +
5495 +unexpand < in > out || fail=1
5496 +compare exp out > /dev/null 2>&1 || fail=1
5497 +
5498 +LANG=C unexpand < in > out || fail=1
5499 +compare exp out > /dev/null 2>&1 || fail=1
5500 +
5501 +LC_ALL=C unexpand < in > out || fail=1
5502 +compare exp out > /dev/null 2>&1 || fail=1
5503 +
5504 +
5505 +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
5506 +1234567812345678123456781
5507 +.      .       .       .
5508 +a      b       c       d
5509 +.      .       .       .
5510 +ä     ö      ü      ß
5511 +.      .       .       .
5512 +   äöü      .    öüä.    ä xx
5513 +1234567812345678123456781
5514 +.      .       .       .
5515 +a      b       c       d
5516 +.      .       .       .
5517 +ä     ö      ü      ß
5518 +.      .       .       .
5519 +   äöü      .    öüä.    ä xx
5520 +EOF
5521 +
5522 +
5523 +unexpand in in > out || fail=1
5524 +compare exp out > /dev/null 2>&1 || fail=1
5525 +
5526 +LANG=C unexpand in in > out || fail=1
5527 +compare exp out > /dev/null 2>&1 || fail=1
5528 +
5529 +LC_ALL=C unexpand in in > out || fail=1
5530 +compare exp out > /dev/null 2>&1 || fail=1
5531 --
5532 2.34.1
5533